From 9bbf68f0300c7483e0c4ca1f21373a88de0b387b Mon Sep 17 00:00:00 2001 From: Jeremy Date: Mon, 24 Nov 2025 14:37:29 -0800 Subject: [PATCH 01/13] All modularized cNF code added/updated --- 02_normalize_batchcorrect_omics.R | 768 +++++++++++++++++++++ 02_run_normalize_omics.Rmd | 165 +++++ 03_analyze_modality_correlations.R | 279 ++++++++ 04_analyze_modality_and_pathway_enrich.Rmd | 114 +++ 04_leapr_biomarker.R | 467 +++++++++++++ cNF_helper_code.R | 1 + 6 files changed, 1794 insertions(+) create mode 100644 02_normalize_batchcorrect_omics.R create mode 100644 02_run_normalize_omics.Rmd create mode 100644 03_analyze_modality_correlations.R create mode 100644 04_analyze_modality_and_pathway_enrich.Rmd create mode 100644 04_leapr_biomarker.R diff --git a/02_normalize_batchcorrect_omics.R b/02_normalize_batchcorrect_omics.R new file mode 100644 index 0000000..a5dad15 --- /dev/null +++ b/02_normalize_batchcorrect_omics.R @@ -0,0 +1,768 @@ +# ============================================================================= +# normalize_omics_pipeline.R +# Arguments: +# - write_outputs: master on/off for writing CSVs, PDFs, and uploading +# - save_basename: override base name used in filenames +# - do_batch_correct: skip ComBat when FALSE; use combined matrix instead +# ============================================================================= + +suppressPackageStartupMessages({ + library(dplyr) + library(tidyr) + library(stringr) + library(SummarizedExperiment) + library(ggplot2) + library(readr) + library(rlang) +}) + +# Small helpers + +modified_zscore <- function(x, na.rm = TRUE) { + m <- suppressWarnings(stats::median(x, na.rm = na.rm)) + md <- suppressWarnings(stats::mad(x, constant = 1, na.rm = na.rm)) + if (is.na(md) || md == 0) return(rep(0, length(x))) + 0.6745 * (x - m) / md +} + +filter_by_missingness <- function(mat) { + keep <- apply(mat, 1, function(r) mean(is.na(r)) <= 0.5) + mat[keep, , drop = FALSE] +} + +union_rows_fill_NA <- function(mats) { + all_feats <- Reduce(union, lapply(mats, rownames)) + lapply(mats, function(m) { + mm <- matrix(NA_real_, nrow = length(all_feats), ncol = ncol(m), + dimnames = list(all_feats, colnames(m))) + mm[rownames(m), colnames(m)] <- m + mm + }) +} + +collapse_duplicate_features <- function(mat) { + if (!any(duplicated(rownames(mat)))) return(mat) + grp <- split(seq_len(nrow(mat)), rownames(mat)) + collapsed <- do.call(rbind, lapply(grp, function(ix) colSums(mat[ix, , drop = FALSE], na.rm = TRUE))) + rownames(collapsed) <- names(grp) + collapsed +} + +make_dropper <- function(substrings) { + if (is.null(substrings) || length(substrings) == 0) return(function(x) rep(FALSE, length(x))) + pattern <- paste0(substrings, collapse = "|") + function(x) grepl(pattern, x, fixed = FALSE) +} + +# Functions to clean up irregular names + +basename_only <- function(x) sub("^.*[\\\\/]", "", x) +basename_no_ext <- function(x) sub("\\.[^.]+$", "", basename_only(x)) + +normalize_specimen_like <- function(x) { + y <- tolower(x) + y <- gsub("\\s+", "", y) + y <- gsub("\\.", "-", y) + y <- gsub("_", "-", y) + y <- gsub("organoids?$", "organoid", y) + y <- gsub("-organoids?-", "-organoid-", y) + y <- gsub("skin$", "skin", y) + y <- gsub("tissues?$", "tissue", y) + y <- gsub("--+", "-", y) + y <- gsub("^-|-$", "", y) + y +} + +parse_rna_header_triplet <- function(fnames) { + toks_list <- strsplit(fnames, "\\.") + out <- lapply(seq_along(toks_list), function(i) { + toks <- toks_list[[i]] + sample_id <- if (length(toks) >= 1) toks[[1]] else NA_character_ + + tumor <- NA_character_ + condition_raw <- NA_character_ + + if (length(toks) >= 2) { + if (grepl("^T\\d+$", toks[[2]], ignore.case = TRUE)) { + tumor <- toupper(toks[[2]]) + cond_tokens <- toks[-c(1,2)] + condition_raw <- if (length(cond_tokens)) paste(cond_tokens, collapse = ".") else NA_character_ + } else { + cond_tokens <- toks[-1] + condition_raw <- if (length(cond_tokens)) paste(cond_tokens, collapse = ".") else NA_character_ + } + } + + condition_norm <- condition_raw + if (!is.na(condition_norm)) { + low <- tolower(condition_norm) + if (grepl("^organoids?$", low)) condition_norm <- "organoid" + else if (grepl("^tissues?$", low)) condition_norm <- "tissue" + else if (grepl("^skin$", low)) condition_norm <- "skin" + } + + data.frame( + fname = fnames[i], + sample_id = sample_id, + tumor = ifelse(is.na(tumor), NA_character_, toupper(tumor)), + condition_raw = condition_raw, + condition_norm= condition_norm, + stringsAsFactors = FALSE + ) + }) + df <- do.call(rbind, out) + df$specimen_norm <- with(df, { + sid <- sample_id + tmr <- ifelse(is.na(tumor) | tumor == "", "", paste0("_", tumor)) + cnd <- ifelse(is.na(condition_norm) | condition_norm == "", "", paste0("_", condition_norm)) + paste0(sid, tmr, cnd) + }) + df +} + +# Functions to get data from Synapse + +read_wide_from_synapse <- function(syn, syn_id) { + message(" Reading Synapse file: ", syn_id) + df <- read.table( + syn$get(syn_id)$path, + sep = "\t", header = TRUE, quote = '"', + fill = TRUE, check.names = FALSE + ) + message(sprintf(" - Read %d rows × %d cols; first cols: %s", + nrow(df), ncol(df), paste(head(colnames(df), 8), collapse = ", "))) + df +} + +detect_value_start_col <- function(wide_df, fallback = 5) { + nms <- colnames(wide_df) + is_pathy <- grepl("\\.(raw|mzml)$", nms, ignore.case = TRUE) | + grepl("[/\\\\]", nms) | + grepl("^[A-Za-z]:\\\\", nms) + if (any(is_pathy)) { + i <- which(is_pathy)[1] + message(" Auto-detected first sample column at index ", i, " to '", nms[i], "'") + return(i) + } + message(" Did not detect path/RAW headers; using fallback value_start_col=", fallback) + fallback +} + +parse_fnames <- function(fnames, aliquot_field_index, cohort) { + message("Parsing filenames to (fname, aliquot, cohort)") + rows <- lapply(fnames, function(fname) { + toks <- strsplit(fname, "_", fixed = TRUE)[[1]] + aliq <- NA_real_ + + if (!is.null(aliquot_field_index) && + aliquot_field_index >= 1 && + aliquot_field_index <= length(toks)) { + aliq_try <- suppressWarnings(as.double(toks[[aliquot_field_index]])) + if (!is.na(aliq_try)) aliq <- aliq_try + } + + if (is.na(aliq)) { + num_tokens <- suppressWarnings(as.double(toks)) + if (any(!is.na(num_tokens))) aliq <- tail(num_tokens[!is.na(num_tokens)], 1) + } + + data.frame( + fname = fname, + aliquot = aliq, + cohort = cohort, + stringsAsFactors = FALSE + ) + }) + out <- do.call(rbind, rows) + message(sprintf(" - Parsed %d samples (aliquot NA: %d)", + nrow(out), sum(is.na(out$aliquot)))) + out +} + +# -------------------------- Feature ID builders ------------------------------ + +build_phospho_ids <- function(df) { + lsite <- tolower(df$Residue) + paste0(df$`Gene.Names`, "-", df$Residue, df$Site, lsite) +} +build_global_ids <- function(df) as.character(df$Genes) +build_rna_ids <- function(df) { + cand <- c("gene_id","Gene","gene","gene_name","Symbol","symbol","ENSEMBL","Ensembl","ensembl_gene_id") + hit <- cand[cand %in% names(df)] + if (length(hit) == 0) stop("RNA feature-id column not found.") + if ("gene_id" %in% hit) return(as.character(df[["gene_id"]])) + if ("gene_name" %in% hit) return(as.character(df[["gene_name"]])) + as.character(df[[hit[[1]]]]) +} +pick_builder <- function(modality) { + m <- tolower(modality) + if (m == "phospho") return(build_phospho_ids) + if (m == "global") return(build_global_ids) + if (m == "rna") return(build_rna_ids) + stop("Unknown modality: ", modality) +} + +# Functions to Normalize Data. Uses SummarizedExperiment + +coldata_tbl <- function(se) { + cd <- as.data.frame(SummarizedExperiment::colData(se), stringsAsFactors = FALSE) + if ("fname" %in% names(cd)) names(cd)[names(cd) == "fname"] <- ".coldata_fname" + names(cd) <- make.unique(names(cd), sep = "_") + cd$fname <- rownames(cd) + cd$fname_base <- basename_only(cd$fname) + cd$fname_stem <- basename_no_ext(cd$fname) + cd <- cd[, c("fname","fname_base","fname_stem", setdiff(names(cd), c("fname","fname_base","fname_stem"))), drop = FALSE] + cd +} + +looks_like_sample_header <- function(x) { + grepl("\\.(raw|mzml)$", x, ignore.case = TRUE) | + grepl("[/\\\\]", x) | + grepl("^[A-Za-z]:\\\\", x) +} + +make_se <- function(wide_df, value_start_col, feature_ids, fnames_df, meta, drop_name, modality) { + all_candidate <- colnames(wide_df)[value_start_col:ncol(wide_df)] + has_pathy <- any(looks_like_sample_header(all_candidate)) + if (has_pathy) { + sample_cols <- all_candidate[looks_like_sample_header(all_candidate)] + sample_cols <- setdiff(sample_cols, c("Site", "Sequence")) + } else { + sample_cols <- setdiff(all_candidate, c("Site", "Sequence")) + } + + message(" Candidate sample columns (first 6):") + print(utils::head(sample_cols, 6)) + + message(" Casting measurement block to numeric") + raw_block <- wide_df[, sample_cols, drop = FALSE] + clean_block <- as.data.frame( + lapply(raw_block, function(col) { + if (is.factor(col)) col <- as.character(col) + col[col %in% c("", "NA", "NaN", "na", "n/a", "NULL")] <- NA + suppressWarnings(as.numeric(col)) + }), + check.names = FALSE + ) + + keep <- !drop_name(colnames(clean_block)) + if (any(!keep)) { + message(" Dropping unwanted sample columns by pattern: ", sum(!keep)) + clean_block <- clean_block[, keep, drop = FALSE] + sample_cols <- sample_cols[keep] + } + + mat <- as.matrix(clean_block) + rownames(mat) <- feature_ids + colnames(mat) <- sample_cols + + fnames_df <- fnames_df %>% dplyr::semi_join(data.frame(fname = sample_cols), by = "fname") + + parsed_df <- if (tolower(modality) == "rna") parse_rna_header_triplet(fnames_df$fname) else + data.frame(fname = fnames_df$fname, stringsAsFactors = FALSE) + + message(" Joining sample meta (by aliquot & cohort)") + meta_join <- meta + if ("fname" %in% names(meta_join)) { + message(" ! Debug: Dropping 'fname' column from 'meta' to prevent duplication") + meta_join <- dplyr::select(meta_join, -fname) + } + + cdata <- fnames_df %>% + dplyr::left_join(meta_join, by = c("aliquot","cohort")) %>% + dplyr::left_join(parsed_df, by = "fname") %>% + dplyr::mutate(cohort = as.factor(cohort), + cohort_key = as.character(cohort)) + + if (!"aliquot" %in% names(cdata)) cdata$aliquot <- NA_real_ + if ("aliquot.x" %in% names(cdata) || "aliquot.y" %in% names(cdata)) { + cdata$aliquot <- dplyr::coalesce(cdata$aliquot, cdata$aliquot.x, cdata$aliquot.y) + cdata <- dplyr::select(cdata, -dplyr::any_of(c("aliquot.x","aliquot.y"))) + } + + if (tolower(modality) == "rna") { + meta_norm <- meta %>% + dplyr::mutate( + Specimen_norm = normalize_specimen_like(Specimen), + cohort_key = as.character(cohort) + ) %>% + dplyr::select(Specimen, Specimen_norm, Patient, Tumor, cohort_key) + + cdata2 <- cdata %>% + dplyr::mutate(fname_norm = normalize_specimen_like(ifelse( + is.na(specimen_norm) | specimen_norm == "", fname, specimen_norm + ))) %>% + dplyr::left_join(meta_norm, by = c("cohort_key", "fname_norm" = "Specimen_norm")) + + for (nm in c("Specimen","Patient","Tumor")) { + if (!nm %in% names(cdata)) cdata[[nm]] <- NA + cdata[[nm]] <- dplyr::coalesce(cdata[[nm]], cdata2[[nm]]) + } + + if (!"Specimen" %in% names(cdata)) cdata$Specimen <- NA_character_ + if (!"Patient" %in% names(cdata)) cdata$Patient <- NA_character_ + if (!"Tumor" %in% names(cdata)) cdata$Tumor <- NA_character_ + + cdata$Patient <- dplyr::coalesce(cdata$Patient, cdata$sample_id) + cdata$Tumor <- dplyr::coalesce(cdata$Tumor, cdata$tumor) + + make_specimen <- function(pid, tmr, cond) { + pid_clean <- pid + tmr_clean <- ifelse(is.na(tmr) | tmr == "", "", paste0("_", tmr)) + cond_clean <- ifelse(is.na(cond) | cond == "", "", paste0("_", cond)) + paste0(pid_clean, tmr_clean, cond_clean) + } + need_spec <- is.na(cdata$Specimen) | cdata$Specimen == "" + if (any(need_spec)) { + cdata$Specimen[need_spec] <- make_specimen( + pid = cdata$Patient[need_spec], + tmr = cdata$Tumor[need_spec], + cond = cdata$condition_norm[need_spec] + ) + } + } + + message(" - Example cdata rows:") + print(utils::head(cdata[, intersect(c( + "fname","aliquot","cohort","Specimen","Patient","Tumor", + "sample_id","tumor","condition_raw","condition_norm","specimen_norm" + ), names(cdata)), drop = FALSE], 10)) + + rn <- cdata$fname + cdata_nofname <- dplyr::select(cdata, -fname) + + rd <- S4Vectors::DataFrame(feature_id = rownames(mat)); rownames(rd) <- rd$feature_id + + se <- SummarizedExperiment::SummarizedExperiment( + assays = S4Vectors::SimpleList(values = mat), + rowData = rd, + colData = S4Vectors::DataFrame(cdata_nofname, row.names = rn) + ) + + message(sprintf(" - SE assay dims: %d feats × %d samples", nrow(se), ncol(se))) + message(sprintf(" - colData names: %s", paste(names(SummarizedExperiment::colData(se)), collapse = ", "))) + se +} + +scale_columns_modified_z <- function(m) { + out <- m + for (j in seq_len(ncol(m))) out[, j] <- modified_zscore(m[, j]) + out +} + +normalize_by_modality <- function(se, modality) { + mtype <- tolower(modality) + mat0 <- as.matrix(SummarizedExperiment::assay(se, "values")) + c0 <- colnames(mat0) + + message(" Normalizing modality = ", modality) + if (mtype == "phospho") { + mat0[mat0 == 0] <- NA_real_ + mat1 <- filter_by_missingness(mat0) + mlog <- log2(mat1 + 0.01) + mat2 <- scale_columns_modified_z(mlog) + } else if (mtype == "global") { + mlog <- log2(mat0) + mat2 <- scale_columns_modified_z(mlog) + } else if (mtype == "rna") { + mat1 <- filter_by_missingness(mat0) + mlog <- log2(mat1 + 1) + mat2 <- scale_columns_modified_z(mlog) + } else { + abort(paste0("Unknown modality: ", modality)) + } + + mat3 <- collapse_duplicate_features(mat2) + stopifnot(identical(colnames(mat3), c0)) + + rd <- S4Vectors::DataFrame(feature_id = rownames(mat3)); rownames(rd) <- rd$feature_id + se_out <- SummarizedExperiment::SummarizedExperiment( + assays = S4Vectors::SimpleList(values = mat3), + rowData = rd, + colData = SummarizedExperiment::colData(se)[colnames(mat3), , drop = FALSE] + ) + message(sprintf(" - Normalized assay dims: %d feats × %d samples", nrow(se_out), ncol(se_out))) + se_out +} + +# Function to Combine Data (batches) + +combine_batches_intersection <- function(se_list) { + message("Combining batches (intersection of features, then cbind samples)") + mats <- lapply(se_list, function(se) as.matrix(SummarizedExperiment::assay(se, "values"))) + feats <- Reduce(intersect, lapply(mats, rownames)) + feats <- feats[!is.na(feats) & feats != ""] + message(" - Intersection feature count: ", length(feats)) + if (length(feats) == 0) stop("No common features across batches after cleaning feature IDs.") + matsI <- lapply(seq_along(mats), function(i) { + m <- mats[[i]][feats, , drop = FALSE] + message(sprintf(" Batch %d: %d feats × %d samples after intersect", i, nrow(m), ncol(m))) + m + }) + matCB <- do.call(cbind, matsI) + + cd <- do.call(S4Vectors::rbind, lapply(se_list, SummarizedExperiment::colData)) + rd <- S4Vectors::DataFrame(feature_id = rownames(matCB)); rownames(rd) <- rd$feature_id + + se <- SummarizedExperiment( + assays = S4Vectors::SimpleList(values = matCB), + rowData = rd, + colData = cd + ) + message(sprintf(" - Combined assay dims: %d feats × %d samples", nrow(se), ncol(se))) + message(" - Head(sample names) in combined assay:") + print(utils::head(colnames(SummarizedExperiment::assay(se, "values")), 6)) + message(" - Head(rownames) in combined colData (should match):") + print(utils::head(rownames(SummarizedExperiment::colData(se)), 6)) + invisible(se) +} + +# Combat Function. (Lots of messages to help debug) + +combat_by_cohort <- function(se) { + message("Running ComBat by cohort (batch-only; mean.only = FALSE)") + suppressPackageStartupMessages(library(sva)) + + mat <- as.matrix(SummarizedExperiment::assay(se, "values")) + message(sprintf("Matrix dims before ComBat: %d features × %d samples", nrow(mat), ncol(mat))) + + n_bad <- sum(!is.finite(mat)) + if (n_bad > 0) message(" Replacing ", n_bad, " non-finite values with 0.") + mat[!is.finite(mat)] <- 0 + + cd <- as.data.frame(SummarizedExperiment::colData(se)) + cd <- cd[colnames(mat), , drop = FALSE] + if (!"cohort" %in% names(cd)) stop("colData must contain 'cohort' for ComBat batching.") + + batch <- droplevels(as.factor(cd$cohort)) + message(" Batch table (pre-drop):"); print(table(batch, useNA = "ifany")) + + keep <- !is.na(batch) + if (any(!keep)) { + message(" Dropping ", sum(!keep), " samples with NA cohort before ComBat.") + mat <- mat[, keep, drop = FALSE] + batch <- droplevels(batch[keep]) + cd <- cd[keep, , drop = FALSE] + } + + message(" Final check — ncol(mat)=", ncol(mat), "; length(batch)=", length(batch)) + message(" Batch table (final):"); print(table(batch, useNA = "ifany")) + + pre_by_cohort <- tapply(colMeans(mat), batch, sd) + message(" Pre-ComBat: SD of column means by cohort:"); print(pre_by_cohort) + + cb <- sva::ComBat(dat = mat, batch = batch, mean.only = FALSE, par.prior = TRUE) + + post_by_cohort <- tapply(colMeans(cb), batch, sd) + message(" Post-ComBat: SD of column means by cohort:"); print(post_by_cohort) + + SummarizedExperiment::assay(se, "values") <- cb + message(" Matrix dims after ComBat: ", nrow(cb), " × ", ncol(cb)) + invisible(se) +} + +# Plot Functions (PCA) + more debug messages + +se_to_long <- function(se, modality) { + feature_col <- if (tolower(modality) == "global") "Gene" else "feature_id" + + avals <- as.data.frame(SummarizedExperiment::assay(se, "values"), check.names = FALSE) + avals[[feature_col]] <- rownames(avals) + + long <- avals |> + tidyr::pivot_longer(cols = -all_of(feature_col), names_to = "fname", values_to = "correctedAbundance") + + cd <- coldata_tbl(se) + + if (!"aliquot" %in% names(cd)) cd$aliquot <- NA_real_ + if ("aliquot.x" %in% names(cd) || "aliquot.y" %in% names(cd)) { + cd$aliquot <- dplyr::coalesce(cd$aliquot, cd$aliquot.x, cd$aliquot.y) + } + + want <- c("fname","aliquot","cohort","Specimen","Patient","Tumor", + "fname_base","fname_stem", + "sample_id","tumor","condition_raw","condition_norm","specimen_norm") + have <- intersect(want, names(cd)) + long1 <- dplyr::left_join(long, cd[, have, drop = FALSE], by = "fname") + + message(" - se_to_long(): non-NA counts to Patient=", + sum(!is.na(long1$Patient)), "; Tumor=", sum(!is.na(long1$Tumor)), + "; Specimen=", sum(!is.na(long1$Specimen)), "; cohort=", sum(!is.na(long1$cohort))) + distinct(long1) +} + +pca_df_present_in_all <- function(se) { + message("Preparing PCA (features present in ALL samples)") + mat <- as.matrix(SummarizedExperiment::assay(se, "values")) + + keep_rows <- apply(mat, 1, function(r) all(is.finite(r))) + n_keep <- sum(keep_rows); n_all <- nrow(mat) + message(" - Kept ", n_keep, " / ", n_all, " features with complete data for PCA") + if (n_keep < 2) stop("Too few complete features for PCA after intersection filter.") + + pcs <- prcomp(t(mat[keep_rows, , drop = FALSE])) + + cd <- coldata_tbl(se) + df <- as.data.frame(pcs$x[, 1:2, drop = FALSE]) + df$fname <- rownames(df) + + df1 <- dplyr::left_join(df, cd, by = "fname") + + message(" - colData columns present: ", paste(setdiff(names(cd), c("fname","fname_base","fname_stem")), collapse = ", ")) + message(" - Non-NA counts in colData: Patient=", sum(!is.na(cd$Patient)), + "; Tumor=", sum(!is.na(cd$Tumor)), "; Specimen=", sum(!is.na(cd$Specimen)), + "; cohort=", sum(!is.na(cd$cohort))) + message(" - After join: n rows = ", nrow(df1)) + message(" - Non-NA counts after join: Patient=", sum(!is.na(df1$Patient)), + "; Tumor=", sum(!is.na(df1$Tumor)), "; Specimen=", sum(!is.na(df1$Specimen)), + "; cohort=", sum(!is.na(df1$cohort))) + + if (sum(!is.na(df1$Patient)) == 0) { + message(" ! Warning: Patient is NA for all samples after join. Will color/shape by cohort.") + df1$Patient_fallback <- as.character(df1$cohort) + df1$Tumor_fallback <- as.character(df1$cohort) + message(" - DEBUG: head(df1$fname):"); print(utils::head(df1$fname, 6)) + message(" - DEBUG: head(cd$fname):"); print(utils::head(cd$fname, 6)) + } + + if ("Specimen" %in% names(df1)) { + df1$Tumor <- stringr::str_remove(stringr::str_extract(df1$Specimen, "_T\\d+"), "^_") + } + + df1 +} + +plot_pca <- function(pc_df, title_text, pcols = NULL) { + color_col <- if ("Patient" %in% names(pc_df) && any(!is.na(pc_df$Patient))) { + "Patient" + } else if ("condition_norm" %in% names(pc_df) && any(!is.na(pc_df$condition_norm))) { + "condition_norm" + } else if ("tumor" %in% names(pc_df) && any(!is.na(pc_df$tumor))) { + "tumor" + } else { + "Patient_fallback" + } + + shape_col <- if ("Tumor" %in% names(pc_df) && any(!is.na(pc_df$Tumor))) { + "Tumor" + } else if ("tumor" %in% names(pc_df) && any(!is.na(pc_df$tumor))) { + "tumor" + } else { + "Tumor_fallback" + } + + g <- ggplot(pc_df, aes(PC1, PC2, col = .data[[color_col]])) + + geom_point(aes(shape = .data[[shape_col]]), size = 3) + + labs(title = title_text, color = color_col, shape = shape_col) + + theme_bw() + if (!is.null(pcols) && color_col == "Patient") g <- g + scale_color_manual(values = pcols) + print(g) + g +} + +plot_hist <- function(se, title_text) { + cd <- coldata_tbl(se) + df <- as.data.frame(SummarizedExperiment::assay(se, "values")) |> + tidyr::pivot_longer(everything(), names_to = "fname", values_to = "val") |> + dplyr::left_join(cd[, c("fname","cohort")], by = "fname") + g <- ggplot(df, aes(x = val, fill = as.factor(cohort))) + + geom_histogram(bins = 60, alpha = 0.9) + + labs(title = title_text, x = "Value", fill = "Cohort") + + theme_bw() + print(g) + g +} + +# Upload function + +perform_uploads <- function(paths, syn, parent_id) { + if (is.null(parent_id) || length(paths) == 0) return(invisible(NULL)) + message("All steps succeeded — uploading ", length(paths), " file(s) to Synapse…") + for (p in paths) { + fullp <- normalizePath(p, winslash = "/", mustWork = FALSE) + message(" Uploading: ", basename(p), " (", fullp, ")") + f <- syn$store(synapser::File(p, parentId = parent_id)) + message(" Uploaded: ", basename(p), " (local: ", fullp, ") to Synapse ID: ", f$properties$id) + } + message("Uploads complete.") +} + + + +# ------------------------------- Main entry ---------------------------------- +# This is how we call the function / pipeline + +run_modality <- function( + modality, + batches, + meta, + syn, + drop_name_substrings = NULL, + out_dir = ".", + out_prefix = NULL, + upload_parent_id = NULL, + pcols = NULL, + write_outputs = TRUE, # master toggle to write CSV/PDF & upload + save_basename = NULL, # override base name used in output files + do_batch_correct = TRUE # if FALSE, skip ComBat and use combined matrix +) { + message("==================================================") + message("Starting run_modality(): ", modality) + message("Output directory: ", normalizePath(out_dir, winslash = "/", mustWork = FALSE)) + if (!dir.exists(out_dir)) dir.create(out_dir, recursive = TRUE) + if (is.null(out_prefix)) out_prefix <- gsub("[^A-Za-z0-9]+", "_", tolower(modality)) + file_stem <- if (!is.null(save_basename) && nzchar(save_basename)) save_basename else out_prefix + + upload_queue <- character(0) + results <- NULL + + tryCatch({ + drop_name <- make_dropper(drop_name_substrings) + build_ids <- pick_builder(modality) + + # ---- Per-batch normalization ------------------------------------------------ + se_list <- vector("list", length(batches)) + for (i in seq_along(batches)) { + message("--------------------------------------------------") + message("Batch ", i, " of ", length(batches)) + b <- batches[[i]] + + wide <- read_wide_from_synapse(syn, b$syn_id) + + if (tolower(modality) == "phospho") { + message(" Preprocessing phospho table: drop blank Gene.Names; build site ids") + wide <- wide %>% dplyr::filter(!is.na(.data$Gene.Names), .data$Gene.Names != "") + } else if (tolower(modality) == "global") { + message(" Preprocessing global table: split multi-symbol rows (Genes by ';')") + wide <- tidyr::separate_rows(wide, Genes, sep = ";") |> + dplyr::mutate(Genes = trimws(Genes)) |> + dplyr::filter(!is.na(Genes) & Genes != "") + } else if (tolower(modality) == "rna") { + message(" Preprocessing RNA table: (using merged Salmon matrix; samples start after gene_id/gene_name)") + } + + feats <- build_ids(wide) + ok <- !is.na(feats) & feats != "" + if (!all(ok)) { + message(" Dropping ", sum(!ok), " empty/NA feature IDs before SE construction.") + wide <- wide[ok, , drop = FALSE] + feats <- feats[ok] + } + + fallback_col <- if (tolower(modality) == "rna") 3 else 5 + first_col <- if (!is.null(b$value_start_col)) b$value_start_col else detect_value_start_col(wide, fallback = fallback_col) + + fnmap <- parse_fnames(colnames(wide)[first_col:ncol(wide)], b$fname_aliquot_index, b$cohort) + if (tolower(modality) == "rna" && any(is.na(fnmap$aliquot))) { + message(" ! Note: ", sum(is.na(fnmap$aliquot)), " RNA sample(s) with NA aliquot after parsing (expected for RNA headers).") + } + + se0 <- make_se(wide, value_start_col = first_col, feature_ids = feats, + fnames_df = fnmap, meta = meta, drop_name = drop_name, modality = modality) + se_n <- normalize_by_modality(se0, modality) + se_list[[i]] <- se_n + + if (write_outputs) { + message(" Writing per-batch normalized long table (pre-ComBat)") + batch_long <- se_to_long(se_n, modality) + batch_tag <- paste0("batch", b$cohort) + batch_path <- file.path(out_dir, paste0(file_stem, "_", batch_tag, "_normalized_long.csv")) + readr::write_csv(batch_long, batch_path) + upload_queue <- c(upload_queue, batch_path) + } + } + + # ---- Combine (INTERSECTION) & pre-QC --------------------------------------- + message("--------------------------------------------------") + se_combined <- combine_batches_intersection(se_list) + + message(" Pre-QC plots (PCA & histogram) on combined (pre-ComBat)") + pre_pc_df <- pca_df_present_in_all(se_combined) + pre_pca <- plot_pca(pre_pc_df, paste0(modality, " samples"), pcols = pcols) + pre_hist <- plot_hist(se_combined, paste0(modality, ": value distribution (pre-ComBat)")) + if (write_outputs) { + pre_pca_pdf <- file.path(out_dir, paste0(file_stem, "_preComBat_PCA.pdf")) + pre_hist_pdf <- file.path(out_dir, paste0(file_stem, "_preComBat_Hist.pdf")) + ggsave(pre_pca_pdf, pre_pca, width = 7, height = 4.5, device = cairo_pdf) + ggsave(pre_hist_pdf, pre_hist, width = 7, height = 4.5, device = cairo_pdf) + upload_queue <- c(upload_queue, pre_pca_pdf, pre_hist_pdf) + } + + # ---- Optional ComBat (batch-only) ------------------------------------------ + if (isTRUE(do_batch_correct)) { + message("--------------------------------------------------") + se_post <- combat_by_cohort(se_combined) + post_suffix <- "_batchCorrected" + post_title <- paste0("Batch-corrected ", modality, " samples") + } else { + message("--------------------------------------------------") + message("Skipping ComBat per do_batch_correct=FALSE; using combined matrix as 'post'.") + se_post <- se_combined + post_suffix <- "_noBatchCorrect" + post_title <- paste0("Combined ", modality, " samples (no ComBat)") + } + + # ---- Exports ---------------------------------------------------------------- + message(" Building long tables") + long_pre <- se_to_long(se_combined, modality) |> + dplyr::filter(is.finite(correctedAbundance)) + long_post <- se_to_long(se_post, modality) + + if (write_outputs) { + path_pre <- file.path(out_dir, paste0(file_stem, "_preComBat_long.csv")) + path_post <- file.path(out_dir, paste0(file_stem, post_suffix, ".csv")) + write_csv(long_pre, path_pre) + write_csv(long_post, path_post) + upload_queue <- c(upload_queue, path_pre, path_post) + } + + # ---- Post (either ComBat or not) QC ---------------------------------------- + message(" Post-QC plots (PCA & histogram)") + pc_df <- pca_df_present_in_all(se_post) + gpca <- plot_pca(pc_df, post_title, pcols = pcols) + ghist <- plot_hist(se_post, paste0(modality, ": value distribution", ifelse(isTRUE(do_batch_correct), "", " (no ComBat)"))) + if (write_outputs) { + post_pca_pdf <- file.path(out_dir, paste0(file_stem, ifelse(isTRUE(do_batch_correct), "_PCA.pdf", "_PCA_noComBat.pdf"))) + post_hist_pdf <- file.path(out_dir, paste0(file_stem, ifelse(isTRUE(do_batch_correct), "_Hist.pdf", "_Hist_noComBat.pdf"))) + ggsave(post_pca_pdf, gpca, width = 7, height = 4.5, device = cairo_pdf) + ggsave(post_hist_pdf, ghist, width = 7, height = 4.5, device = cairo_pdf) + upload_queue <- c(upload_queue, post_pca_pdf, post_hist_pdf) + } + + # ---- Pack results for return ------------------------------------------------ + results <- list( + se_batches = se_list, + se_combined = se_combined, + se_corrected = if (isTRUE(do_batch_correct)) se_post else NULL, + se_post = se_post, # always populated (corrected or not) + did_combat = isTRUE(do_batch_correct), + long_pre = long_pre, + long_post = long_post, + pca_df_pre = pre_pc_df, + pca_df_post = pc_df, + plots = list(pre_pca = pre_pca, pre_hist = pre_hist, pca = gpca, hist = ghist), + files = if (write_outputs) list(queued = upload_queue) else list() + ) + + # ---- FINAL STEP: Uploads ---------------------------------------------------- + if (write_outputs && !is.null(upload_parent_id)) { + perform_uploads(upload_queue, syn, upload_parent_id) + } else if (!write_outputs) { + message("write_outputs=FALSE — skipping all writes/uploads.") + } else { + message("No upload_parent_id provided — skipping uploads.") + } + + }, error = function(e) { + message("ERROR: run_modality() failed for ", modality, ". No uploads were attempted.") + message(" ", conditionMessage(e)) + message("------- DEBUG SNAPSHOT -------") + message(" traceback:"); print(sys.calls()) + message(" sessionInfo():"); print(utils::sessionInfo()) + message("------- END DEBUG -----------") + stop(e) + }) + + message("run_modality() finished successfully for: ", modality) + results +} diff --git a/02_run_normalize_omics.Rmd b/02_run_normalize_omics.Rmd new file mode 100644 index 0000000..5c852ec --- /dev/null +++ b/02_run_normalize_omics.Rmd @@ -0,0 +1,165 @@ +--- +title: "run_normalize_omics" +author: "JJ" +date: "2025-11-04" +output: html_document +--- + +# Get Helper Scripts +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +library(synapser) +synLogin() + +syn <- list(get = synapser::synGet, store = synapser::synStore) + + +# Load helper metadata (your cNF_helper_code.R defines 'meta' and 'pcols') +source("cNF_helper_code.R") + +# Source the pipeline +source("02_normalize_batchcorrect_omics.R") + +``` + + +# Run batch correction / normalization across phospho, global, and rna samples. + +```{r} + +# --------------------------------------------------------------------------- +# run_modality() — quick reference for args & expected batch structure +# --------------------------------------------------------------------------- +# Args: +# modality : Character. One of "Global", "Phospho", or "rna". +# Controls feature-ID parsing and normalization rules. +# +# batches : List of per-cohort lists describing each input table. +# See “Batch spec” below. +# +# meta : Data frame with sample metadata. Must contain at least: +# - cohort (int or factor; matches batches[[i]]$cohort) +# - Specimen, Patient, Tumor (optional but used if present) +# For RNA, if aliquot joins fail, headers are matched to +# meta$Specimen using a normalized form. Generated by cNF_helper_code.R +# +# syn : An initialized synapser client (e.g., syn <- synapser::synLogin()). +# +# drop_name_substrings: Character vector of substrings. Any sample column whose +# name contains one of these (regex OR) is dropped. +# Use NULL or character(0) to keep all samples. +# +# out_dir : Directory where outputs/QC are written. Created if missing. +# +# out_prefix : String used in filenames (e.g., "global", "phospho", "rna"). +# If save_basename is NULL, this is also used as the basename. +# +# upload_parent_id : Synapse folder/entity ID to upload written files. +# If NULL, no uploads occur. +# +# pcols : Optional named color vector for plotting (e.g., by patient). +# If NULL, ggplot defaults are used. +# +# write_outputs : Logical. If TRUE, write CSVs/PDFs to out_dir (and upload +# when upload_parent_id is set). If FALSE, nothing is written/ +# uploaded; results are returned in-memory only. +# +# save_basename : Optional string to control the root of output filenames. +# If NULL, falls back to out_prefix. Useful when you want the +# directory name (out_dir) and the file basename to differ. +# +# do_batch_correct : Logical. If TRUE, applies ComBat across cohorts (batch-only). +# If FALSE, skips ComBat; filenames will include “_noBatchCorrect”. +# +# Returns: +# A list with: +# - se_batches : List of per-batch SummarizedExperiment objects (normalized). +# - se_combined : Combined SE after feature intersection. +# - se_corrected : Post-ComBat SE (or the same as combined if do_batch_correct=FALSE). +# - long_pre : Long-format data frame pre-ComBat (finite values only). +# - long_post : Long-format data frame post-ComBat (or pre if no ComBat). +# - pca_df_pre / pca_df_post : Data frames used for PCA scatter plots. +# - plots : ggplot objects for PCA & histograms. +# - files : Paths of any written files (if write_outputs=TRUE). +# +# +# --------------------------------------------------------------------------- + + +# Substrings to drop (These were the protocol optimization samples) +drop_subs <- c( + "cNF_organoid_DIA_G_02_11Feb25", + "cNF_organoid_DIA_G_05_11Feb25", + "cNF_organoid_DIA_G_06_11Feb25", + "cNF_organoid_DIA_P_02_29Jan25", + "cNF_organoid_DIA_P_05_11Feb25", + "cNF_organoid_DIA_P_06_11Feb25" +) + +# PHOSPHO BATCHES +phospho_batches <- list( + list(syn_id = "syn69963552", cohort = 1, value_start_col = 5, fname_aliquot_index = 8), + list(syn_id = "syn69947351", cohort = 2, value_start_col = 5, fname_aliquot_index = 9) +) + +# Run +phospho <- run_modality( + modality = "Phospho", + batches = phospho_batches, + meta = meta, + syn = syn, + drop_name_substrings = drop_subs, + out_dir = "phospho_test", + out_prefix = "phospho", + upload_parent_id = "syn70078365", + pcols = pcols, + write_outputs = FALSE, + save_basename = "Phospho_batch_corrected", + do_batch_correct = TRUE +) + +# GLOBAL BATCHES +global_batches <- list( + list(syn_id = "syn69947355", cohort = 1, value_start_col = 5, fname_aliquot_index = 6), + list(syn_id = "syn69947352", cohort = 2, value_start_col = 5, fname_aliquot_index = 7) +) + +global <- run_modality( + modality = "Global", + batches = global_batches, + meta = meta, + syn = syn, + drop_name_substrings = drop_subs, + out_dir = "global_test", + out_prefix = "global", + upload_parent_id = "syn70078365", + pcols = pcols, + write_outputs = FALSE, + save_basename = "Global_batch_corrected", + do_batch_correct = TRUE +) + + +# RNA BATCHES +rna_batches <- list( + list(syn_id = "syn66352931", cohort = 1, value_start_col = 5, fname_aliquot_index = 6), + list(syn_id = "syn70765053", cohort = 2, value_start_col = 5, fname_aliquot_index = 7) +) + +rna <- run_modality( + modality = "rna", + batches = rna_batches, + meta = meta, + syn = syn, + drop_name_substrings = drop_subs, + out_dir = "rna_test", + out_prefix = "rna", + upload_parent_id = "syn71099587", + pcols = pcols, + write_outputs = FALSE, + save_basename = "RNA_Matrix_no_batch_correct", + do_batch_correct = FALSE #Note this is set to false right now. Not needed for RNA +) + + +``` diff --git a/03_analyze_modality_correlations.R b/03_analyze_modality_correlations.R new file mode 100644 index 0000000..10fa56e --- /dev/null +++ b/03_analyze_modality_correlations.R @@ -0,0 +1,279 @@ +# analyze_modality.R +suppressPackageStartupMessages({ + library(dplyr) + library(tidyr) + library(tibble) + library(ggplot2) + library(pheatmap) +}) + +dir.create("figs", showWarnings = FALSE) + + +# Helpers + +make_feature_matrix <- function(df_long, shared_ids, sample_col, feature_col, value_col) { + # Strict cleaning: trim to character, drop NA/blank IDs, then pivot + df <- df_long %>% + ungroup() %>% + # keep only shared sample IDs first (as before) + dplyr::filter(.data[[sample_col]] %in% shared_ids) %>% + # coerce and trim IDs + mutate( + !!sample_col := trimws(as.character(.data[[sample_col]])), + !!feature_col := trimws(as.character(.data[[feature_col]])) + ) + + # Drop rows with NA/blank sample/feature IDs + bad_sample <- is.na(df[[sample_col]]) | df[[sample_col]] == "" + bad_feature <- is.na(df[[feature_col]]) | df[[feature_col]] == "" + if (any(bad_sample)) message("[make_feature_matrix] Dropping ", sum(bad_sample), " rows with NA/blank ", sample_col) + if (any(bad_feature)) message("[make_feature_matrix] Dropping ", sum(bad_feature), " rows with NA/blank ", feature_col) + df <- df[!(bad_sample | bad_feature), , drop = FALSE] + + if (!nrow(df)) { + warning("[make_feature_matrix] No rows left after cleaning; returning empty frame.") + out <- data.frame(check.names = FALSE) + return(out) + } + + # Pivot to wide + wide <- df %>% + dplyr::select(all_of(c(sample_col, feature_col, value_col))) %>% + tidyr::pivot_wider( + names_from = all_of(feature_col), + values_from = all_of(value_col), + values_fill = 0, + values_fn = mean + ) %>% + as.data.frame(check.names = FALSE) + + # Guard against NA/blank rownames after pivot + rn <- wide[[sample_col]] + bad_rn <- is.na(rn) | rn == "" + if (any(bad_rn)) { + message("[make_feature_matrix] Removing ", sum(bad_rn), " rows with NA/blank rownames after pivot.") + wide <- wide[!bad_rn, , drop = FALSE] + rn <- rn[!bad_rn] + } + + if (!nrow(wide)) { + warning("[make_feature_matrix] Wide table is empty after removing bad rownames; returning empty frame.") + out <- data.frame(check.names = FALSE) + return(out) + } + + # Finalize rownames (must be unique, non-empty) + rownames(wide) <- make.unique(as.character(rn), sep = "_dup") + wide[[sample_col]] <- NULL + wide +} + +make_drug_matrix <- function( + fits, metric = "uM_viability", + sample_col = "improve_sample_id", + drug_col = "improve_drug_id", + value_col = "dose_response_value", + metric_col = "dose_response_metric" +) { + fits %>% + dplyr::filter(.data[[metric_col]] == metric) %>% + dplyr::select(all_of(c(sample_col, drug_col, value_col))) %>% + tidyr::pivot_wider( + names_from = all_of(drug_col), + values_from = all_of(value_col), + values_fn = mean + ) %>% + tibble::column_to_rownames(sample_col) +} + +summarize_drugs <- function( + fits, metric = "uM_viability", metric_col = "dose_response_metric", + outdir = "figs", rotate_x = 45 +) { + ds <- fits %>% + dplyr::filter(.data[[metric_col]] == metric) %>% + group_by(.data$improve_drug_id) %>% + distinct() %>% + summarize( + meanResponse = mean(.data$dose_response_value, na.rm = TRUE), + nMeasured = n_distinct(.data$improve_sample_id), + variability = sd(.data$dose_response_value, na.rm = TRUE), + .groups = "drop" + ) + + p_eff <- ds %>% + arrange(desc(.data$meanResponse)) %>% + dplyr::filter(.data$meanResponse < 0.5) %>% + ggplot(aes(y = .data$meanResponse, x = .data$improve_drug_id, + colour = .data$nMeasured, size = .data$variability)) + + geom_point() + + theme_minimal() + + theme(axis.text.x = element_text(angle = rotate_x, hjust = 1)) + + labs(title = "Most efficacious drugs", + y = "Mean cell viability (fraction)", x = "Drug") + + p_var <- ds %>% + arrange(desc(.data$variability)) %>% + dplyr::filter(.data$variability > 0.15) %>% + ggplot(aes(y = .data$meanResponse, x = .data$improve_drug_id, + colour = .data$nMeasured, size = .data$variability)) + + geom_point() + + theme_minimal() + + theme(axis.text.x = element_text(angle = rotate_x, hjust = 1)) + + labs(title = "Most variable drugs", + y = "Mean cell viability (fraction)", x = "Drug") + + ggsave(file.path(outdir, "most_efficacious.pdf"), p_eff, width = 12, height = 8, dpi = 300) + ggsave(file.path(outdir, "most_variable.pdf"), p_var, width = 12, height = 8, dpi = 300) + + list(summary = ds, p_eff = p_eff, p_var = p_var) +} + +compute_cors <- function(drug_mat, feat_mat, shared_samples = NULL) { + if (is.null(shared_samples)) { + shared_samples <- base::intersect(rownames(drug_mat), rownames(feat_mat)) + } + if (length(shared_samples) == 0L) { + return(tibble( + drug = character(), feature = character(), + cor = numeric(), pval = numeric(), fdr = numeric(), + direction = character() + )) + } + + drug_mat <- drug_mat[shared_samples, , drop = FALSE] + feat_mat <- feat_mat[shared_samples, , drop = FALSE] + + cres <- suppressWarnings( + stats::cor(drug_mat, feat_mat, use = "pairwise.complete.obs", method = "spearman") + ) %>% + as.data.frame() %>% + tibble::rownames_to_column("drug") %>% + tidyr::pivot_longer(cols = - "drug", names_to = "feature", values_to = "cor") + + csig <- do.call(rbind, lapply(colnames(drug_mat), function(d) { + do.call(rbind, lapply(colnames(feat_mat), function(f) { + dv <- drug_mat[, d]; fv <- feat_mat[, f] + p <- NA_real_ + if (sum(is.finite(dv) & is.finite(fv)) >= 3) { + p <- tryCatch( + stats::cor.test(dv, fv, method = "spearman", use = "pairwise.complete.obs")$p.value, + error = function(e) NA_real_ + ) + } + c(drug = d, feature = f, pval = p) + })) %>% + as.data.frame() + })) %>% + as.data.frame() %>% + mutate(pval = as.numeric(.data$pval)) %>% + mutate(fdr = p.adjust(.data$pval, method = "BH")) + + left_join(cres, csig, by = c("drug","feature")) %>% + mutate(direction = ifelse(.data$cor < 0, "neg", "pos")) +} + +summarize_correlated_features <- function(cor_tbl, fdr_thresh = 0.25, outdir = "figs") { + if (nrow(cor_tbl) == 0L) return(list(summary = tibble(), plot = NULL)) + corsummary <- cor_tbl %>% + dplyr::filter(is.finite(.data$fdr), !is.na(.data$fdr), .data$fdr < fdr_thresh) %>% + mutate(direction = ifelse(.data$cor > 0, "pos", "neg")) %>% + group_by(.data$drug, .data$direction) %>% + summarize(features = n(), meanCor = mean(.data$cor), .groups = "drop") + + if (nrow(corsummary) == 0L) return(list(summary = corsummary, plot = NULL)) + + p <- corsummary %>% + dplyr::filter(.data$features > 1) %>% + ggplot(aes(x = .data$drug, y = .data$features, fill = .data$direction)) + + geom_col(position = "dodge") + + theme_minimal() + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) + + labs(title = paste0("Significant feature counts per drug (FDR < ", fdr_thresh, ")"), + x = "Drug", y = "# Features") + + ggsave(file.path(outdir, "cor_features_by_drug.pdf"), p, width = 12, height = 6, units = "in") + list(summary = corsummary, plot = p) +} + +# --------------------------- +# Main wrapper +# --------------------------- +# Returns: list(drug_mat, feat_mat, shared_ids, cor_tbl, cor_summary, cor_plot, drug_summary) +analyze_modality <- function( + fits, + df_long, + sample_col, # e.g., "Specimen" + feature_col, # e.g., "feature_id" | "Gene" | "site" + value_col, # e.g., "correctedAbundance" + metric = "uM_viability", + outdir = "figs", + heatmap_filename = "drug_heatmap_large.pdf", + fdr_thresh = 0.25 +) { + # Pre-intersection like original (all metrics) + shared_ids <- base::intersect(unique(fits$improve_sample_id), unique(df_long[[sample_col]])) + + # Feature matrix over shared IDs + feat_mat <- make_feature_matrix( + df_long = df_long, + shared_ids = shared_ids, + sample_col = sample_col, + feature_col = feature_col, + value_col = value_col + ) + + # Drug matrix for the exact metric (no normalization/fallbacks) + drug_mat <- make_drug_matrix( + fits = fits, + metric = metric, + sample_col = "improve_sample_id", + drug_col = "improve_drug_id", + value_col = "dose_response_value", + metric_col = "dose_response_metric" + ) + + # Summaries & heatmap (original style) + dsum <- summarize_drugs( + fits, metric = metric, metric_col = "dose_response_metric", outdir = outdir + ) + + if (!is.null(heatmap_filename) && nrow(drug_mat) > 0 && ncol(drug_mat) > 0) { + fulldrugs <- dsum$summary %>% + dplyr::filter(.data$nMeasured == nrow(drug_mat)) %>% + pull(.data$improve_drug_id) + + subm <- drug_mat[, colnames(drug_mat) %in% fulldrugs, drop = FALSE] + if (nrow(subm) > 1 && ncol(subm) > 0) { + pheatmap::pheatmap( + as.matrix(subm), + filename = file.path(outdir, heatmap_filename), + width = 28, height = 16, + angle_col = 45, fontsize_col = 6, + cluster_rows = TRUE, cluster_cols = TRUE, + show_rownames = TRUE, show_colnames = TRUE + ) + } + } + + # Correlations (only if overlap) + shared_after <- base::intersect(rownames(drug_mat), rownames(feat_mat)) + cor_tbl <- if (length(shared_after) > 0L) { + compute_cors(drug_mat, feat_mat, shared_samples = shared_after) + } else { + tibble(drug = character(), feature = character(), cor = numeric(), + pval = numeric(), fdr = numeric(), direction = character()) + } + cor_res <- summarize_correlated_features(cor_tbl, fdr_thresh = fdr_thresh, outdir = outdir) + + list( + drug_mat = drug_mat, + feat_mat = feat_mat, + shared_ids = shared_ids, + cor_tbl = cor_tbl, + cor_summary = cor_res$summary, + cor_plot = cor_res$plot, + drug_summary = dsum$summary + ) +} diff --git a/04_analyze_modality_and_pathway_enrich.Rmd b/04_analyze_modality_and_pathway_enrich.Rmd new file mode 100644 index 0000000..c55b038 --- /dev/null +++ b/04_analyze_modality_and_pathway_enrich.Rmd @@ -0,0 +1,114 @@ +--- +title: "run_analyze_modality_and_enrichment" +author: "JJ" +date: "2025-11-12" +output: html_document +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) + + +source("cNF_helper_code.R") +source("03_analyze_modality_correlations.R") +source("04_leapr_biomarker.R") +``` + + +```{r} + +drugs <- readr::read_tsv(synGet("syn69947322")$path) + +#RNA +rlong <- readr::read_csv(synGet('syn71333780')$path) +#Global +glong <- readr::read_csv(synGet('syn70078416')$path) +#Phospho +plong <- readr::read_csv(synGet('syn70078415')$path) + +corr_rna <- analyze_modality( + fits = drugs, + df_long = rlong, + sample_col = "Specimen", + feature_col = "feature_id", + value_col = "correctedAbundance", + metric = "uM_viability", + heatmap_filename = "rna_drug_heatmap_large_viability.pdf", + outdir = "new_figs" +) + +corr_global <- analyze_modality( + fits = drugs, + df_long = glong, + sample_col = "Specimen", + feature_col = "Gene", + value_col = "correctedAbundance", + metric = "uM_viability", + heatmap_filename = "global_drug_heatmap_large_viability.pdf", + outdir = "new_figs" +) + +corr_phospho <- analyze_modality( + fits = drugs, # syn69947322 TSV already read + df_long = plong, # your RNA or proteo/phospho long table + sample_col = "Specimen", + feature_col = "site", + value_col = "correctedAbundance", + metric = "uM_viability", + heatmap_filename = "phospho_drug_heatmap_large_viability.pdf", + outdir = "new_figs" +) + +``` + + +```{r warning=FALSE} + +res_bio_rna <- run_leapr_directional_one_cached( + drugs = drugs, + df_long = rlong, + sample_col = "Specimen", + feature_col = "feature_id", + value_col = "correctedAbundance", + omic_label = "rna", + cache_path = "leapR_RNA_krbpaths_enrichment_direction_split.Rdata", + write_csvs = TRUE, + always_rerun = FALSE, + test_one = FALSE, + geneset_name = "krbpaths" +) + + +res_bio_global <- run_leapr_directional_one_cached( + drugs = drugs, + df_long = glong, + sample_col = "Specimen", + feature_col = "Gene", + value_col = "correctedAbundance", + omic_label = "global", + cache_path = "leapR_Global_krbpaths_enrichment_direction_split.Rdata", + write_csvs = TRUE, + always_rerun = FALSE, + test_one = FALSE, + geneset_name = "krbpaths" +) + + +res_bio_phospho <- run_leapr_directional_one_cached( + drugs = drugs, + df_long = plong, + sample_col = "Specimen", + feature_col = "site", + value_col = "correctedAbundance", + omic_label = "phospho", + cache_path = "leapR_Phospho_kinasesubstrates_enrichment_direction_split.Rdata", + write_csvs = TRUE, + always_rerun = FALSE, + test_one = FALSE, + geneset_name = "kinasesubstrates" +) + + + + +``` diff --git a/04_leapr_biomarker.R b/04_leapr_biomarker.R new file mode 100644 index 0000000..f679d82 --- /dev/null +++ b/04_leapr_biomarker.R @@ -0,0 +1,467 @@ +# leapr_biomarker.R + +suppressPackageStartupMessages({ + library(dplyr) + library(tidyr) + library(readr) + library(stringr) + library(tibble) + library(SummarizedExperiment) + library(S4Vectors) + library(leapR) + library(ggplot2) + library(grDevices) +}) + +# ----------------------------- +# Helpers +# ----------------------------- +# Long to wide matrix (rows = samples, cols = features) +# df_long columns: +# sample_col : sample IDs (e.g., "Specimen") +# feature_col : feature IDs (e.g., "feature_id", "Gene", "site") depending on input group +# value_col : numeric values (e.g., "correctedAbundance") +long_to_matrix <- function(df_long, sample_col, feature_col, value_col) { + if (is.null(df_long) || !nrow(df_long)) return(NULL) + + df <- df_long |> + dplyr::mutate( + !!sample_col := trimws(as.character(.data[[sample_col]])), + !!feature_col := trimws(as.character(.data[[feature_col]])) + ) + + # Drop NA/blank sample or feature IDs with messages + bad_sample <- is.na(df[[sample_col]]) | df[[sample_col]] == "" + bad_feature <- is.na(df[[feature_col]]) | df[[feature_col]] == "" + n_bad_s <- sum(bad_sample, na.rm = TRUE) + n_bad_f <- sum(bad_feature, na.rm = TRUE) + if (n_bad_s > 0) message("[long_to_matrix] Dropping ", n_bad_s, " rows with NA/blank ", sample_col) + if (n_bad_f > 0) message("[long_to_matrix] Dropping ", n_bad_f, " rows with NA/blank ", feature_col) + df <- df[!(bad_sample | bad_feature), , drop = FALSE] + + if (!nrow(df)) { + warning("[long_to_matrix] No rows left after removing NA/blank sample/feature IDs.") + return(NULL) + } + + # Pivot to wide + wide <- df %>% + dplyr::select( + !!rlang::sym(sample_col), + !!rlang::sym(feature_col), + !!rlang::sym(value_col) + ) %>% + tidyr::pivot_wider( + names_from = !!rlang::sym(feature_col), + values_from = !!rlang::sym(value_col), + values_fill = 0, + values_fn = mean + ) %>% + as.data.frame(check.names = FALSE) + + rn <- wide[[sample_col]] + bad_rn <- is.na(rn) | rn == "" + if (any(bad_rn)) { + message("[long_to_matrix] Removing ", sum(bad_rn), " rows with NA/blank rownames after pivot.") + wide <- wide[!bad_rn, , drop = FALSE] + rn <- rn[!bad_rn] + } + if (!nrow(wide)) { + warning("[long_to_matrix] Wide table is empty after cleaning.") + return(NULL) + } + + rownames(wide) <- make.unique(as.character(rn), sep = "_dup") + wide[[sample_col]] <- NULL + as.matrix(wide) +} + + + +# ---- PHOSPHO +.extract_gene_from_site <- function(site_id) { + if (is.na(site_id) || site_id == "") return(NA_character_) + x <- as.character(site_id) + + # Get chars before first '-' (e.g., "AAAS-S495s" -> "AAAS") + gene <- sub("^([^\\-]+)-.*$", "\\1", x, perl = TRUE) + + # If no '-' present, fall back to splitting on common delimiters + if (identical(gene, x)) { + parts <- strsplit(x, "[|:_\\-\\.]", fixed = FALSE)[[1]] + gene <- parts[1] + } + gene <- sub("^([A-Za-z0-9]+).*", "\\1", gene) + gene <- toupper(gene) + if (nchar(gene) == 0) return(NA_character_) + gene +} + +# Build phospho site gene map from long table +.build_phospho_gene_map_from_long <- function(df_long, feature_col) { + gene_cols <- c("Gene","gene","hgnc_id","hgnc_symbol","protein","Protein","Symbol","symbol") + has <- gene_cols[gene_cols %in% colnames(df_long)] + if (length(has)) { + gcol <- has[[1]] + mp <- df_long %>% + dplyr::select(!!rlang::sym(feature_col), !!rlang::sym(gcol)) %>% + dplyr::rename(site = !!rlang::sym(feature_col), gene = !!rlang::sym(gcol)) %>% + dplyr::mutate(site = trimws(as.character(site)), + gene = toupper(trimws(as.character(gene)))) %>% + dplyr::filter(!is.na(site), site != "", !is.na(gene), gene != "") %>% + dplyr::distinct(site, gene) + if (nrow(mp)) return(setNames(mp$gene, mp$site)) + } + sites <- unique(trimws(as.character(df_long[[feature_col]]))) + sites <- sites[!is.na(sites) & sites != ""] + if (!length(sites)) return(NULL) + genes <- vapply(sites, .extract_gene_from_site, FUN.VALUE = character(1)) + genes[genes == ""] <- NA_character_ + setNames(genes, sites) +} + + +.collapse_sites_to_genes <- function(cor_named_vec, map_site2gene, agg = c("mean","maxabs")) { + agg <- match.arg(agg) + if (is.null(map_site2gene) || !length(cor_named_vec)) return(cor_named_vec) + + # Align and drop unmapped + genes <- map_site2gene[names(cor_named_vec)] + keep <- !is.na(genes) & genes != "" + v <- cor_named_vec[keep] + g <- genes[keep] + if (!length(v)) return(setNames(numeric(0), character(0))) + + if (agg == "mean") { + # mean per gene + df <- tibble(gene = g, val = as.numeric(v)) %>% + group_by(gene) %>% summarise(val = mean(val, na.rm = TRUE), .groups = "drop") + out <- stats::setNames(df$val, df$gene) + } else { + # max by absolute value, keep sign + df <- tibble(gene = g, val = as.numeric(v)) %>% + mutate(ord = order(-abs(val))) %>% + group_by(gene) %>% + slice_max(order_by = abs(val), n = 1, with_ties = FALSE) %>% + ungroup() + out <- stats::setNames(df$val, df$gene) + } + out +} + +# ---- Normalize phosphosite IDs to match kinasesubstrates (e.g. "AAAS-S495s" -> "AAAS-S495") +.normalize_kinase_site_id <- function(x) { + x <- as.character(x) + x <- trimws(x) + # Drop trailing lowercase letters + sub("[a-z]+$", "", x) +} + +# Spearman correlations +.col_spearman <- function(vec, mat) { + shared <- intersect(names(vec), rownames(mat)) + if (length(shared) < 3) return(setNames(rep(NA_real_, ncol(mat)), colnames(mat))) + v <- vec[shared] + m <- as.matrix(mat[shared, , drop = FALSE]) + apply(m, 2, function(col) { + if (all(is.na(col))) return(NA_real_) + if (sd(col, na.rm = TRUE) == 0 || sd(v, na.rm = TRUE) == 0) return(NA_real_) + suppressWarnings(cor(v, col, method = "spearman", use = "pairwise.complete.obs")) + }) +} + +# Build SummarizedExperiment to feed into leapR +.build_se_from_corvec <- function(cor_named_vec, features_all, col_label, + map_to_gene = NULL, assay_label = "proteomics") { + v <- rep(NA_real_, length(features_all)); names(v) <- features_all + common <- intersect(names(cor_named_vec), features_all) + v[common] <- cor_named_vec[common] + mat <- matrix(v, nrow = length(v), ncol = 1, dimnames = list(features_all, col_label)) + rd <- S4Vectors::DataFrame(feature_id = features_all) + rd$hgnc_id <- if (is.null(map_to_gene)) features_all else map_to_gene[features_all] + se <- SummarizedExperiment::SummarizedExperiment( + assays = list(values = mat), + rowData = rd, + colData = S4Vectors::DataFrame(sample = col_label) + ) + SummarizedExperiment::assayNames(se) <- assay_label + se +} + +.safe_leapr <- function(...) { + tryCatch(leapR::leapR(...), + error = function(e) { message("[leapR] ", conditionMessage(e)); NULL }) +} + +# Load a leapR built-in geneset by name +.load_leapr_geneset_by_name <- function(name) { + valid <- c("kinasesubstrates", "ncipid", "krbpaths", "longlist", "shortlist") + if (!(name %in% valid)) { + stop("Unknown geneset name: '", name, "'. Valid: ", paste(valid, collapse = ", ")) + } + suppressWarnings(utils::data(list = name, package = "leapR", envir = environment())) + if (!exists(name, inherits = FALSE)) { + stop("leapR dataset '", name, "' not found in the installed {leapR}.") + } + get(name, inherits = FALSE) +} + +# Decide default geneset from omic label when no override is provided +.default_geneset_for_omic <- function(omic_label) { + ol <- tolower(omic_label) + if (ol %in% c("phospho","phosphoproteomics","phosphoprotein","phosphoproteome")) { + .load_leapr_geneset_by_name("kinasesubstrates") + } else { + .load_leapr_geneset_by_name("krbpaths") + } +} + +# ----------------------------- +# Main +# ----------------------------- +run_leapr_directional_one_cached <- function(drugs, + df_long, + sample_col, + feature_col, + value_col, + omic_label, + cache_path, + write_csvs = FALSE, + always_rerun = FALSE, + min_features = 5, + test_one = FALSE, + geneset_name = NULL, + geneset_object = NULL) { + + # cache check! If the cached value exists, stop there. + if (!always_rerun && is.character(cache_path) && nzchar(cache_path) && file.exists(cache_path)) { + load(cache_path) # loads res_list + if (exists("res_list")) return(res_list) + } + + # pivot long to matrix + feat_mat <- long_to_matrix(df_long, sample_col, feature_col, value_col) + if (is.null(feat_mat) || !nrow(feat_mat) || !ncol(feat_mat)) { + warning("[run_leapr_directional_one_cached] Empty feature matrix after pivot; returning empty list.") + return(list()) + } + + # pick geneset + if (!is.null(geneset_object)) { + geneset_db <- geneset_object + } else if (!is.null(geneset_name)) { + geneset_db <- .load_leapr_geneset_by_name(geneset_name) + } else { + geneset_db <- .default_geneset_for_omic(omic_label) + } + + # optional phospho site gene mapping + map_site2gene <- NULL + is_phospho <- tolower(omic_label) %in% c("phospho","phosphoproteomics","phosphoprotein","phosphoproteome") + if (is_phospho) { + map_site2gene <- .build_phospho_gene_map_from_long(df_long, feature_col) + if (is.null(map_site2gene) || !length(map_site2gene)) { + phos_features <- colnames(feat_mat) + map_site2gene <- setNames( + vapply(phos_features, .extract_gene_from_site, FUN.VALUE = character(1)), + phos_features + ) + } + # Print statements + feats <- colnames(feat_mat) + mapped <- map_site2gene[feats] + n_mapped <- sum(!is.na(mapped) & mapped != "") + message(sprintf("[phospho mapping] %d/%d sites mapped to gene symbols (%.1f%%)", + n_mapped, length(feats), 100 * n_mapped / max(1, length(feats)))) + if (n_mapped < length(feats)) { + unm <- feats[is.na(mapped) | mapped == ""] + if (length(unm)) { + show_n <- min(5L, length(unm)) + message("[phospho mapping] Unmapped examples: ", + paste(utils::head(unm, show_n), collapse = ", "), + if (length(unm) > show_n) paste0(" ... +", length(unm) - show_n, " more") else "") + } + } + } + + # For phospho, detect when we are using site-level kinase substrates + uses_kinase_sites <- is_phospho && { + if (!is.null(geneset_name)) { + identical(geneset_name, "kinasesubstrates") + } else { + identical(geneset_db, .load_leapr_geneset_by_name("kinasesubstrates")) + } + } + + # If using kinasesubstrates, normalize column names so they match site IDs + if (uses_kinase_sites) { + old_sites <- colnames(feat_mat) + norm_sites <- .normalize_kinase_site_id(old_sites) + if (!identical(old_sites, norm_sites)) { + message("[kinasesubstrates] Normalizing phosphosite IDs (e.g. 'AAAS-S495s' -> 'AAAS-S495')") + colnames(feat_mat) <- make.unique(norm_sites) + } + } + + res_list <- list() + out_csv_dir <- file.path("leapR_top_paths", "dir_split") + if (write_csvs && !dir.exists(out_csv_dir)) dir.create(out_csv_dir, recursive = TRUE) + + all_drugs <- unique(drugs$improve_drug_id) + if (test_one && length(all_drugs) > 0) { + message("[run_leapr_directional_one_cached] test_one=TRUE then running only the first drug: ", all_drugs[[1]]) + all_drugs <- all_drugs[[1]] + } else { + all_drugs <- sort(all_drugs) + } + + total <- length(all_drugs) + for (i in seq_along(all_drugs)) { + drug <- all_drugs[[i]] + message(sprintf("[%-3d/%-3d] %s", i, total, drug)) + + # mean response per sample for uM_viability + dv <- drugs %>% + dplyr::filter(.data$improve_drug_id == !!drug, + .data$dose_response_metric == "uM_viability") %>% + dplyr::group_by(.data$improve_sample_id) %>% + dplyr::summarise(resp = mean(.data$dose_response_value, na.rm = TRUE), + .groups = "drop") + + if (!nrow(dv)) { + message(" No response rows for metric 'uM_viability'; skipping.") + next + } + dv_vec <- stats::setNames(dv$resp, dv$improve_sample_id) + + # correlations at site-level (or normalized site-level for kinasesubstrates) + cors <- .col_spearman(dv_vec, feat_mat) + pos <- cors[!is.na(cors) & cors > 0] # resistant (TOP) + neg <- cors[!is.na(cors) & cors < 0] # sensitive (BOTTOM; flip) + message(sprintf(" Features (site-level): pos=%d, neg=%d (min_features=%d)", + length(pos), length(neg), min_features)) + + if (uses_kinase_sites && i == 1) { + ks_sites <- unique(unlist(geneset_db[["matrix"]])) + ov <- intersect(names(cors), ks_sites) + message("[kinasesubstrates] Overlapping sites with geneset (first drug): ", length(ov)) + if (length(ov)) { + message(" Example overlaps: ", paste(utils::head(ov, 5), collapse = ", ")) + } + } + + # For phospho + GENE-LEVEL sets, collapse site to gene before SE + if (is_phospho && !uses_kinase_sites) { + pos <- .collapse_sites_to_genes(pos, map_site2gene, agg = "mean") + neg <- .collapse_sites_to_genes(neg, map_site2gene, agg = "mean") + message(sprintf(" Gene-level: pos=%d, neg=%d", length(pos), length(neg))) + } + + res_list[[drug]] <- list(top = NULL, bottom = NULL) + + # TOP (resistant) + if (length(pos) >= min_features) { + feats_top <- names(pos) + se_top <- .build_se_from_corvec( + cor_named_vec = pos, + features_all = feats_top, + col_label = paste0(drug, "_TOP"), + map_to_gene = if (is_phospho) NULL else NULL, # not needed when features are genes/sites + assay_label = omic_label + ) + top_res <- .safe_leapr( + geneset = geneset_db, + enrichment_method = "enrichment_in_order", + eset = se_top, + assay_name = omic_label, + primary_columns = paste0(drug, "_TOP"), + id_column = NULL + ) + res_list[[drug]]$top <- top_res + message(" TOP (resistant): ", if (is.null(top_res)) "no result" else "OK") + if (write_csvs && !is.null(top_res)) { + utils::write.csv(as.data.frame(top_res), + file = file.path(out_csv_dir, paste0(drug, "_", omic_label, "_TOP.csv")), + row.names = FALSE) + } + } else { + message(" TOP (resistant): skipped (too few positive features)") + } + + # BOTTOM (sensitive) + if (length(neg) >= min_features) { + # flip sign so strong negatives rank to top + neg_flip <- -neg + feats_bot <- names(neg_flip) + se_bot <- .build_se_from_corvec( + cor_named_vec = neg_flip, + features_all = feats_bot, + col_label = paste0(drug, "_BOTTOM"), + map_to_gene = if (is_phospho) NULL else NULL, + assay_label = omic_label + ) + bot_res <- .safe_leapr( + geneset = geneset_db, + enrichment_method = "enrichment_in_order", + eset = se_bot, + assay_name = omic_label, + primary_columns = paste0(drug, "_BOTTOM"), + id_column = NULL + ) + res_list[[drug]]$bottom <- bot_res + message(" BOTTOM(sensitive): ", if (is.null(bot_res)) "no result" else "OK") + if (write_csvs && !is.null(bot_res)) { + utils::write.csv(as.data.frame(bot_res), + file = file.path(out_csv_dir, paste0(drug, "_", omic_label, "_BOTTOM.csv")), + row.names = FALSE) + } + } else { + message(" BOTTOM(sensitive): skipped (too few negative features)") + } + + if (isTRUE(test_one)) break + } + + if (is.character(cache_path) && nzchar(cache_path)) { + save(res_list, file = cache_path) + } + res_list +} + +# ----------------------------- +# Plot and save using leapR builtin plotter +# ----------------------------- +save_leapr_plots <- function(res_list, omic_label, top_n = 15) { + if (!length(res_list)) return(invisible(NULL)) + dir.create("figs", showWarnings = FALSE) + safelabel <- function(x) gsub("[^A-Za-z0-9_.-]", "_", x) + + for (drug in names(res_list)) { + two <- res_list[[drug]] + + # TOP (resistant) + if (!is.null(two$top)) { + p_top <- leapR::plot_leapr_bar(two$top, + title = paste0(drug, " — ", omic_label, " (Resistant)"), + top_n = top_n) + if (!is.null(p_top)) { + fn <- file.path("figs", paste0("pathways_", safelabel(drug), "_", omic_label, + "_resistant_top", top_n, ".pdf")) + ggplot2::ggsave(fn, p_top, width = 7, height = 5, device = grDevices::cairo_pdf) + } + } + + # BOTTOM (Sensitive) + if (!is.null(two$bottom)) { + p_bot <- leapR::plot_leapr_bar(two$bottom, + title = paste0(drug, " — ", omic_label, " (Sensitive)"), + top_n = top_n) + if (!is.null(p_bot)) { + fn <- file.path("figs", paste0("pathways_", safelabel(drug), "_", omic_label, + "_sensitive_top", top_n, ".pdf")) + ggplot2::ggsave(fn, p_bot, width = 7, height = 5, device = grDevices::cairo_pdf) + } + } + } + invisible(NULL) +} diff --git a/cNF_helper_code.R b/cNF_helper_code.R index 0670c6c..711858a 100644 --- a/cNF_helper_code.R +++ b/cNF_helper_code.R @@ -1,3 +1,4 @@ +#cNF_helper_code.R ##standard metadata across all cNFs, including colors if possible From 6737340066e9b7dc7d095e051b92691d50f743f0 Mon Sep 17 00:00:00 2001 From: Jeremy Date: Mon, 24 Nov 2025 14:46:09 -0800 Subject: [PATCH 02/13] clean up --- 02_normalize_harmonize_proteomics.html | 841 ---------------- 02_normalize_harmonize_proteomics.rmd | 527 ---------- 03_drug_biomarkers_legacy_code.Rmd | 1265 ++++++++++++++++++++++++ cNF_Analysis_and_fig_descriptions.docx | Bin 0 -> 27471 bytes 4 files changed, 1265 insertions(+), 1368 deletions(-) delete mode 100644 02_normalize_harmonize_proteomics.html delete mode 100644 02_normalize_harmonize_proteomics.rmd create mode 100644 03_drug_biomarkers_legacy_code.Rmd create mode 100644 cNF_Analysis_and_fig_descriptions.docx diff --git a/02_normalize_harmonize_proteomics.html b/02_normalize_harmonize_proteomics.html deleted file mode 100644 index 803d592..0000000 --- a/02_normalize_harmonize_proteomics.html +++ /dev/null @@ -1,841 +0,0 @@ - - - - - - - - - - - - - - - -Reformat and process proteomics - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - -
-

Normalize phospho-proteomics

-

We now have phospho proteomics from two cohorts. Here I’m trying to -collect data from both and normalize but am clearly missing something. I -do the following:

-
    -
  1. replace all zero values with NA to avoid skewing normalization
  2. -
  3. remove any features that are absent from >50% of the samples
  4. -
  5. take the log of the data, then take a modified z score
  6. -
-

Each dataset is done individually then combined at the end. There is -a clear batch effect.

-
-

Cohort 1 phospho

-

We start with the cohort 1 phospho data here.

-
##cohort 1 phospho
-##first we read in file, and get site info
-phospho1<- read.table(syn$get('syn69963552')$path,sep='\t',fill=NA,header=T,quote='"') |>
-  subset(!is.na(`Gene.Names`)) |>
-  subset(Gene.Names!='') |>
-  mutate(lsite=tolower(Residue)) |>
-  tidyr::unite(c(Residue,Site,lsite),col='site',sep='') |>
-  tidyr::unite(c(`Gene.Names`,site),col='site',sep='-') |>
-  as.data.frame()
-
-phospho1[which(phospho1==0,arr.ind=TRUE)]<-NA
-
-pfnames1 <- data.frame(fname=colnames(phospho1)[5:ncol(phospho1)])|>
-  mutate(aliquot=sapply(fname,function(x) unlist(strsplit(x,split='_'))[8]))|>
-  mutate(aliquot=as.double(aliquot))|>
-  mutate(cohort=1)
- 
-##logtransform##median transform
-pzeros<-which(apply(phospho1[,5:ncol(phospho1)],1,function(x)
-    length(which(is.na(x)))/length(x) < 0.5))
-
-pmat1<-apply(0.01+log2(phospho1[pzeros,5:ncol(phospho1)]),2,
-             function(x) {0.6745 *(x-median(x,na.rm=T))/mad(x,na.rm=T)}) |>
-  as.data.frame() |>
-  mutate(site=phospho1$site[pzeros])
-
-##move to long form, upload
-plong1<-pmat1|>
-  tidyr::pivot_longer(1:(ncol(pmat1)-1),names_to='fname',values_to='abundance')|>
-  left_join(pfnames1) |>
-  group_by(site,fname,aliquot,cohort) |>
-  summarize(meanAbundance=mean(abundance,na.rm=T))|>
-  subset(!is.na(meanAbundance))|>
-  left_join(meta)
-
## Joining with `by = join_by(fname)`
-## `summarise()` has grouped output by 'site', 'fname', 'aliquot'. You can
-## override using the `.groups` argument.
-## Joining with `by = join_by(aliquot, cohort)`
-
readr::write_csv(plong1,file='log2normMedCenteredPhospho.csv')
-syn$store(File('log2normMedCenteredPhospho.csv',parentId='syn70078365'))
-
## File(id='syn65598472', synapseStore=True, modifiedOn='2025-10-07T16:10:10.287Z', dataFileHandleId='163828412', versionNumber=51, name='log2normMedCenteredPhospho.csv', createdBy='1418096', parentId='syn70078365', path='log2normMedCenteredPhospho.csv', _file_handle={'id': '163828412', 'etag': 'c532fccc-538b-4345-85ee-78a55c4db07b', 'createdBy': '1418096', 'createdOn': '2025-10-02T01:31:03.000Z', 'modifiedOn': '2025-10-02T01:31:03.000Z', 'concreteType': 'org.sagebionetworks.repo.model.file.S3FileHandle', 'contentType': 'text/csv', 'contentMd5': '479eaaac2d8f26d5ef94fbec8a7d4c6d', 'fileName': 'log2normMedCenteredPhospho.csv', 'storageLocationId': 1, 'contentSize': 11566930, 'status': 'AVAILABLE', 'bucketName': 'proddata.sagebase.org', 'key': '1418096/aff2a055-c32f-4a5c-bc5b-421908f76e6b/log2normMedCenteredPhospho.csv', 'previewId': '163828414', 'isPreview': False, 'externalURL': None}, concreteType='org.sagebionetworks.repo.model.FileEntity', isLatestVersion=True, cacheDir='', etag='0fc57caf-2cbf-4e7e-88a1-d4fce206142e', files=['log2normMedCenteredPhospho.csv'], modifiedBy='1418096', versionLabel='51', createdOn='2025-03-20T19:21:02.864Z')
-

The file is uploaded to synapse.

-
-
-

Cohort 2 phospho

-

Now on October 7 we can process the second batch of phospho.

-
##cohort 2 phospho
-##1 read in data
-phospho2 <- read.table(syn$get('syn69947351')$path,sep='\t',fill=NA,header=T,quote='"') |>
-  subset(!is.na(`Gene.Names`)) |>
-  subset(Gene.Names!='') |>
-  mutate(lsite=tolower(Residue)) |>
-  tidyr::unite(c(Residue,Site,lsite),col='site',sep='') |>
-  tidyr::unite(c(`Gene.Names`,site),col='site',sep='-')
-
-
-phospho2[which(phospho2==0,arr.ind=TRUE)] <- NA
-
-pfnames2 <- data.frame(fname=colnames(phospho2)[5:ncol(phospho2)]) |>
-  mutate(aliquot=sapply(fname,function(x) unlist(strsplit(x,split='_'))[9])) |>
-  mutate(aliquot=as.double(aliquot))|>
-  mutate(cohort=2)
-
-##remove missingness 
-tm <- which(apply(phospho2[,5:ncol(phospho2)],1,function(x) length(which(is.na(x)))/length(x) < 0.5))
-
-##log2 adjusted z score
-pmat2<-apply(log2(0.01+phospho2[tm,5:ncol(phospho2)]),2,
-              function(x) {0.6745 *(x-median(x,na.rm=T))/mad(x,na.rm=T)}) |>
-  as.data.frame() |>
-  mutate(site=phospho2$site[tm])
-
-
-
-plong2<-pmat2|>
-  tidyr::pivot_longer(1:(ncol(pmat2)-1),names_to='fname',values_to='abundance') |>
-  left_join(pfnames2)|>
-  group_by(site,fname,aliquot,cohort) |>
-  summarize(meanAbundance=mean(abundance,na.rm=T)) |>
-  subset(!is.na(meanAbundance))|>
-  left_join(meta)
-
## Joining with `by = join_by(fname)`
-## `summarise()` has grouped output by 'site', 'fname', 'aliquot'. You can
-## override using the `.groups` argument.
-## Joining with `by = join_by(aliquot, cohort)`
-
##save to file
-readr::write_csv(plong2,file='log2normMedCenteredPhospho_cohort2.csv')
-syn$store(File('log2normMedCenteredPhospho_cohort2.csv',parentId='syn70078365'))
-
## File(synapseStore=True, id='syn70078413', path='log2normMedCenteredPhospho_cohort2.csv', createdOn='2025-10-07T15:56:04.262Z', createdBy='1418096', files=['log2normMedCenteredPhospho_cohort2.csv'], _file_handle={'id': '163828418', 'etag': '1d896b0a-e1eb-4083-a6e0-b3f266dc59ce', 'createdBy': '1418096', 'createdOn': '2025-10-02T01:31:22.000Z', 'modifiedOn': '2025-10-02T01:31:22.000Z', 'concreteType': 'org.sagebionetworks.repo.model.file.S3FileHandle', 'contentType': 'text/csv', 'contentMd5': '84b9a91da42f0630ebbd37e0ad9b22a7', 'fileName': 'log2normMedCenteredPhospho_cohort2.csv', 'storageLocationId': 1, 'contentSize': 6535765, 'status': 'AVAILABLE', 'bucketName': 'proddata.sagebase.org', 'key': '1418096/b3ab724b-02f7-4b08-8ea0-0191c2835ee6/log2normMedCenteredPhospho_cohort2.csv', 'previewId': '163828419', 'isPreview': False, 'externalURL': None}, name='log2normMedCenteredPhospho_cohort2.csv', modifiedOn='2025-10-07T16:10:11.331Z', dataFileHandleId='163828418', etag='755cc7b3-9fae-473e-b684-9e81d4200f95', concreteType='org.sagebionetworks.repo.model.FileEntity', versionNumber=3, isLatestVersion=True, cacheDir='', versionLabel='3', parentId='syn70078365', modifiedBy='1418096')
-

Now that we have two cohorts we can try to combine without batch -correction.

-
-
-

Combined phospho

-

Combining the phoshpo data here.

-
##now we move back to long form
-plong <- rbind(plong1,plong2)
-  #pmat |>
-#  as.data.frame()|>
-#  tibble::rownames_to_column('site')|>
-#  pivot_longer(-site,names_to='fname',values_to='abundance')|>
-#  left_join(rbind(pfnames1,pfnames2))|>
-#    group_by(site,fname,aliquot,cohort) |>
-#  summarize(meanAbundance=mean(abundance,na.rm=T)) |>
-#  left_join(meta)
-
-         
-compsites <- plong|>
-#  subset(meanAbundance>(-5))|>
-  group_by(site)|>
-  summarize(spec = n_distinct(Specimen))|>
-  subset(spec==31)
-
-#plong$meanAbundance[which(!is.finite(plong$meanAbundance))]<-0
-
-ppcs<-plong|>ungroup()|>
-  dplyr::select(Specimen,meanAbundance,site)|>
-  unique()|>
-  subset(site%in%compsites$site)|>
-  #subset(!is.na(site))|>
-  #subset(!is.na(meanAbundance))|>
-  tidyr::pivot_wider(names_from='Specimen',values_from='meanAbundance',
-                     values_fn=mean,values_fill=0)|>
-  tibble::column_to_rownames('site')|>
-  t()|>
-  prcomp()
-
-pplot<-ppcs$x|>
-  as.data.frame()|>
-  dplyr::select(PC1,PC2,PC3)|>
-  tibble::rownames_to_column('Specimen')|>
-  left_join(meta)|>
-  dplyr::select(PC2,PC1,Specimen,Patient,cohort)|>
-  mutate(cohort=as.factor(cohort))|>
-  distinct()|>
-  ggplot(aes(x=PC1,y=PC2,label=Specimen,col=Patient,shape=cohort))+
-    geom_point()+
-    #ggrepel::geom_label_repel()+
-    ggtitle("Phospho samples")+
-  ggplot2::scale_color_manual(values=pcols)
-
## Joining with `by = join_by(Specimen)`
-
ph<- plong |>ungroup()|>
-  subset(site%in%compsites$site)|>
-  ggplot(aes(x=meanAbundance,fill=as.factor(cohort)))+geom_histogram()
-
-cowplot::plot_grid(ph,pplot)
-
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
-

-
ggsave('cNFPhosphoQC.png',width=10)
-
## Saving 10 x 5 in image
-
pplot
-

-
ggsave('phosphoPCA.pdf')
-
## Saving 7 x 5 in image
-

Clearly there is a strong batch effect.

-
-
-
-

Normalize golbal proteomics

-

Now we can move onto the global data

-
-

Cohort 1 global

-

Global proteomics in cohort 1 here.

-
####now process global
-#global1<-readr::read_tsv(syn$get('syn64906445')$path)
-global1 <- read.table(syn$get('syn69947355')$path,sep='\t',header=T,quote='"') |>
-  tidyr::separate_rows(Genes,sep=';')
-##logtransform, medina transform
-
-#global1[which(global1==0,arr.ind=TRUE)]<-NA
-
-gmat1<-apply(log2(global1[,5:ncol(global1)]),2,function(x) 0.6745 *(x-median(x,na.rm=T))/mad(x,na.rm=T))
-
-gmat1<-gmat1|>
-  as.data.frame()|>
-  mutate(Genes=global1$Genes)
-
-##extract aliquot info from file name
-gfnames1 <- data.frame(fname=colnames(global1)[5:ncol(global1)]) |>
-  mutate(aliquot=sapply(fname,function(x) unlist(strsplit(x,split='_'))[6])) |>
-  mutate(aliquot=as.double(aliquot))|>
-  mutate(cohort=1)
-
-glong1<-gmat1|>
-    tidyr::pivot_longer(1:(ncol(gmat1)-1),names_to='fname',values_to='abundance')|>
- left_join(gfnames1)|>
-   group_by(Genes,fname,aliquot,cohort)|>
-  summarize(meanAbundance=mean(abundance))|>
-  subset(is.finite(meanAbundance))|>
-  left_join(meta)
-
## Joining with `by = join_by(fname)`
-## `summarise()` has grouped output by 'Genes', 'fname', 'aliquot'. You can
-## override using the `.groups` argument.
-## Joining with `by = join_by(aliquot, cohort)`
-
readr::write_csv(glong1,file='log2normMedCenteredGlobal.csv')
-syn$store(File('log2normMedCenteredGlobal.csv',parentId='syn70078365'))
-
## File(files=['log2normMedCenteredGlobal.csv'], synapseStore=True, name='log2normMedCenteredGlobal.csv', _file_handle={'id': '163828754', 'etag': 'ecb596c9-4869-4413-9838-eb66cb83b76b', 'createdBy': '1418096', 'createdOn': '2025-10-02T02:09:54.000Z', 'modifiedOn': '2025-10-02T02:09:54.000Z', 'concreteType': 'org.sagebionetworks.repo.model.file.S3FileHandle', 'contentType': 'text/csv', 'contentMd5': 'd6c01c9ff6bf97322703ae1d5bbc8349', 'fileName': 'log2normMedCenteredGlobal.csv', 'storageLocationId': 1, 'contentSize': 20723175, 'status': 'AVAILABLE', 'bucketName': 'proddata.sagebase.org', 'key': '1418096/75b0981e-7e42-4c6d-b5aa-1443652f6c43/log2normMedCenteredGlobal.csv', 'previewId': '163828755', 'isPreview': False, 'externalURL': None}, etag='f8f7705a-9072-42f8-9616-31c447e8f787', createdBy='1418096', path='log2normMedCenteredGlobal.csv', createdOn='2025-03-20T19:29:54.101Z', modifiedOn='2025-10-07T16:10:14.595Z', concreteType='org.sagebionetworks.repo.model.FileEntity', versionLabel='43', isLatestVersion=True, cacheDir='', dataFileHandleId='163828754', parentId='syn70078365', id='syn65599827', versionNumber=43, modifiedBy='1418096')
-
-
-

Cohort 2 global

-

October 7 we process the second cohort.

-
global2<-read.table(syn$get('syn69947352')$path,header=T,sep='\t',quote='"')|>
-  tidyr::separate_rows(Genes,sep=';')
-
-#global2[which(global2==0,arr.ind=TRUE)]<-NA
-
-gmat2<-apply(log2(global2[,5:ncol(global2)]),2,function(x) 
-  0.6745 *(x-median(x,na.rm=T))/mad(x,na.rm=T))
-rownames(gmat2)<-global2$Genes
-
-gmat2<-gmat2|>
-  as.data.frame()|>
-  mutate(Genes=global2$Genes)
-
-gfnames2 <- data.frame(fname=colnames(global2)[5:ncol(global2)]) |>
-  mutate(aliquot=sapply(fname,function(x) unlist(strsplit(x,split='_'))[7])) |>
-  mutate(aliquot=as.double(aliquot))|>
-  mutate(cohort=2)
-
-glong2<-gmat2|>
-  tidyr::pivot_longer(1:(ncol(gmat2)-1),names_to='fname',values_to='abundance')|>
-  left_join(gfnames2)|>
-  group_by(Genes,fname,aliquot,cohort)|>
-  summarize(meanAbundance=mean(abundance))|>
-  subset(is.finite(meanAbundance))|>
-  left_join(meta)
-
## Joining with `by = join_by(fname)`
-## `summarise()` has grouped output by 'Genes', 'fname', 'aliquot'. You can
-## override using the `.groups` argument.
-## Joining with `by = join_by(aliquot, cohort)`
-
#dupes<-global|>group_by(Genes)|>summarize(numIso=n())|>
-#  subset(numIso>1)
-
-
-readr::write_csv(glong2,file='log2normMedCenteredGlobal_cohort2.csv')
-syn$store(File('log2normMedCenteredGlobal_cohort2.csv',parentId='syn70078365'))
-
## File(synapseStore=True, files=['log2normMedCenteredGlobal_cohort2.csv'], createdBy='1418096', dataFileHandleId='163828776', createdOn='2025-10-07T15:56:09.318Z', parentId='syn70078365', modifiedOn='2025-10-07T16:10:16.010Z', name='log2normMedCenteredGlobal_cohort2.csv', etag='d11f86af-5af5-49f7-abf6-2bae20de1c9b', concreteType='org.sagebionetworks.repo.model.FileEntity', versionNumber=3, isLatestVersion=True, _file_handle={'id': '163828776', 'etag': 'ea261f85-be11-4c15-b745-fe79324c9b19', 'createdBy': '1418096', 'createdOn': '2025-10-02T02:12:01.000Z', 'modifiedOn': '2025-10-02T02:12:01.000Z', 'concreteType': 'org.sagebionetworks.repo.model.file.S3FileHandle', 'contentType': 'text/csv', 'contentMd5': '4da78f673cfedc27c22cdb76a0663db8', 'fileName': 'log2normMedCenteredGlobal_cohort2.csv', 'storageLocationId': 1, 'contentSize': 13474091, 'status': 'AVAILABLE', 'bucketName': 'proddata.sagebase.org', 'key': '1418096/6c66fd73-e5d0-4f05-acc1-2aca33440948/log2normMedCenteredGlobal_cohort2.csv', 'previewId': '163828779', 'isPreview': False, 'externalURL': None}, cacheDir='', versionLabel='3', id='syn70078414', modifiedBy='1418096', path='log2normMedCenteredGlobal_cohort2.csv')
-
-
-

Global combined without batch correction

-

Now we can combine the global withot batch correction.

-
#ma<-mean(glong$abundance,na.rm=T)
-#glong$meanAbundance[which(!is.finite(glong$meanAbundance))]<-0
-glong <- rbind(glong1,glong2)|>
-  subset(Genes!="")
-      
-compsites <- glong|>
-#  subset(meanAbundance>(-5))|>
-  group_by(Genes)|>
-  summarize(spec = n_distinct(Specimen))|>
-  subset(spec==31)
-
-gpcs<-glong|>ungroup()|>
-  dplyr::select(Specimen,meanAbundance,Genes)|>
-  subset(!is.na(Genes))|>
-  subset(Genes!="")|>
-  subset(Genes%in%compsites$Genes)|>
-  subset(!is.na(meanAbundance))|>
-  tidyr::pivot_wider(names_from='Specimen',values_from='meanAbundance',values_fn=mean,values_fill=0)|>
-  tibble::column_to_rownames('Genes')|>t()|>
-  prcomp()
-
-gplot<-gpcs$x|>
-  as.data.frame()|>
-  dplyr::select(PC1,PC2)|>
-  tibble::rownames_to_column('Specimen')|>
-  left_join(meta)|>
-    dplyr::select(PC1,PC2,Specimen,Patient,cohort)|>
-    mutate(cohort=as.factor(cohort))|>
-  distinct()|>
-  ggplot(aes(x=PC1,y=PC2,label=Specimen,col=Patient,shape=cohort))+
-  geom_point()+ggrepel::geom_label_repel()+ggtitle("Global samples")+
-  scale_color_manual(values=pcols)
-
## Joining with `by = join_by(Specimen)`
-
hplot <- ggplot(glong,aes(x=meanAbundance,fill=as.factor(cohort)))+geom_histogram()
-     
-
-cowplot::plot_grid(hplot,gplot)
-
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
-

-
ggsave('cNFGlobalQC.png',width=10)
-
## Saving 10 x 5 in image
-
gplot
-

-
ggsave('globalPCA.pdf')
-
## Saving 7 x 5 in image
-
-
-
-

Evaluate batch correction

-

Now we have two separate long tables with metadata, but we would like -to combine into a single one and batch correct.We can update this with -each cohort.

-
##phospho 
-##TODO: ideally we should use the long tables and reconvert
-pmat <- merge(as.data.frame(pmat1),as.data.frame(pmat2))
-
-gmat <- merge(gmat1,gmat2)
-
-##remove duplicated sites
-dsites<- unique(pmat$site[which(duplicated(pmat$site))])
-mvals<-sapply(dsites,function(x) colSums(pmat[pmat$site==x,2:ncol(pmat)])) |>
-  t() |>
-  as.data.frame() |>
-  tibble::rownames_to_column('site')
-
-pmat <- pmat |>
-  subset(!site %in% dsites) |>
-  rbind(mvals)
-
-##now convert to matrix
-pmat <- pmat |>
-  tibble::remove_rownames() |>
-  tibble::column_to_rownames('site') |>
-  as.matrix()
-
-gmat <- gmat |>
-  subset(Genes!='')|>
-    tibble::remove_rownames() |>
-  tibble::column_to_rownames('Genes') |>
-    as.matrix()
-##sigh, batch correct?
-library(sva)
-
## Loading required package: mgcv
-
## Loading required package: nlme
-
## 
-## Attaching package: 'nlme'
-
## The following object is masked from 'package:dplyr':
-## 
-##     collapse
-
## This is mgcv 1.9-3. For overview type 'help("mgcv-package")'.
-
## Loading required package: genefilter
-
## Loading required package: BiocParallel
-
## Warning: package 'BiocParallel' was built under R version 4.4.3
-
  pmat[which(!is.finite(pmat),arr.ind=T)] <- 0.0
-  cbmat<-sva::ComBat(pmat,batch=meta$cohort,mean.only = FALSE)
-
## Found2batches
-
## Adjusting for0covariate(s) or covariate level(s)
-
## Standardizing Data across genes
-
## Fitting L/S model and finding priors
-
## Finding parametric adjustments
-
## Adjusting the Data
-
  gmat[which(!is.finite(gmat),arr.ind=T)] <- 0.0
-  cgmat <- sva::ComBat(gmat,batch=meta$cohort,mean.only = FALSE)
-
## Found2batches
-
## Adjusting for0covariate(s) or covariate level(s)
-
## Standardizing Data across genes
-
## Fitting L/S model and finding priors
-
## Finding parametric adjustments
-
## Adjusting the Data
-
 ppcs<-prcomp(t(cbmat))
- 
- pplot<-ppcs$x|>
-  as.data.frame()|>
-  dplyr::select(PC1,PC2)|>
-  tibble::rownames_to_column('fname')|>
-   left_join(rbind(pfnames1,pfnames2))|>
-  left_join(meta)|>
-    dplyr::select(PC1,PC2,Specimen,Patient,cohort)|>
-    mutate(cohort=as.factor(cohort))|>
-  distinct()|>
-  ggplot(aes(x=PC1,y=PC2,label=Specimen,col=Patient,shape=cohort))+
-  geom_point()+ggrepel::geom_label_repel()+ggtitle("Corrected phospho samples")+
-  scale_color_manual(values=pcols)
-
## Joining with `by = join_by(fname)`
-
## Joining with `by = join_by(aliquot, cohort)`
-
 pplot
-
## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider
-## increasing max.overlaps
-

-
 gpcs<-prcomp(t(cgmat))
- gplot<-gpcs$x|>
-  as.data.frame()|>
-  dplyr::select(PC1,PC2)|>
-  tibble::rownames_to_column('fname')|>
-   left_join(rbind(gfnames1,gfnames2))|>
-  left_join(meta)|>
-    dplyr::select(PC1,PC2,Specimen,Patient,cohort)|>
-    mutate(cohort=as.factor(cohort))|>
-  distinct()|>
-  ggplot(aes(x=PC1,y=PC2,label=Specimen,col=Patient,shape=cohort))+
-  geom_point()+ggrepel::geom_label_repel()+ggtitle("Corrected global samples")+
-  scale_color_manual(values=pcols)
-
## Joining with `by = join_by(fname)`
-## Joining with `by = join_by(aliquot, cohort)`
-
 gplot
-
## Warning: ggrepel: 11 unlabeled data points (too many overlaps). Consider
-## increasing max.overlaps
-

-
-
-

Upload batch-corrected data to synapse

-

Now we can reformat the batch-corrected data and upload to syanps

-
pc_long <- cbmat |>
-  as.data.frame() |>
-    tibble::rownames_to_column('site') |>
-    pivot_longer(-site,names_to = 'fname',values_to = 'correctedAbundance') |>
-  left_join(rbind(pfnames1,pfnames2)) |>
-  left_join(meta) |>
-  distinct()
-
## Joining with `by = join_by(fname)`
-## Joining with `by = join_by(aliquot, cohort)`
-
gc_long <- cgmat |>
-   as.data.frame() |>
-    tibble::rownames_to_column('Gene') |>
-    pivot_longer(-Gene,names_to = 'fname',values_to = 'correctedAbundance') |>
-  left_join(rbind(gfnames1,gfnames2)) |>
-  left_join(meta) |>
-  distinct()
-
## Joining with `by = join_by(fname)`
-## Joining with `by = join_by(aliquot, cohort)`
-
readr::write_csv(pc_long,file='batch12_correctedPhospho.csv')
-readr::write_csv(gc_long,file='batch12_correctedGlobal.csv')
-
-syn$store(File('batch12_correctedPhospho.csv',parentId='syn70078365'))
-
## File(modifiedOn='2025-10-07T16:10:24.600Z', synapseStore=True, etag='4ea27841-ef8e-4944-9a83-56d5ece5a31b', createdBy='1418096', path='batch12_correctedPhospho.csv', files=['batch12_correctedPhospho.csv'], id='syn70078415', parentId='syn70078365', concreteType='org.sagebionetworks.repo.model.FileEntity', versionNumber=3, isLatestVersion=True, cacheDir='', name='batch12_correctedPhospho.csv', versionLabel='3', createdOn='2025-10-07T15:56:24.984Z', modifiedBy='1418096', _file_handle={'id': '163984316', 'etag': '634ec130-4da8-4401-9731-511b589704b8', 'createdBy': '1418096', 'createdOn': '2025-10-07T15:56:25.000Z', 'modifiedOn': '2025-10-07T15:56:25.000Z', 'concreteType': 'org.sagebionetworks.repo.model.file.S3FileHandle', 'contentType': 'text/csv', 'contentMd5': 'df202d20b8b8bcee6b0fbf1a7a712f37', 'fileName': 'batch12_correctedPhospho.csv', 'storageLocationId': 1, 'contentSize': 17060666, 'status': 'AVAILABLE', 'bucketName': 'proddata.sagebase.org', 'key': '1418096/167bfcda-58b7-43d4-9457-4238fdae1118/batch12_correctedPhospho.csv', 'previewId': '163984318', 'isPreview': False, 'externalURL': None}, dataFileHandleId='163984316')
-
syn$store(File('batch12_correctedGlobal.csv',parentId='syn70078365'))
-
## File(synapseStore=True, createdBy='1418096', files=['batch12_correctedGlobal.csv'], name='batch12_correctedGlobal.csv', _file_handle={'id': '163984320', 'etag': 'f63e08df-542c-4642-a0f8-5314cda43a43', 'createdBy': '1418096', 'createdOn': '2025-10-07T15:56:29.000Z', 'modifiedOn': '2025-10-07T15:56:29.000Z', 'concreteType': 'org.sagebionetworks.repo.model.file.S3FileHandle', 'contentType': 'text/csv', 'contentMd5': '27e185221a058dafc833d1c2a52e74c1', 'fileName': 'batch12_correctedGlobal.csv', 'storageLocationId': 1, 'contentSize': 33970670, 'status': 'AVAILABLE', 'bucketName': 'proddata.sagebase.org', 'key': '1418096/c5da77ae-82a3-41a1-9045-d66fa87085a7/batch12_correctedGlobal.csv', 'previewId': '163984321', 'isPreview': False, 'externalURL': None}, id='syn70078416', createdOn='2025-10-07T15:56:29.301Z', concreteType='org.sagebionetworks.repo.model.FileEntity', cacheDir='', versionNumber=3, versionLabel='3', parentId='syn70078365', path='batch12_correctedGlobal.csv', modifiedBy='1418096', etag='935bbae3-5660-463b-b659-3d4a64de414b', isLatestVersion=True, modifiedOn='2025-10-07T16:10:25.266Z', dataFileHandleId='163984320')
-
- - - - -
- - - - - - - - - - - - - - - diff --git a/02_normalize_harmonize_proteomics.rmd b/02_normalize_harmonize_proteomics.rmd deleted file mode 100644 index aa2f322..0000000 --- a/02_normalize_harmonize_proteomics.rmd +++ /dev/null @@ -1,527 +0,0 @@ ---- -title: "Reformat and process proteomics" -author: "Sara Gosline" -date: "2025-09-28" -output: html_document ---- - - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -library(ggplot2) -library(dplyr) -library(tidyr) -library(stringr) -library(synapser) -library(grid) -source('cNF_helper_code.R') -``` - -# Drop sample list. These were for protocol optimization -```{r} -remove_fname_substrings <- c( - "cNF_organoid_DIA_G_02_11Feb25", - "cNF_organoid_DIA_G_05_11Feb25", - "cNF_organoid_DIA_G_06_11Feb25", - "cNF_organoid_DIA_P_02_29Jan25", - "cNF_organoid_DIA_P_05_11Feb25", - "cNF_organoid_DIA_P_06_11Feb25" -) - -is_unwanted_fname <- function(x) { - vapply(x, function(s) - any(vapply(remove_fname_substrings, function(p) grepl(p, s, fixed = TRUE), logical(1))), - logical(1) - ) -} - - -``` - - - -## Normalize phospho-proteomics - -We now have phosphoproteomics from two cohorts. Here I'm trying to collect data from both and normalize but am clearly missing something. I do the following: - -1. replace all zero values with NA to avoid skewing normalization -2. remove any features that are absent from >50% of the samples -3. take the log of the data, then take a modified z score - -Each dataset is done individually then combined at the end. There is a clear batch effect. - -### Cohort 1 phospho - -We start with the cohort 1 phospho data here. - -```{r compare to proteomics,warning=FALSE} - -##cohort 1 phospho -##first we read in file, and get site info -phospho1<- read.table(syn$get('syn69963552')$path,sep='\t',fill=NA,header=T,quote='"') |> - subset(!is.na(`Gene.Names`)) |> - subset(Gene.Names!='') |> - mutate(lsite=tolower(Residue)) |> - tidyr::unite(c(Residue,Site,lsite),col='site',sep='') |> - tidyr::unite(c(`Gene.Names`,site),col='site',sep='-') |> - as.data.frame() - -phospho1[which(phospho1==0,arr.ind=TRUE)]<-NA - -pfnames1 <- data.frame(fname=colnames(phospho1)[5:ncol(phospho1)])|> - mutate(aliquot=sapply(fname,function(x) unlist(strsplit(x,split='_'))[8]))|> - mutate(aliquot=as.double(aliquot))|> - mutate(cohort=1) |> - dplyr::filter(!is_unwanted_fname(fname)) - - - -##logtransform##median transform -pzeros<-which(apply(phospho1[,5:ncol(phospho1)],1,function(x) - length(which(is.na(x)))/length(x) < 0.5)) - -pmat1<-apply(0.01+log2(phospho1[pzeros,5:ncol(phospho1)]),2, - function(x) {0.6745 *(x-median(x,na.rm=T))/mad(x,na.rm=T)}) |> - as.data.frame() |> - mutate(site=phospho1$site[pzeros]) - -##move to long form, upload -plong1<-pmat1|> - tidyr::pivot_longer(1:(ncol(pmat1)-1),names_to='fname',values_to='abundance')|> - left_join(pfnames1) |> - group_by(site,fname,aliquot,cohort) |> - summarize(meanAbundance=mean(abundance,na.rm=T))|> - subset(!is.na(meanAbundance))|> - left_join(meta) - -readr::write_csv(plong1,file='log2normMedCenteredPhospho.csv') -syn$store(File('log2normMedCenteredPhospho.csv',parentId='syn70078365')) - -``` -The file is uploaded to synapse. - -### Cohort 2 phospho - -Now on October 7 we can process the second batch of phospho. - -```{r cohort 2 phospho} - -##cohort 2 phospho -##1 read in data -phospho2 <- read.table(syn$get('syn69947351')$path,sep='\t',fill=NA,header=T,quote='"') |> - subset(!is.na(`Gene.Names`)) |> - subset(Gene.Names != '') |> - mutate(lsite = tolower(Residue)) |> - tidyr::unite(c(Residue,Site,lsite),col = 'site',sep='') |> - tidyr::unite(c(`Gene.Names`,site),col = 'site',sep='-') - - -phospho2[which(phospho2==0,arr.ind=TRUE)] <- NA - -pfnames2 <- data.frame(fname=colnames(phospho2)[5:ncol(phospho2)]) |> - mutate(aliquot=sapply(fname,function(x) unlist(strsplit(x,split='_'))[9])) |> - mutate(aliquot=as.double(aliquot))|> - mutate(cohort=2)|> - dplyr::filter(!is_unwanted_fname(fname)) - -##remove missingness -tm <- which(apply(phospho2[,5:ncol(phospho2)],1,function(x) length(which(is.na(x)))/length(x) < 0.5)) - -##log2 adjusted z score -pmat2<-apply(log2(0.01+phospho2[tm,5:ncol(phospho2)]),2, - function(x) {0.6745 *(x-median(x,na.rm=T))/mad(x,na.rm=T)}) |> - as.data.frame() |> - mutate(site=phospho2$site[tm]) - - - -plong2<-pmat2|> - tidyr::pivot_longer(1:(ncol(pmat2)-1),names_to='fname',values_to='abundance') |> - left_join(pfnames2)|> - group_by(site,fname,aliquot,cohort) |> - summarize(meanAbundance=mean(abundance,na.rm=T)) |> - subset(!is.na(meanAbundance))|> - left_join(meta) - -##save to file -readr::write_csv(plong2,file='log2normMedCenteredPhospho_cohort2.csv') -syn$store(File('log2normMedCenteredPhospho_cohort2.csv',parentId='syn70078365')) - - -``` - -Now that we have two cohorts we can try to combine without batch correction. - -### Combined phospho -Combining the phoshpo data here. - -```{r combined phospho} - -##now we move back to long form -# plong <- rbind(plong1,plong2) - -plong1 <- plong1 |> dplyr::filter(!is_unwanted_fname(fname)) -plong2 <- plong2 |> dplyr::filter(!is_unwanted_fname(fname)) -plong <- rbind(plong1, plong2) |> dplyr::filter(!is_unwanted_fname(fname)) - - #pmat |> -# as.data.frame()|> -# tibble::rownames_to_column('site')|> -# pivot_longer(-site,names_to='fname',values_to='abundance')|> -# left_join(rbind(pfnames1,pfnames2))|> -# group_by(site,fname,aliquot,cohort) |> -# summarize(meanAbundance=mean(abundance,na.rm=T)) |> -# left_join(meta) - - -compsites <- plong|> -# subset(meanAbundance>(-5))|> - group_by(site)|> - summarize(spec = n_distinct(Specimen))|> - subset(spec==31) - -#plong$meanAbundance[which(!is.finite(plong$meanAbundance))]<-0 - -ppcs<-plong|>ungroup()|> - dplyr::select(Specimen,meanAbundance,site)|> - unique()|> - subset(site%in%compsites$site)|> - #subset(!is.na(site))|> - #subset(!is.na(meanAbundance))|> - tidyr::pivot_wider(names_from='Specimen',values_from='meanAbundance', - values_fn=mean,values_fill=0)|> - tibble::column_to_rownames('site')|> - t()|> - prcomp() - -pplot<-ppcs$x|> - as.data.frame()|> - dplyr::select(PC1,PC2,PC3)|> - tibble::rownames_to_column('Specimen')|> - left_join(meta)|> - dplyr::select(PC2,PC1,Specimen,Patient,cohort)|> - mutate(cohort=as.factor(cohort))|> - distinct()|> - ggplot(aes(x=PC1,y=PC2,label=Specimen,col=Patient,shape=cohort))+ - geom_point()+ - #ggrepel::geom_label_repel()+ - ggtitle("Phospho samples")+ - ggplot2::scale_color_manual(values=pcols) - -ph<- plong |>ungroup()|> - subset(site%in%compsites$site)|> - ggplot(aes(x=meanAbundance,fill=as.factor(cohort)))+geom_histogram() - -cowplot::plot_grid(ph,pplot) -ggsave('cNFPhosphoQC.png',width=10) - -pplot -ggsave('phosphoPCA.pdf') -``` - -Clearly there is a strong batch effect. -## Normalize golbal proteomics -Now we can move onto the global data - -### Cohort 1 global -Global proteomics in cohort 1 here. - -```{r global} -####now process global -#global1<-readr::read_tsv(syn$get('syn64906445')$path) -global1 <- read.table(syn$get('syn69947355')$path,sep='\t',header=T,quote='"') |> - tidyr::separate_rows(Genes,sep=';') -##logtransform, medina transform - -#global1[which(global1==0,arr.ind=TRUE)]<-NA - -gmat1<-apply(log2(global1[,5:ncol(global1)]),2,function(x) 0.6745 *(x-median(x,na.rm=T))/mad(x,na.rm=T)) - -gmat1<-gmat1|> - as.data.frame()|> - mutate(Genes=global1$Genes) - -##extract aliquot info from file name -gfnames1 <- data.frame(fname=colnames(global1)[5:ncol(global1)]) |> - mutate(aliquot=sapply(fname,function(x) unlist(strsplit(x,split='_'))[6])) |> - mutate(aliquot=as.double(aliquot))|> - mutate(cohort=1)|> - dplyr::filter(!is_unwanted_fname(fname)) - -glong1<-gmat1|> - tidyr::pivot_longer(1:(ncol(gmat1)-1),names_to='fname',values_to='abundance')|> - left_join(gfnames1)|> - group_by(Genes,fname,aliquot,cohort)|> - summarize(meanAbundance=mean(abundance))|> - subset(is.finite(meanAbundance))|> - left_join(meta) - - -readr::write_csv(glong1,file='log2normMedCenteredGlobal.csv') -syn$store(File('log2normMedCenteredGlobal.csv',parentId='syn70078365')) -``` - -### Cohort 2 global -October 7 we process the second cohort. -```{r batch 2 global} -global2<-read.table(syn$get('syn69947352')$path,header=T,sep='\t',quote='"')|> - tidyr::separate_rows(Genes,sep=';') - -#global2[which(global2==0,arr.ind=TRUE)]<-NA - -gmat2<-apply(log2(global2[,5:ncol(global2)]),2,function(x) - 0.6745 *(x-median(x,na.rm=T))/mad(x,na.rm=T)) -rownames(gmat2)<-global2$Genes - -gmat2<-gmat2|> - as.data.frame()|> - mutate(Genes=global2$Genes) - -gfnames2 <- data.frame(fname=colnames(global2)[5:ncol(global2)]) |> - mutate(aliquot=sapply(fname,function(x) unlist(strsplit(x,split='_'))[7])) |> - mutate(aliquot=as.double(aliquot))|> - mutate(cohort=2)|> - dplyr::filter(!is_unwanted_fname(fname)) -gfnames1 - -glong2<-gmat2|> - tidyr::pivot_longer(1:(ncol(gmat2)-1),names_to='fname',values_to='abundance')|> - left_join(gfnames2)|> - group_by(Genes,fname,aliquot,cohort)|> - summarize(meanAbundance=mean(abundance))|> - subset(is.finite(meanAbundance))|> - left_join(meta) - -#dupes<-global|>group_by(Genes)|>summarize(numIso=n())|> -# subset(numIso>1) - - -readr::write_csv(glong2,file='log2normMedCenteredGlobal_cohort2.csv') -syn$store(File('log2normMedCenteredGlobal_cohort2.csv',parentId='syn70078365')) -``` - -### Global combined without batch correction -Now we can combine the global withot batch correction. - -```{r combined global test} -#ma<-mean(glong$abundance,na.rm=T) -#glong$meanAbundance[which(!is.finite(glong$meanAbundance))]<-0 -glong1 <- glong1 |> dplyr::filter(!is_unwanted_fname(fname)) -glong2 <- glong2 |> dplyr::filter(!is_unwanted_fname(fname)) -glong <- rbind(glong1, glong2) |> dplyr::filter(!is_unwanted_fname(fname)) |> - subset(Genes!="") - -n_spec <- meta |> dplyr::distinct(Specimen) |> nrow() -compsites <- glong|> -# subset(meanAbundance>(-5))|> - group_by(Genes)|> - summarize(spec = n_distinct(Specimen))|> - subset(spec==n_spec) - -gpcs<-glong|>ungroup()|> - dplyr::select(Specimen,meanAbundance,Genes)|> - subset(!is.na(Genes))|> - subset(Genes!="")|> - subset(Genes%in%compsites$Genes)|> - subset(!is.na(meanAbundance))|> - tidyr::pivot_wider(names_from='Specimen',values_from='meanAbundance',values_fn=mean,values_fill=0)|> - tibble::column_to_rownames('Genes')|>t()|> - prcomp() - -gplot<-gpcs$x|> - as.data.frame()|> - dplyr::select(PC1,PC2)|> - tibble::rownames_to_column('Specimen')|> - left_join(meta)|> - dplyr::select(PC1,PC2,Specimen,Patient,cohort)|> - mutate(cohort=as.factor(cohort))|> - distinct()|> - ggplot(aes(x=PC1,y=PC2,label=Specimen,col=Patient,shape=cohort))+ - geom_point()+ggrepel::geom_label_repel()+ggtitle("Global samples")+ - scale_color_manual(values=pcols) - - -hplot <- ggplot(glong,aes(x=meanAbundance,fill=as.factor(cohort)))+geom_histogram() - - -cowplot::plot_grid(hplot,gplot) -ggsave('cNFGlobalQC.png',width=10) - - -gplot - -ggsave('globalPCA.pdf') - -``` - - -## Evaluate batch correction - -Now we have two separate long tables with metadata, but we would like to combine into a single one and batch correct.We can update this with each cohort. - -```{r combine and correct} - -##phospho -##TODO: ideally we should use the long tables and reconvert -pmat <- merge(as.data.frame(pmat1),as.data.frame(pmat2)) - -gmat <- merge(gmat1,gmat2) - -##remove duplicated sites -dsites<- unique(pmat$site[which(duplicated(pmat$site))]) -mvals<-sapply(dsites,function(x) colSums(pmat[pmat$site==x,2:ncol(pmat)])) |> - t() |> - as.data.frame() |> - tibble::rownames_to_column('site') - -pmat <- pmat |> - subset(!site %in% dsites) |> - rbind(mvals) - -##now convert to matrix -pmat <- pmat |> - tibble::remove_rownames() |> - tibble::column_to_rownames('site') |> - as.matrix() - -gmat <- gmat |> - subset(Genes!='')|> - tibble::remove_rownames() |> - tibble::column_to_rownames('Genes') |> - as.matrix() -##sigh, batch correct? -library(sva) -# -# pmat[which(!is.finite(pmat),arr.ind=T)] <- 0.0 -# cbmat<-sva::ComBat(pmat,batch=meta$cohort,mean.only = FALSE) -# -# gmat[which(!is.finite(gmat),arr.ind=T)] <- 0.0 -# cgmat <- sva::ComBat(gmat,batch=meta$cohort,mean.only = FALSE) - - -pmat[which(!is.finite(pmat),arr.ind=T)] <- 0.0 -## align ComBat batch vector to pmat columns via filename→cohort maps -sample_meta_p <- data.frame(fname = colnames(pmat)) |> - dplyr::left_join(rbind(pfnames1, pfnames2), by = "fname") |> - dplyr::select(fname, cohort) -keep_p <- !is.na(sample_meta_p$cohort) -pmat <- pmat[, keep_p, drop = FALSE] -pbatch <- sample_meta_p$cohort[keep_p] -cbmat <- sva::ComBat(dat = pmat, batch = pbatch, mean.only = FALSE) - -gmat[which(!is.finite(gmat),arr.ind=T)] <- 0.0 -## align ComBat batch vector to gmat columns via filename→cohort maps -sample_meta_g <- data.frame(fname = colnames(gmat)) |> - dplyr::left_join(rbind(gfnames1, gfnames2), by = "fname") |> - dplyr::select(fname, cohort) -keep_g <- !is.na(sample_meta_g$cohort) -gmat <- gmat[, keep_g, drop = FALSE] -gbatch <- sample_meta_g$cohort[keep_g] -cgmat <- sva::ComBat(dat = gmat, batch = gbatch, mean.only = FALSE) - - -ppcs<-prcomp(t(cbmat)) -gpcs<-prcomp(t(cgmat)) - -``` - -# plot batch corrected data -```{r} - -# Reusable theme: white background + light grey gridlines -my_theme <- theme_bw() + - theme( - panel.grid.major = element_line(color = "grey85"), - panel.grid.minor = element_line(color = "grey92"), - panel.background = element_rect(fill = "white", color = NA), - plot.background = element_rect(fill = "white", color = NA), - legend.title = element_text(size = 9), - legend.text = element_text(size = 7), - legend.key.size = unit(0.5, "cm"), - legend.spacing.y = unit(0.2, "cm"), - legend.box.spacing= unit(0.3, "cm") - ) - - -# --- Corrected phospho plot (white bg + grey grid) --- -pplot <- ppcs$x |> - as.data.frame() |> - dplyr::select(PC1, PC2) |> - tibble::rownames_to_column("fname") |> - left_join(rbind(pfnames1, pfnames2)) |> - left_join(meta) |> - dplyr::select(PC1, PC2, Specimen, Patient, cohort) |> - mutate( - Tumor = str_extract(Specimen, "_T\\d+") |> str_remove("_"), - Tumor = factor(Tumor), - cohort = as.factor(cohort) - ) |> - distinct() |> - dplyr::filter(!is.na(Patient), !is.na(Tumor)) |> - ggplot(aes(x = PC1, y = PC2, col = Patient, shape = Tumor)) + - geom_point(size = 3) + - ggtitle("Corrected phospho samples") + - scale_color_manual(values = pcols) + - scale_shape_discrete(na.translate = FALSE) + - my_theme - -pplot - -# --- Corrected global plot (white bg + grey grid) --- -gplot <- gpcs$x |> - as.data.frame() |> - dplyr::select(PC1, PC2) |> - tibble::rownames_to_column("fname") |> - left_join(rbind(gfnames1, gfnames2)) |> - left_join(meta) |> - dplyr::select(PC1, PC2, Specimen, Patient, cohort) |> - mutate( - Tumor = str_extract(Specimen, "_T\\d+") |> str_remove("_"), - Tumor = factor(Tumor), - cohort = as.factor(cohort) - ) |> - distinct() |> - ggplot(aes(x = PC1, y = PC2, col = Patient, shape = Tumor)) + - geom_point(size = 3) + - ggtitle("Corrected global samples") + - scale_color_manual(values = pcols) + - my_theme - -gplot - - -ggsave("phosphoCorrectedPCA.pdf", plot = pplot, width = 7, height = 4.5, units = "in", device = cairo_pdf) -ggsave("globalCorrectedPCA.pdf", plot = gplot, width = 7, height = 4.5, units = "in", device = cairo_pdf) - -``` - -## Upload batch-corrected data to synapse - -Now we can reformat the batch-corrected data and upload to syanps - -```{r upload batch corrected} - -pc_long <- cbmat |> - as.data.frame() |> - tibble::rownames_to_column('site') |> - pivot_longer(-site,names_to = 'fname',values_to = 'correctedAbundance') |> - left_join(rbind(pfnames1,pfnames2)) |> - left_join(meta) |> - distinct() - -gc_long <- cgmat |> - as.data.frame() |> - tibble::rownames_to_column('Gene') |> - pivot_longer(-Gene,names_to = 'fname',values_to = 'correctedAbundance') |> - left_join(rbind(gfnames1,gfnames2)) |> - left_join(meta) |> - distinct() - - -readr::write_csv(pc_long,file = 'batch12_correctedPhospho.csv') -readr::write_csv(gc_long,file = 'batch12_correctedGlobal.csv') - -syn$store(File('batch12_correctedPhospho.csv',parentId = 'syn70078365')) -syn$store(File('batch12_correctedGlobal.csv',parentId = 'syn70078365')) - -``` diff --git a/03_drug_biomarkers_legacy_code.Rmd b/03_drug_biomarkers_legacy_code.Rmd new file mode 100644 index 0000000..ae9d97c --- /dev/null +++ b/03_drug_biomarkers_legacy_code.Rmd @@ -0,0 +1,1265 @@ +--- +title: "Evaluate drug and omics data for biomarker assessment" +author: "Sara gosline" +date: "2025-03-13" +output: html_document +--- + +This document is designed to be a working document where we can compare approaches to evaluate biomarkers of drug response across patient samples. We are collecting three types of data modalities: 1. RNA Sequencing 2. Global Proteomics 3. Phospho proteomics + +We also have drug sensitivity data (single dose viability, some curves) for many drugs. The question to ask is which molecules can predict drug response across patients? How robust/extendable is this? + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +library(synapser) +library(ggplot2) +library(dplyr) +library(tidyr) + +``` + +# Pull processed files from previous markdowns + +We have already run the previous scripts and stored data on Synapse + +```{r pull files} +Sys.setenv() + +source("cNF_helper_code.R") +traceback() + +source('cNF_helper_code.R') +##read in drug code +fits <- readr::read_tsv(synGet('syn69947322')$path) + + +##read in proteomic data +glong <- readr::read_csv(synGet('syn70078416')$path) +plong <- readr::read_csv(synGet('syn70078415')$path) + + +##read in transcrniptomic data +#TODO: process transcritpomic data into long format + + +``` + +## Format protein data to collect correlation values + +Do simple correlations to identify putative trends in the data. + +Get most efficacious, variable, and heatmap + +```{r} +# ensure an output folder + +outdir <- "figs" +if (!dir.exists(outdir)) dir.create(outdir, recursive = TRUE) + +shared <- intersect(fits$improve_sample_id, glong$Specimen) +message(sprintf("Found %d shared samples from %d drug experiments and %d proteomic experiments", +length(shared), length(unique(fits$improve_sample_id)), length(unique(glong$Specimen)))) + +glob_dat <- glong |> +ungroup() |> +subset(Specimen %in% shared) |> +dplyr::select(Specimen, Gene, correctedAbundance) |> +tidyr::pivot_wider( +names_from = "Gene", +values_from = "correctedAbundance", +values_fill = 0, +values_fn = mean +) |> +tibble::column_to_rownames("Specimen") + +phos_dat <- plong |> +ungroup() |> +subset(Specimen %in% shared) |> +dplyr::select(Specimen, site, correctedAbundance) |> +tidyr::pivot_wider( +names_from = "site", +values_from = "correctedAbundance", +values_fill = 0, +values_fn = mean +) |> +tibble::column_to_rownames("Specimen") + +## drug data to matrix here + +drug_dat <- fits |> +subset(dose_response_metric == "uM_viability") |> +dplyr::select(improve_sample_id, improve_drug_id, dose_response_value) |> +tidyr::pivot_wider( +names_from = "improve_drug_id", +values_from = "dose_response_value", +values_fn = mean +) |> +tibble::column_to_rownames("improve_sample_id") + +## summarize drugs + +drug_counts <- fits |> +subset(dose_response_metric == "uM_viability") |> +group_by(improve_drug_id) |> +distinct() |> +summarize( +meanResponse = mean(dose_response_value, na.rm = TRUE), +nMeasured = n_distinct(improve_sample_id), +variability = sd(dose_response_value, na.rm = TRUE), +.groups = "drop" +) + +# -------- Plot 1: most efficacious -------- + +p1 <- drug_counts |> +arrange(desc(meanResponse)) |> +subset(meanResponse < 0.5) |> +ggplot(aes(y = meanResponse, x = improve_drug_id, colour = nMeasured, size = variability)) + +geom_point() + +theme_minimal() + +theme( +axis.text.x = element_text(angle = 45, hjust = 1) # tilt x labels +) + +labs(title = "Most efficacious drugs", +y = "Mean cell viability (fraction)", +x = "Drug") + +ggsave(file.path("figs/most_efficacious.pdf"), p1, width = 12, height = 8, dpi = 300) + +# -------- Plot 2: most variable -------- + +p2 <- drug_counts |> +arrange(desc(variability)) |> +subset(variability > 0.15) |> +as.data.frame() |> +ggplot(aes(y = meanResponse, x = improve_drug_id, colour = nMeasured, size = variability)) + +geom_point() + +theme_minimal() + +theme( +axis.text.x = element_text(angle = 45, hjust = 1) # tilt x labels +) + +labs(title = "Most variable drugs", +y = "Mean cell viability (fraction)", +x = "Drug") + +ggsave(file.path("figs/most_variable.pdf"), p2, width = 12, height = 8, dpi = 300) + +# -------- Plot 3: heatmap of complete-measurement drugs -------- + +fulldrugs <- drug_counts |> +subset(nMeasured == nrow(drug_dat)) + +# Save large, rotate column labels, and shrink font to avoid overlap + +# pheatmap can write directly to a file via the `filename` arg. + +pheatmap::pheatmap( +as.matrix(drug_dat[, fulldrugs$improve_drug_id, drop = FALSE]), +filename = file.path(outdir, "drug_heatmap_large.pdf"), +width = 28, # inches: large so column labels are readable +height = 16, +dpi = 300, +angle_col = 45, # rotate column (drug) labels +fontsize_col = 6, # smaller font for many drugs +cluster_rows = TRUE, +cluster_cols = TRUE, +show_rownames = TRUE, +show_colnames = TRUE +) + +# Optional: also save a PDF (great for vector zooming) + +pheatmap::pheatmap( +as.matrix(drug_dat[, fulldrugs$improve_drug_id, drop = FALSE]), +filename = file.path("figs/drug_heatmap_large.pdf"), +width = 28, +height = 16, +angle_col = 45, +fontsize_col = 6, +cluster_rows = TRUE, +cluster_cols = TRUE, +show_rownames = TRUE, +show_colnames = TRUE +) + +# ---- PRINT plots in the document as well ---- + +print(p1) +print(p2) + +# Print the heatmap to the document (no filename draws it) + +pheatmap::pheatmap( +as.matrix(drug_dat[, fulldrugs$improve_drug_id, drop = FALSE]), +angle_col = 45, +fontsize_col = 6, +cluster_rows = TRUE, +cluster_cols = TRUE, +show_rownames = TRUE, +show_colnames = TRUE +) + +``` + + +# Basic correlation tests + +Can we simply find and rank proteins/psites/transcripts by correlation and do enrichment? + +We can define a simple test to correlate features and drugs and assess significance and correct: + +```{r correlation tests, warning=FALSE, error=FALSE, message = FALSE} + +#this function computes correlations between all columns for each drug/feature matrix, rows are the sample identifiers +#also coputes significance +computeCors <- function(drug_dat,feat_dat,shared){ + + cres <- cor(drug_dat[shared,],feat_dat[shared,],use='pairwise.complete.obs',method='spearman') |> + as.data.frame() |> + tibble::rownames_to_column('drug')|> + tidyr::pivot_longer(cols=2:(1+ncol(feat_dat)),names_to='gene',values_to='cor') |> + arrange(desc(cor)) + + ##now lets try to get significance + csig <- do.call(rbind,lapply(colnames(drug_dat),function(x){ + do.call(rbind,lapply(colnames(feat_dat),function(y){ + pval <- 1.0 + try(pval <- cor.test(drug_dat[shared,x], + feat_dat[shared,y], + use = 'pairwise.complete.obs', + method = 'spearman')$p.value,silent = TRUE) + + return(c(corp = pval,drug = x,gene = y)) + })) |> + as.data.frame() |> + mutate(fdr=p.adjust(unlist(corp),method='fdr')) + })) |> + as.data.frame() |> + mutate(drug = unlist(drug)) |> + mutate(gene = unlist(gene)) + + fullcors <- cres|>left_join(data.frame(csig)) |> + mutate(direction=ifelse(cor<0,'neg','pos')) + + return(fullcors) +} + +``` + +Now that we have a function we can compute correlations of each data type. + +```{r compute feature cors, warning=FALSE, error=FALSE, message = FALSE} + +gcor <- computeCors(drug_dat[,fulldrugs$improve_drug_id],glob_dat,shared) |> + mutate(data='proteins') +pcor <- computeCors(drug_dat[,fulldrugs$improve_drug_id],phos_dat,shared) |> + mutate(data = 'phosphosites') + +allcor <- rbind(gcor,pcor) + +corsummary<-allcor |> subset(fdr<0.25) |> + group_by(drug,data,direction) |> + summarize(features=n(),meanCor=mean(cor)) + + +p_features <- corsummary |> + subset(features > 1) |> + ggplot(aes(x = drug,y = features,fill = direction)) + + facet_grid(data~.) + + geom_bar(position='dodge',stat='identity') + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) + +ggsave(filename = file.path("figs/cor_features_by_drug.pdf"), + plot = p_features, width = 12, height = 6, units = "in") + +corsummary |> + arrange(desc(features)) |> + subset(features > 100) |> + dplyr::select(drug,data,direction,features,meanCor) + + + +``` + +Now we have the correlation values. what do we do with them? +## Correlation based enrichment +Do not run this everytime - it is extremely slow, so its setup to run once and save the data. The next steps load this data. +```{r functional enrichment} +# === Direction-aware leapR enrichment: run "top" (up) and "bottom" (down) separately === +# Requires: glob_dat (samples x proteins), phos_dat (samples x phosphosites), fits (drug responses) +# Outputs: +# prot_enrich[[drug]]$top / $bottom +# phos_enrich[[drug]]$top / $bottom +# Optional CSVs in folder "leapR_top_paths/dir_split/" + +library(dplyr) +library(tidyr) +library(stringr) +library(SummarizedExperiment) +library(leapR) + +# ---- choose drugs to run (use your two, or set to a larger list) ---- +# target_drugs <- c("THZ1", "Onalespib") +target_drugs <- unique(fits$improve_drug_id) + +# ---- genesets ---- +data(msigdb); geneset_db <- msigdb +data(kinasesubstrates) + +# ---- helpers ---- +extract_gene_from_site <- function(site_id) { + if (is.na(site_id) || site_id == "") return(NA_character_) + g <- str_split(as.character(site_id), "[:_\\-\\.]")[[1]][1] + toupper(stringr::str_extract(g, "^[A-Za-z0-9]+")) +} + +# Correlate response vector vs each feature column (Spearman) +col_spearman <- function(vec, mat) { + shared <- intersect(names(vec), rownames(mat)) + if (length(shared) < 3) return(setNames(rep(NA_real_, ncol(mat)), colnames(mat))) + v <- vec[shared] + m <- as.matrix(mat[shared, , drop = FALSE]) + apply(m, 2, function(col) { + if (all(is.na(col))) return(NA_real_) + if (sd(col, na.rm = TRUE) == 0 || sd(v, na.rm = TRUE) == 0) return(NA_real_) + suppressWarnings(cor(v, col, method = "spearman", use = "pairwise.complete.obs")) + }) +} + +# Build one-column SE; column name must match primary_columns +build_se_from_corvec <- function(cor_named_vec, features_all, col_label, map_to_gene = NULL, assay_label = "proteomics") { + v <- rep(NA_real_, length(features_all)); names(v) <- features_all + common <- intersect(names(cor_named_vec), features_all) + v[common] <- cor_named_vec[common] + mat <- matrix(v, nrow = length(v), ncol = 1, dimnames = list(features_all, col_label)) + rd <- DataFrame(feature_id = features_all) + rd$hgnc_id <- if (is.null(map_to_gene)) features_all else map_to_gene[features_all] + se <- SummarizedExperiment(assays = list(values = mat), rowData = rd, colData = DataFrame(sample = col_label)) + assayNames(se) <- assay_label + se +} + +safe_leapr <- function(...) tryCatch(leapR::leapR(...), error = function(e) { message("[leapR] ", conditionMessage(e)); NULL }) + +# ---- feature and mapping vectors ---- +prot_features <- colnames(glob_dat) +phos_features <- colnames(phos_dat) +phos_to_gene <- setNames(vapply(phos_features, extract_gene_from_site, FUN.VALUE = character(1)), + phos_features) + +# ---- results containers ---- +prot_enrich <- list() +phos_enrich <- list() + +# optional: write CSVs? +write_csvs <- TRUE +outdir <- "leapR_top_paths/dir_split" +if (write_csvs && !dir.exists(outdir)) dir.create(outdir, recursive = TRUE) + +# ---- per-drug workflow ---- +for (drug in target_drugs) { + message("=== ", drug, " ===") + # build response vector (mean per sample if repeats) + dv <- fits %>% + filter(improve_drug_id == !!drug, dose_response_metric == "uM_viability") %>% + group_by(improve_sample_id) %>% summarize(resp = mean(dose_response_value, na.rm = TRUE), .groups = "drop") + if (nrow(dv) == 0) { message(" no response rows; skipping"); next } + dv_vec <- setNames(dv$resp, dv$improve_sample_id) + + # correlations + prot_cor <- col_spearman(dv_vec, glob_dat) + phos_cor <- col_spearman(dv_vec, phos_dat) + + # split by sign + prot_pos <- prot_cor[!is.na(prot_cor) & prot_cor > 0] + prot_neg <- prot_cor[!is.na(prot_cor) & prot_cor < 0] + phos_pos <- phos_cor[!is.na(phos_cor) & phos_cor > 0] + phos_neg <- phos_cor[!is.na(phos_cor) & phos_cor < 0] + + # ---- global: TOP (positives as-is), BOTTOM (negatives flipped so they rank to the top) ---- + prot_enrich[[drug]] <- list(top = NULL, bottom = NULL) + + # TOP + if (length(prot_pos) >= 5) { + se_prot_top <- build_se_from_corvec(prot_pos, prot_features, col_label = paste0(drug, "_TOP"), assay_label = "proteomics") + prot_top <- safe_leapr(geneset = geneset_db, enrichment_method = "enrichment_in_order", + eset = se_prot_top, assay_name = "proteomics", + primary_columns = paste0(drug, "_TOP")) + prot_enrich[[drug]]$top <- prot_top + if (write_csvs && !is.null(prot_top)) { + write.csv(as.data.frame(prot_top), file = file.path(outdir, paste0(drug, "_global_TOP.csv"))) + } + } else message(" PROT top: too few positive features (", length(prot_pos), ")") + + # BOTTOM (flip sign so more negative = larger positive rank) + if (length(prot_neg) >= 5) { + se_prot_bot <- build_se_from_corvec(-prot_neg, prot_features, col_label = paste0(drug, "_BOTTOM"), assay_label = "proteomics") + prot_bot <- safe_leapr(geneset = geneset_db, enrichment_method = "enrichment_in_order", + eset = se_prot_bot, assay_name = "proteomics", + primary_columns = paste0(drug, "_BOTTOM")) + prot_enrich[[drug]]$bottom <- prot_bot + if (write_csvs && !is.null(prot_bot)) { + write.csv(as.data.frame(prot_bot), file = file.path(outdir, paste0(drug, "_global_BOTTOM.csv"))) + } + } else message(" PROT bottom: too few negative features (", length(prot_neg), ")") + + # ---- PHOSPHO: TOP/BOTTOM for pathways (gene mapping via hgnc_id) ---- + phos_enrich[[drug]] <- list(top = NULL, bottom = NULL) + + # TOP + if (length(phos_pos) >= 5) { + se_phos_top <- build_se_from_corvec(phos_pos, phos_features, col_label = paste0(drug, "_TOP"), + map_to_gene = phos_to_gene, assay_label = "phosphoproteomics") + phos_top <- safe_leapr(geneset = geneset_db, enrichment_method = "enrichment_in_order", + eset = se_phos_top, assay_name = "phosphoproteomics", + primary_columns = paste0(drug, "_TOP"), id_column = "hgnc_id") + phos_enrich[[drug]]$top <- phos_top + if (write_csvs && !is.null(phos_top)) { + write.csv(as.data.frame(phos_top), file = file.path(outdir, paste0(drug, "_phospho_TOP.csv"))) + } + } else message(" PHOS top: too few positive features (", length(phos_pos), ")") + + # BOTTOM (flip) + if (length(phos_neg) >= 5) { + se_phos_bot <- build_se_from_corvec(-phos_neg, phos_features, col_label = paste0(drug, "_BOTTOM"), + map_to_gene = phos_to_gene, assay_label = "phosphoproteomics") + phos_bot <- safe_leapr(geneset = geneset_db, enrichment_method = "enrichment_in_order", + eset = se_phos_bot, assay_name = "phosphoproteomics", + primary_columns = paste0(drug, "_BOTTOM"), id_column = "hgnc_id") + phos_enrich[[drug]]$bottom <- phos_bot + if (write_csvs && !is.null(phos_bot)) { + write.csv(as.data.frame(phos_bot), file = file.path(outdir, paste0(drug, "_phospho_BOTTOM.csv"))) + } + } else message(" PHOS bottom: too few negative features (", length(phos_neg), ")") + +} + +# Save all direction-split results for later reuse +save(prot_enrich, phos_enrich, file = "leapR_enrichment_direction_split.Rdata") + +message("Finished direction-aware enrichment. Results in lists prot_enrich / phos_enrich, and CSVs (if enabled).") + + +``` + + + +For each drug, how many terms do we see active? how many kinases? +```{r functional enrichment} +# ==== Load saved enrichment & build summaries (no list-casts, no count()) ==== +library(dplyr) +library(tidyr) +library(purrr) +library(tibble) +library(ggplot2) +library(forcats) +library(stringr) +library(scales) + +# Always load the precomputed enrichment lists here +load("leapR_enrichment_direction_split.Rdata") +if (!exists("prot_enrich")) stop("prot_enrich not found in leapR_enrichment_direction_split.Rdata") +if (!exists("phos_enrich")) stop("phos_enrich not found in leapR_enrichment_direction_split.Rdata") + +alpha <- 0.05 +topN <- 15 # <<< top 15 +dirs <- c("resistant","sensitive") + +# ---------- helpers ---------- +pick_pcol <- function(df) { + cols <- colnames(df) + if ("BH_pvalue" %in% cols) return(list(kind="adj", col="BH_pvalue")) + if ("SignedBH_pvalue" %in% cols) return(list(kind="adj", col="SignedBH_pvalue")) + if ("adj.P.Val" %in% cols) return(list(kind="adj", col="adj.P.Val")) + if ("padj" %in% cols) return(list(kind="adj", col="padj")) + if ("pvalue" %in% cols) return(list(kind="raw", col="pvalue")) + if ("P.Value" %in% cols) return(list(kind="raw", col="P.Value")) + NULL +} + +extract_term_col <- function(df) { + cands <- c("term","Term","pathway","Pathway","set","Set","geneset","gene_set","Category") + hit <- cands[cands %in% names(df)] + if (length(hit)) hit[[1]] else NULL +} + +tidy_one_result <- function(x) { + if (is.null(x)) return(tibble(pathway = character(), adj_p = numeric())) + df <- as.data.frame(x) + if (!nrow(df)) return(tibble(pathway = character(), adj_p = numeric())) + + term_col <- extract_term_col(df) + if (is.null(term_col)) { + df <- tibble::rownames_to_column(df, "pathway") + } else { + df <- dplyr::mutate(df, pathway = .data[[term_col]]) + } + df$pathway <- as.character(df$pathway) + + pk <- pick_pcol(df) + if (is.null(pk)) return(tibble(pathway = character(), adj_p = numeric())) + adj <- if (pk$kind == "adj") df[[pk$col]] else p.adjust(df[[pk$col]], method = "BH") + + tibble(pathway = df$pathway, adj_p = as.numeric(adj)) |> + filter(is.finite(adj_p), !is.na(adj_p)) +} + +flatten_by_direction <- function(lst, omic_label) { + if (!length(lst)) return(tibble()) + purrr::imap_dfr(lst, function(two, drug) { + bind_rows( + tidy_one_result(two$top) |> mutate(direction = "resistant"), + tidy_one_result(two$bottom) |> mutate(direction = "sensitive") + ) |> + mutate(drug = as.character(drug), omic = omic_label) + }) +} + +# ---------- long-format enrichment and significance filter ---------- +prot_long <- flatten_by_direction(prot_enrich, "global") +phos_long <- flatten_by_direction(phos_enrich, "phospho") +enrich_long <- bind_rows(prot_long, phos_long) |> as_tibble() +stopifnot(all(c("pathway","adj_p","direction","drug","omic") %in% names(enrich_long))) + +enrich_sig <- enrich_long |> + filter(is.finite(adj_p), !is.na(adj_p), adj_p < alpha) + +if (nrow(enrich_sig) == 0) { + message("No significant pathways at FDR < ", alpha, ".") + pathway_summary <- tibble() + drug_counts <- tibble() +} else { + pathway_summary <- enrich_sig |> + group_by(omic, direction, pathway) |> + summarise(n_drugs = n_distinct(drug), .groups = "drop") |> + arrange(desc(n_drugs)) + + all_drugs <- sort(unique(enrich_long$drug)) + drug_counts <- enrich_sig |> + group_by(drug, direction) |> + summarise(n_pathways = n_distinct(pathway), .groups = "drop") |> + complete(drug = all_drugs, direction = dirs, fill = list(n_pathways = 0L)) |> + arrange(drug, direction) + + # ---------- summary figures ---------- + dir.create("figs", showWarnings = FALSE) + + reorder_within <- function(x, by, within, sep = "___") { + x2 <- paste(x, within, sep = sep); stats::reorder(x2, by) + } + scale_y_reordered_wrap <- function(width = 32, sep = "___") { + ggplot2::scale_y_discrete( + labels = function(x) stringr::str_wrap(gsub(paste0(sep, ".*$"), "", x), width = width) + ) + } + + pathway_summary_top <- pathway_summary |> + group_by(omic, direction) |> + slice_max(order_by = n_drugs, n = topN, with_ties = FALSE) |> + ungroup() |> + mutate(pathway_in_omic = reorder_within(pathway, n_drugs, omic)) + + p_pathways <- ggplot(pathway_summary_top, + aes(y = pathway_in_omic, x = n_drugs, fill = direction)) + + geom_col(position = position_dodge(width = 0.85), width = 0.85) + + facet_wrap(~ omic, scales = "free_y") + + scale_y_reordered_wrap(width = 36) + + scale_x_continuous(expand = expansion(mult = c(0, 0.05))) + + labs(title = paste0("Top ", topN, " pathways enriched across drugs"), + y = "Pathway", x = "# Drugs") + + theme_minimal(base_size = 11) + + theme( + legend.position = "top", + strip.text = element_text(face = "bold"), + axis.text.y = element_text(size = 7), + panel.grid.major.x = element_blank(), + plot.margin = margin(5.5, 30, 5.5, 5.5) + ) + + coord_cartesian(clip = "off") + + # --- PDF saves (summary figs) --- + ggsave("figs/pathways_across_drugs_top15.pdf", p_pathways, + width = 12, height = 10, units = "in", device = cairo_pdf) + print(p_pathways) + + drug_counts_full <- drug_counts |> + group_by(drug) |> + mutate(total = sum(n_pathways)) |> + ungroup() |> + mutate(drug = forcats::fct_reorder(drug, total)) + + p_counts <- ggplot(drug_counts_full, aes(x = drug, y = n_pathways, fill = direction)) + + geom_col(position = position_dodge(width = 0.9), width = 0.85) + + labs(title = "Number of enriched pathways per drug", + x = "Drug", y = "# Pathways") + + theme_minimal(base_size = 11) + + theme( + legend.position = "top", + axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 6), + panel.grid.major.x = element_blank() + ) + + ggsave("figs/enriched_pathways_per_drug_wide.pdf", p_counts, + width = 18, height = 7, units = "in", device = cairo_pdf) + print(p_counts) +} + +# ---------- Top-2 most efficacious & most variable drugs ---------- +top2_efficacious <- character(0) +top2_variable <- character(0) + +if (exists("fits")) { + eff_tbl <- fits |> + filter(dose_response_metric == "uM_viability") |> + group_by(improve_drug_id) |> + summarise( + meanResponse = mean(dose_response_value, na.rm = TRUE), + variability = sd(dose_response_value, na.rm = TRUE), + nMeasured = dplyr::n_distinct(improve_sample_id), + .groups = "drop" + ) + top2_efficacious <- eff_tbl |> + arrange(meanResponse, desc(nMeasured)) |> + slice_head(n = 2) |> + pull(improve_drug_id) + top2_variable <- eff_tbl |> + arrange(desc(variability), desc(nMeasured)) |> + slice_head(n = 2) |> + pull(improve_drug_id) +} else { + some <- unique(enrich_long$drug) + top2_efficacious <- head(some, 2) + top2_variable <- head(rev(some), 2) +} + +# ---- Force-include Onalespib in top_interest (case-insensitive) ---- +ona_matches <- unique(enrich_long$drug[grepl("^onalespib$", enrich_long$drug, ignore.case = TRUE)]) +if (length(ona_matches) == 0) ona_matches <- "Onalespib" + +top_interest <- unique(c(top2_efficacious, top2_variable, ona_matches)) +message("Top-2 most efficacious (lowest mean viability): ", paste(top2_efficacious, collapse = ", ")) +message("Top-2 most variable (highest SD): ", paste(top2_variable, collapse = ", ")) +message("Force-included: ", paste(ona_matches, collapse = ", ")) + +# ---------- per-drug pathway barplots: always top 15, star-annotate significance ---------- +pick_pcol_plot <- function(df) { + cols <- colnames(df) + if ("BH_pvalue" %in% cols) return("BH_pvalue") + if ("SignedBH_pvalue" %in% cols) return("SignedBH_pvalue") + if ("adj.P.Val" %in% cols) return("adj.P.Val") + if ("padj" %in% cols) return("padj") + if ("pvalue" %in% cols) return("pvalue") + if ("P.Value" %in% cols) return("P.Value") + NA_character_ +} + +sig_stars <- function(p) dplyr::case_when( + is.na(p) ~ "", + p < 0.001 ~ "***", + p < 0.01 ~ "**", + p < 0.05 ~ "*", + TRUE ~ "" +) + +prep_plot_df <- function(res_df, n = 15) { + if (is.null(res_df)) return(NULL) + df <- as.data.frame(res_df); if (!nrow(df)) return(NULL) + if (!("feature" %in% names(df))) df <- tibble::rownames_to_column(df, "feature") + col <- pick_pcol_plot(df); if (is.na(col)) return(NULL) + + # Always compute BH adj p; then take top 15 by smallest adj_p (no significance filter) + adj_p <- if (col %in% c("pvalue","P.Value")) p.adjust(df[[col]], method = "BH") else df[[col]] + + df |> + mutate( + adj_p = as.numeric(adj_p), + score = -log10(pmax(adj_p, 1e-300)), + stars = sig_stars(adj_p), + signif = !is.na(adj_p) & adj_p < 0.05, + feature = stringr::str_wrap(as.character(feature), width = 40) + ) |> + filter(is.finite(adj_p)) |> + arrange(adj_p, desc(score)) |> + slice_head(n = n) +} + +plot_bar <- function(df_plot, title_text, label_type) { + if (is.null(df_plot) || !nrow(df_plot)) return(NULL) + lbl <- ifelse(label_type == "top", "Resistant pathways", "Sensitive pathways") + fillc <- ifelse(label_type == "top", "#D7191C", "#2C7BB6") + star_pad <- 0.15 + + ggplot(df_plot, aes(x = reorder(feature, score), y = score)) + + geom_col(aes(alpha = signif), fill = fillc, width = 0.85) + + scale_alpha_manual(values = c(`FALSE` = 0.5, `TRUE` = 1), guide = "none") + + geom_text(aes(y = score + star_pad, label = stars), + size = 3, hjust = 0) + + coord_flip(clip = "off") + + scale_y_continuous(expand = expansion(mult = c(0, 0.15))) + + labs(title = paste0(title_text, ", ", lbl), + x = NULL, y = expression(-log[10]("FDR"))) + + theme_minimal(base_size = 9) + + theme( + plot.title = element_text(size = 11), + axis.text.y = element_text(size = 7), + axis.text.x = element_text(size = 8), + plot.margin = margin(5.5, 30, 5.5, 5.5) + ) +} + +safelabel <- function(x) gsub("[^A-Za-z0-9_.-]", "_", x) + +dir.create("figs", showWarnings = FALSE) +for (drug in top_interest) { + pt <- if (drug %in% names(prot_enrich)) prot_enrich[[drug]]$top else NULL + pb <- if (drug %in% names(prot_enrich)) prot_enrich[[drug]]$bottom else NULL + ft <- if (drug %in% names(phos_enrich)) phos_enrich[[drug]]$top else NULL + fb <- if (drug %in% names(phos_enrich)) phos_enrich[[drug]]$bottom else NULL + + p1 <- plot_bar(prep_plot_df(pt, n = 15), paste0(drug, " — global"), "top") + p2 <- plot_bar(prep_plot_df(pb, n = 15), paste0(drug, " — global"), "bottom") + p3 <- plot_bar(prep_plot_df(ft, n = 15), paste0(drug, " — Phospho"), "top") + p4 <- plot_bar(prep_plot_df(fb, n = 15), paste0(drug, " — Phospho"), "bottom") + + if (!is.null(p1)) { + ggsave(file.path("figs", paste0("pathways_", safelabel(drug), "_global_resistant_top15.pdf")), + p1, width = 7, height = 5, device = cairo_pdf); print(p1) + } + if (!is.null(p2)) { + ggsave(file.path("figs", paste0("pathways_", safelabel(drug), "_global_sensitive_top15.pdf")), + p2, width = 7, height = 5, device = cairo_pdf); print(p2) + } + if (!is.null(p3)) { + ggsave(file.path("figs", paste0("pathways_", safelabel(drug), "_phospho_resistant_top15.pdf")), + p3, width = 7, height = 5, device = cairo_pdf); print(p3) + } + if (!is.null(p4)) { + ggsave(file.path("figs", paste0("pathways_", safelabel(drug), "_phospho_sensitive_top15.pdf")), + p4, width = 7, height = 5, device = cairo_pdf); print(p4) + } +} + + + +``` + + + + +# Print siginficant results for all drugs if we want. +```{r} +library(dplyr) +library(ggplot2) +library(tibble) +library(rlang) + +alpha <- 0.05 # significance threshold +top_n_to_show <- 15 + +# Prefer adjusted p if available; fall back to raw and adjust per-run +pick_pcol <- function(df) { + cols <- colnames(df) + if ("BH_pvalue" %in% cols) return(list(kind="adj", col="BH_pvalue")) + if ("SignedBH_pvalue" %in% cols) return(list(kind="adj", col="SignedBH_pvalue")) + if ("pvalue" %in% cols) return(list(kind="raw", col="pvalue")) + return(NULL) +} + +prep_plot_df <- function(res_df, n = top_n_to_show) { + if (is.null(res_df)) return(NULL) + df <- as.data.frame(res_df) + if (!nrow(df)) return(NULL) + + pick <- pick_pcol(df) + if (is.null(pick)) return(NULL) + + # unify to adj p + if (pick$kind == "adj") { + df <- df %>% mutate(adj_p = !!sym(pick$col)) + } else { # raw p → adjust within this run + df <- df %>% mutate(adj_p = p.adjust(!!sym(pick$col), method = "BH")) + } + + df %>% + rownames_to_column("feature") %>% + arrange(adj_p) %>% + # keep only significant ones; if none, return empty (caller will message) + filter(is.finite(adj_p), !is.na(adj_p), adj_p < alpha) %>% + head(n) %>% + mutate(score = -log10(pmax(adj_p, 1e-300))) +} + +plot_bar <- function(df_plot, title_text, label_type) { + if (is.null(df_plot) || !nrow(df_plot)) { + message(" No significant pathways for ", title_text, " (FDR<", alpha, ").") + return(NULL) + } + # Correct labels & colors for uM_viability convention: + # TOP -> resistant (red), BOTTOM -> sensitive (blue) + lbl <- ifelse(label_type == "top", "Resistant pathways", "Sensitive pathways") + fillc <- ifelse(label_type == "top", "#D7191C", "#2C7BB6") + + p <- ggplot(df_plot, aes(x = reorder(feature, score), y = score)) + + geom_col(fill = fillc) + + coord_flip() + + labs(title = paste0(title_text, ", ", lbl), + x = NULL, y = expression(-log[10]("FDR"))) + + theme_minimal(base_size = 9) + + theme( + plot.title = element_text(size = 11), + axis.text.y = element_text(size = 6), + axis.text.x = element_text(size = 8) + ) + print(p); invisible(p) +} + +plot_drug_panels <- function(drug) { + message("\n=== ", drug, " ===") + + # global + pt <- if (drug %in% names(prot_enrich)) prot_enrich[[drug]]$top else NULL + pb <- if (drug %in% names(prot_enrich)) prot_enrich[[drug]]$bottom else NULL + plot_bar(prep_plot_df(pt), paste0(drug, ", Global Proteomics"), "top") + plot_bar(prep_plot_df(pb), paste0(drug, ", Global Proteomics"), "bottom") + + # Phospho pathways + ft <- if (drug %in% names(phos_enrich)) phos_enrich[[drug]]$top else NULL + fb <- if (drug %in% names(phos_enrich)) phos_enrich[[drug]]$bottom else NULL + plot_bar(prep_plot_df(ft), paste0(drug, ", Phosphoproteomics"), "top") + plot_bar(prep_plot_df(fb), paste0(drug, ", Phosphoproteomics"), "bottom") +} + + +# --- run for your drugs --- +for (d in target_drugs) plot_drug_panels(d) + +``` + + + + +# Basic drug list +```{r} +drug_counts <- fits %>% + filter(dose_response_metric == "uM_viability") %>% + group_by(improve_drug_id) %>% + summarise( + n_rows = dplyr::n(), # total rows/measurements + n_specimens = n_distinct(improve_sample_id), # unique samples tested + meanResponse = mean(dose_response_value, na.rm = TRUE), + sdResponse = sd(dose_response_value, na.rm = TRUE), + .groups = "drop" + ) %>% + arrange(desc(n_specimens), improve_drug_id) + +# Plain list of drugs + total count +drug_list <- sort(unique(drug_counts$improve_drug_id)) +n_drugs <- length(drug_list) + +message(sprintf("Total unique drugs: %d", n_drugs)) +print((drug_list)) + +``` + + + + + + + + + + + + + + + + + + + + + +## Visualization +How should we visualize? Here is some older code +```{r plot cors, eval=FALSE} + +plotCors <- function(features,druglist,dataType='proteins'){ + ##subset a list of features and drugs and plot those in a graph + require(ggplot2) + if(dataType=='proteins'){ + ptab<-glong|>dplyr::rename(feature='Gene') + }else{ + ptab<-plong|>dplyr::rename(feature='site') + } + dtab<-fits|> + subset(dose_response_metric=='uM_viability')|> + dplyr::rename(Specimen='improve_sample_id',Drug='improve_drug_id')|> + subset(Drug%in%druglist) + + + ftab<-features|>left_join(ptab)|>left_join(dtab)|> + subset(!is.na(Drug)) + + feats <- unique(features$feature) + plots <- lapply(feats,function(x){ + corval <- ftab[ftab$feature==x,'cor'] + #corval <- ftab[ftab$feature==x,'pCor'] + + ftab|>subset(feature==x)|> + ggplot(aes(x=correctedAbundance,y=dose_response_value, + col=Patient,size=1))+ + geom_point()+ + facet_grid(~Drug)+ + ggtitle(paste(x,'Drug correlation'))+ + scale_color_manual(values=pcols) + }) + cowplot::plot_grid(plotlist= plots,ncol=2) + +} + +druglist<-c('Onalespib') +features<-subset(allcor,drug%in%druglist)|> + subset(fdr<0.25)|> + subset(abs(cor)>0.7)|> + subset(data=='proteins')|> + arrange(desc(abs(cor))) + +plotCors(rename(features[1:10,],feature='gene'),druglist) + +ggsave('onalespibFDR0.25Cors.pdf',height=20) + +``` + +# Random forest predictor + +Here we try to use random forest to extract predictive features. First we need to assess if the model can accurately predict drug response from the data. From those predictive models, we can extract features/biomarkers. + +First we build the data frames needed - I've included cohort as a covariate but may remove it. + +```{r random forest} +## separate out cohorts for prediction +cohorts <- meta |> + select(Specimen,cohort) |> + distinct() |> + tibble::column_to_rownames('Specimen') + +##for each drug, build model of cohort + protein ~ drug response +#removed cohort for now +gdf <- as.data.frame(glob_dat)#|>mutate(cohort=cohorts[rownames(glob_dat),'cohort']) +gnas <- which(apply(gdf,2,function(x) any(is.na(x)))) + +pdf <- as.data.frame(phos_dat)#|> mutate(cohort=cohorts[rownames(phos_dat),'cohort']) +pnas <- which(apply(pdf,2,function(x) any(is.na(x)))) + +mdf <- meta[-c(2,5),]|> ##have duplication here + tibble::column_to_rownames('Specimen') + + +``` + +Now we can loop through every drug, build model, and assess accuracy. + +```{r evaluate predictivty} + +#trying ou tthis function tos ee how it goes +rfFeatures <- function(drug_dat,fdf, mdf){ + complete_drugs <- which(apply(drug_dat,2, + function(x) length(which(!is.na(x)))==length(x))) + print(paste("Evaluating random forest for ",length(complete_drugs),'drugs')) + all_preds <- do.call(rbind,lapply(names(complete_drugs),function(drug){ + + dg <- fdf#[,-gnas] + + ##create the metadata df with the drug of interest + dmdf <- mdf |> + mutate(drug=drug_dat[rownames(mdf),drug]) + + rf <- randomForest::randomForest(x=dg, + y=drug_dat[rownames(dg),drug], + importance=TRUE,ntree=500) + + im <- randomForest::importance(rf)|> + as.data.frame() |> + mutate(drug=drug) + pord <- intersect(rownames(mdf)[order(drug_dat[rownames(mdf),drug])],rownames(glob_dat)) + + #pheatmap::pheatmap(t(glob_dat[pord, + # rownames(im)]),annotation_col=dmdf, + # cellheight=10,cluster_cols = TRUE) + return(im) + ##what do we return?x + })) + return(all_preds) +} + + +``` + +Now what do we do with the importance features? + +```{r rf processing} + +##get importance for global +gimp <- rfFeatures(drug_dat=drug_dat,fdf=gdf,mdf=mdf) + +##get importance for phospho +pimp <- rfFeatures(drug_dat=drug_dat,fdf=pdf,mdf=mdf) + +``` + + + + + + + + + + + + + + + + + +# Old correlation code, dont run + +now we can visualize correlations + +```{r check out HSP90s, eval=FALSE} + +hsps<-unique(glong$Genes[grep('^HSP',glong$Genes)]) + +cor_hsps<-subset(allcor,gene%in%hsps)|>subset(drug=='Onalespib')|>subset(corp<0.2) +#print(paste('measured',length(hsps),'HSPs in global data of which', nrow(cor_hsps),' are correlated with Onalespib')) + +plotCors(rename(cor_hsps,feature='gene'),c('Onalespib')) + +ggsave('hspCorsOna.pdf',height=nrow(cor_hsps)*3) + +hspp<-unique(plong$site[grep('^HSP',plong$site)]) +cor_hspps<-subset(allcor,gene%in%hspp)|>subset(drug=='Onalespib')|>subset(corp<0.2) +#print(paste('measured',length(hspp),'HSPs in phospho data of which',nrow(cor_hspps),' are correlated with Onalespib')) + +plotCors(rename(cor_hspps,feature='gene'),c('Onalespib'),'phospho') + +ggsave('hspPhosphoCorsOna.pdf',height=nrow(cor_hspps)*3) + + +``` + +### Now lets look only at IC50 values + +There are a few drugs for which we have IC50 values + +```{r check ic50 cors,warning=FALSE,error=FALSE, eval=FALSE} + +ifits<-subset(fits,dose_response_metric=='fit_ic50') + +shared<-intersect(ifits$improve_sample_id,glong$Specimen) +print(paste('Found',length(shared),'shared samples')) + +## a full join might be a challenge, maybe just take two matrices +drug_dat <- ifits|> + dplyr::select(improve_sample_id,improve_drug_id,dose_response_value)|> + tidyr::pivot_wider(names_from='improve_drug_id',values_from='dose_response_value',values_fn=mean)|> + tibble::column_to_rownames('improve_sample_id') + +gres<-cor(drug_dat[shared,],glob_dat[shared,],use='pairwise.complete.obs',method='pearson')|> + as.data.frame()|> + tibble::rownames_to_column('drug')|> + tidyr::pivot_longer(cols=2:(1+ncol(glob_dat)),names_to='gene',values_to='pCor')|> + arrange(desc(pCor)) + +##now lets try to get significance + +gsig<-do.call(rbind,lapply(colnames(drug_dat),function(x){ + do.call(rbind,lapply(colnames(glob_dat),function(y){ + pval<-1.0 + try(pval<-cor.test(drug_dat[shared,x],glob_dat[shared,y],use='pairwise.complete.obs',method='pearson')$p.value,silent=TRUE) + return(c(corp=pval,drug=x,gene=y)) + } + ))|>as.data.frame()|> + mutate(fdr=p.adjust(unlist(corp),method='fdr')) + }))|> + as.data.frame()|> + mutate(drug=unlist(drug))|> + mutate(gene=unlist(gene)) + +fullcors<-gres|>left_join(data.frame(gsig))|>mutate(data='global') + +pres<-cor(drug_dat[shared,],phos_dat[shared,],use='pairwise.complete.obs',method='pearson')|> + as.data.frame()|> + tibble::rownames_to_column('drug')|> + tidyr::pivot_longer(cols=2:(1+ncol(phos_dat)),names_to='site',values_to='pCor')|> + arrange(desc(pCor)) + +##now lets look at correlations + +psig<-do.call(rbind,lapply(colnames(drug_dat),function(x){ + do.call(rbind,lapply(colnames(phos_dat),function(y){ + pval<-1.0 + try(pval<-cor.test(drug_dat[shared,x],phos_dat[shared,y],use='pairwise.complete.obs',method='pearson')$p.value,silent=TRUE) + return(c(corp=pval,drug=x,gene=y)) + } + ))|>as.data.frame()|> + mutate(fdr=p.adjust(unlist(corp),method='fdr')) + }))|>as.data.frame() + +fullpcors<-pres|>rename(gene='site')|>left_join(data.frame(psig))|>mutate(data='phospho') + +#combine all correlations +allcor<-rbind(fullcors,fullpcors)|> + mutate(direction=ifelse(pCor<0,'neg','pos')) + + +##lets count the correlations and plot + +corsummary<-allcor|>subset(fdr<0.1)|> + group_by(drug,data,direction)|> + summarize(features=n(),meanCor=mean(pCor)) + +corsummary|> + #subset(features>1)|> + ggplot(aes(x=data,y=features,fill=drug))+ + facet_grid(~direction)+ + geom_bar(position='dodge',stat='identity') + + + +``` + +Again we have onalespib with numerous significantly correlated proteins, and one phosphosite for digoxin showing up . + +```{r plot individual sites, eval=FALSE} + +druglist<-c('Onalespib') +features<-subset(allcor,drug%in%druglist)|> + subset(fdr<0.05)|> + subset(data=='global') + +plotCors(rename(features,feature='gene'),druglist) + + +druglist<-c('Digoxin') +features<-subset(allcor,drug%in%druglist)|> + subset(fdr<0.1)|> + subset(data=='phospho') + +#plotCors(rename(features,feature='gene'),druglist,data='phospho') + + + +``` + +Now we can check the HSP proteins directly + +```{r HSP correlation, eval=FALSE} +hsps<-unique(glong$Genes[grep('^HSP',glong$Genes)]) + +cor_hsps<-subset(allcor,gene%in%hsps)|>subset(drug=='Onalespib')|>subset(corp<0.05) + +print(paste('measured',length(hsps),'HSPs in global data of which', nrow(cor_hsps),' are correlated with Onalespib')) + +plotCors(rename(cor_hsps,feature='gene'),'Onalespib') + +hspp<-unique(plong$site[grep('^HSP',plong$site)]) +cor_hspps<-subset(allcor,gene%in%hspp)|>subset(drug=='Onalespib')|>subset(corp<0.1) + +print(paste('measured',length(hspp),'HSPs in phospho data of which',nrow(cor_hspps),' are correlated with Onalespib')) +``` + +The IC50 result is similar to the viability. Now we can check AUC + +```{r auc hsp check, error=FALSE, warning=FALSE, eval=FALSE} + +ifits<-subset(fits,dose_response_metric=='auc') + +shared<-intersect(ifits$improve_sample_id,glong$Specimen) +print(paste('Found',length(shared),'shared samples')) + +## a full join might be a challenge, maybe just take two matrices +drug_dat <- ifits|> + dplyr::select(improve_sample_id,improve_drug_id,dose_response_value)|> + tidyr::pivot_wider(names_from='improve_drug_id',values_from='dose_response_value',values_fn=mean)|> + tibble::column_to_rownames('improve_sample_id') + +gres<-cor(drug_dat[shared,],glob_dat[shared,],use='pairwise.complete.obs',method='pearson')|> + as.data.frame()|> + tibble::rownames_to_column('drug')|> + tidyr::pivot_longer(cols=2:(1+ncol(glob_dat)),names_to='gene',values_to='pCor')|> + arrange(desc(pCor)) + +##now lets try to get significance + +gsig<-do.call(rbind,lapply(colnames(drug_dat),function(x){ + do.call(rbind,lapply(colnames(glob_dat),function(y){ + pval<-1.0 + try(pval<-cor.test(drug_dat[shared,x],glob_dat[shared,y],use='pairwise.complete.obs',method='pearson')$p.value,silent=TRUE) + return(c(corp=pval,drug=x,gene=y)) + } + ))|>as.data.frame()|> + mutate(bh_p=p.adjust(unlist(corp),method='BH')) + }))|> + as.data.frame()|> + mutate(drug=unlist(drug))|> + mutate(gene=unlist(gene)) + +fullcors<-gres|>left_join(data.frame(gsig))|>mutate(data='global') + +pres<-cor(drug_dat[shared,],phos_dat[shared,],use='pairwise.complete.obs',method='pearson')|> + as.data.frame()|> + tibble::rownames_to_column('drug')|> + tidyr::pivot_longer(cols=2:(1+ncol(phos_dat)),names_to='site',values_to='pCor')|> + arrange(desc(pCor)) + +##now lets look at correlations + +psig<-do.call(rbind,lapply(colnames(drug_dat),function(x){ + do.call(rbind,lapply(colnames(phos_dat),function(y){ + pval<-1.0 + try(pval<-cor.test(drug_dat[shared,x],phos_dat[shared,y],use='pairwise.complete.obs',method='pearson')$p.value,silent=TRUE) + return(c(corp=pval,drug=x,gene=y)) + } + ))|>as.data.frame()|> + mutate(bh_p=p.adjust(unlist(corp),method='BH')) + }))|>as.data.frame() + +fullpcors<-pres|>rename(gene='site')|>left_join(data.frame(psig))|>mutate(data='phospho') + +#combine all correlations +allcor<-rbind(fullcors,fullpcors)|> + mutate(direction=ifelse(pCor<0,'neg','pos')) + + +##lets count the correlations and plot + +corsummary<-allcor|>subset(bh_p<0.1)|> + group_by(drug,data,direction)|> + summarize(features=n(),meanCor=mean(pCor)) + +corsummary|> + #subset(features>1)|> + ggplot(aes(x=data,y=features,fill=drug))+facet_grid(~direction)+geom_bar(position='dodge',stat='identity') + +print(corsummary) + +``` + +```{r HSP correlation again, eval=FALSE} +hsps<-unique(glong$Genes[grep('^HSP',glong$Genes)]) + +cor_hsps<-subset(allcor,gene%in%hsps)|>s +ubset(drug=='Onalespib')|>subset(corp<0.05) +print(paste('measured',length(hsps),'HSPs in global data of which', nrow(cor_hsps),' are correlated with Onalespib')) +cor_hsps + +plotCors(rename(cor_hsps, feature='gene'),'Onalespib') + +hspp<-unique(plong$site[grep('^HSP',plong$site)]) +cor_hspps<-subset(allcor,gene%in%hspp)|>subset(drug=='Onalespib')|>subset(corp<0.1) +print(paste('measured',length(hspp),'HSPs in phospho data of which',nrow(cor_hspps),' are correlated with Onalespib')) +``` diff --git a/cNF_Analysis_and_fig_descriptions.docx b/cNF_Analysis_and_fig_descriptions.docx new file mode 100644 index 0000000000000000000000000000000000000000..016a474dbadaf63777738bfd1972473b9fe079b6 GIT binary patch literal 27471 zcmeFZ)0bw!)298D@sw@bwr$%syKJM&wyVqNvTd_VUAApc|K4wXYi8EWfACHoXl@E6IX^qX8fR&;S5{7{HR~sA&NL0F*!g0H^?HP+d_+2Ul|kS3@-~Cvz8l z22XoCq9SlmssaG$Kl}gJ{u|FgQ_7^xAQO`KbJ$nNe9Nj#2UW??cu~ALuGJez6q|zB zqeyYrSI<5da|szU+xQ64?1s-QG;_%O-z+;AoUpkxDUPTIAV1Ns0nY>M3! zrY+%djzBxMu5N5O4iF~N=E0+Rn$cvqIe}**$XllFkv{W$4>=B&aDi=_6{4Ozy(gUSD9OUOU zs^=VL15yU*FVIM+84(wBJVcECV)ad;vh{4Z`Yx3a#!!2nACn>+~;aayncjVSz6AMl?%4S$&1xiB*Pr~iMl{BNwq|K-)I zlluS35>DtkD)Tj8Wgmyv>fXCs?EyRC*SHSsQ#(3iIF@f(WDv{=ca4^%)7X^Z1`sw9ZVt2PsVp& z3q2H_uj8mAY(b7|V(liwT}Rv;ec}6mLF)h1N5B%{+$<9SpzIFjx=9*#fE7|k3_ z-R%Fxu>SmLMPA0@-=O;JIpnlLzM&JeI1>(hrwBNXd7{%`-yElf;KV{0LR4l33*vQz=FYf_u zSzZglVTrge*X}7`UXms~_*}JuUaj}%uXXcViAAe-_>(~2Cx_!Kao+EnFTwA!-d?^R zpuCyPf@wz3f;=5L>(}n3&U}JAd@&c)+1m5{Q_X`!up_rY_)`cWx2w%-yXHtd5tY?r zuIz|l+2Y6_dDgE$+P}zyYnqP|E+XeB1|uMkqCPfZ%$*aXN>_YR_jN9-1_V;wqe68e z>e>V{o%c#3#4|S5M4tv={ncp_Hd~Ir(4$$eIFmZ~RL=cLttUyFuQ?Y`c&5{k`7&4u zd+?xkr=oUExwvz*cUR1|0)vhiNV}BdVtw}}H>wTZ|TYx*7lTNW6&yY4DSOyL9 zO-oC{Y+aHS29RwznqBYziGO+DQ+$HFKl`Nn&^u1hA2vWMyAF3V8a2EIj)*An!ev4^ z!F^br&9GiEbzWQD9J1k?IimtFSQ-7Ue$@UUYj&%ynQbK|tebfl-kRnUxX+eUJHYAu zDQf2UCsX8=b1`!Y5OPo0gJ}iZr*6ZN?+S3U1`JxTBSyQ%v|#Pe%~aryI@>0x^B`=a z7@7iNyX;H zvUDV8_P=Z`fAgZTKGpN~Zl5wtmn}Lu;VWl&g@EqZ$`~Y%fV9Uez^FJ^^n>MKHgvEr zp44+B$`13Xoi{!NdNkMllj>d0G3KS$LM))z@w6|izc;=olS^rLtMDar&`xG|0z%-~ zmmz-d84vG@^GJ%Y%gD@Yq0p5DzUY+=VD?iRpoK2&N8|%TrOwh#7NrdhO-8RL%fsBs zr86~pK&!X)t*he-`wMOt2zizo@|6#uT&;OBt2FwDHN8J64G%5 zzW6m}qfIV23l(2%sS6tfU(nug_?8fBqyd+Ht%mrBh1%SqwAo;?B;T~q{#E>b+vPlP zmC{2frxUSdyQ}&WAsnqIwJwH26!Z?{j|sSw?J%fcT$v!6QAblF`B7w0{w|3MgU+L- z7)A1+Zj3o6!tf3y6%A|ix-}ElPK6ze9ZK9rPrzM%P6M2G!6~v_ciF-jOPH{NRCd5xYePnDTCKA}O7%weOAAl&UreG$ zvx7$~Or+q>$emwO3d8FAG+>y!jfyz-=I%(@h)UpAlSnd}pse^PsDJhO?uO}mT{GR{ zbGZ8*T_7V!wjB0P!qnRXBSFk0Fdckt#W+w#7~n2J?N^z=HGOXBolzsHZj!KESs*8v z27Zo&lszH*a#0(v=1zc~5OlvSXFnN2^Y7?3Ij<6o5?LX7#q6HevcU>t5#4G zEG8b(6={=poy_rosxHXRb+Dlkh<5o5VGcH!z`{4~JQcSCnK-z!i0s^DD5;F?JV; z-awOJSLzK8?XDa{`z7$GJXj@S3}l_&N~DfoEIceaOD`;w_W319!!WSCgT^yK$*@ul9y0W2#SD45JVgj)tz}&fTHJ2M$ ziGt5Rcg3ACU4{YY4@|BwUSm^@?0-Nwnt6fH9v__Uj|X7PNEm?9TtA=Pl$Kh!XK(J- zO=WY4o@|J1HQ_|rlx-&R9u(CIZIP_}`tf3Zhp*%bm0me(W_W zwgY}GgQ?Kib**!ISUOn|fPwMBTcq`Z3%}S&PR7sw{^uSCE?-l_{CVJeGy&4Hxe4?X zBy9^G{yUAM0&E19{LlIzYdMkef_Xd5(!5Q1qAT328mM|z6{j+jLbj?wJ2A#SbEV5Z&_+zk0!=rnkRLVj?_xmtuV zlwTLy5|dpGs*Ide93It1x{BVqz@?DkN?b4eYD){rLs&Q`SAi*^k`npj0zd*{WzA+- zAcK(MImoqi6PavSG5r8z%t$CPvRfni8Jvzas@KGvAc_dN^QHGt*(b}4`nAnH z-gts$X4N85j|q>)5V;5gH=W)SZ*T&@M3|I^qQoT(=b6_R=olpN*jwBHvfK~ECi!A2 z5gm)NjkWVZva1MT5QbmxOvKMkMJ&hK6&IO>-rm*&RoZ z(Aiom(Ky9cd0(MCf&z2fnN1q!SYA3qODYOemXB{YZ#I6v#I1TL+4x0-ZLyy=zFwiJ zn<VZ$*GvATi49~4cyaxb*2>*7(}Q^ zC$yznsXCp;dgwS(s%_hY_cC(ljjAtcrSIN^B8)DwWH?DEMEHfwx04<_VcoXGb<+i4 z6*}Qgh?r9}Or;4c@sVyLD~ksc^dZV^XI*KOYSE)_alndh`=;=`c`rNJ2F1P{%BnLM ztNzVq@gb0U^el|_Lpg;u%Krq*34kR4W0c92E@LYwUBF%k0aV7D7J|cN7JkMAO(pw= z2C*{_`k9FoqwIX6G)BszB8^Pnj`2Ecr72Yl<@H)g@)KyC89j*7kxzM}pxovJ4{_%HY1@_$x#yfuKWTA&~EnUYifO52Tsv15~iyxVSko>z-5OTp!ig z{%lcYYN4g{6GSXXchWw%xlJG+(JAM{l+!6^axbmB`;TK)=;&h;Z?Kpc9GniTxoLzB z#6}Adm;FFF@-IEq@v!e0$TlbW7j;RTaMeOf9EuZ z8Qv=i(NKf-Y?^~EcfP;Qp(u@!Vl>`cn^RS|}Jw@^EpuXDf4CdZDHRK07W==H9xN*O!|2G?J$ z&b>a{1^GGq9?ZDxA#cP6$RQkTNXq<4lVcqx;Vb*0m*W1Ovc1c|i|M!5=cbupy-vRa zrz<$65?5YBoZ>3%=<{@*pyx$(voA|!7Bdosw3UnD#Jp$t@05M{Er(O^jf*L2h|z-<_;8HXYm@a0A7e^wve0Q$<}WA1 zOABj1n!pWs$)cOK1_yw;%TAcfHy32e0{a9aM4wEVQpkt6?Z7`;QLlK85y;id9)91S z+hSZ6gD@oFubm>!qCL>2S4kR?NB)1B$b6xRP9xk`o4oZ&KfA5D43PDJPIuroRs05R2Gahu3cKPTJFo*@Z=xp_3@Y zcLp8@C}hOHKZr`fr&yRrT=6l8ktWjw!Dx_bz*&}dAtDybcp4MNQ1lf*2W!!ZGiOUS zi^#$^{)X3~x1WS*$uVN(4yyaPR5Nlbks(YxN-V2AmI1m#_9R{^ca=`PVpA^qC9^(< z)*WxwLHPYm;s67zCb~j6+4h)T>mW3#zhEn#Nwsk*>DKaPls4nr8E_X5lJ)4-a z>4_bQhy|%8aAjeY?it|}% z=>;hDyi_kLG*rXB1&7RywL5K`tkEmi0mN5;z21m>Fd=4Z49_VArVc8IR;r}FVks&{ z7Et-%y%OpzGI;BF6wcDkfmQfDc4|Rkgp^kgJkEv?h{*>0b7yRE^%(zXy-CJ?a~mMe zUyV6Wb%JS0G&U9((?t?j!hG8Y*zx-+mZSDH;S`FfJ*N8Gve31l`{BCS8V9TfMt{=CK#^LtRNGE=7^JwV^Vd4&2YsfskF*7) z=0}I&llS1Nj#1COgA#=tEzC8vE1}AwdX5|;OmwHLIlIuDX<1w2_je_1 zC;CcHUD+6_4S))dWtquIVHq`G85(SeISo&d>r^_9)#}l?s91=F(bGzO9_rJTgCgE! z)!^TAOtm#WpT$@nGhgEjeN8=P20sgabp!N%*sOs-B2xakj;n?Es|RJ^&?$)`9B;w=Tz44Mj+s-f>5-_Pbx^$|bR2rIhL0?>=$)%m-VC2{Y3#0`-2-zCb5 zTiMiW>c^%H6?0MlDzUZms?oM5L@h7B+hD@uzIL$dpCjDqK64cgL$gI+!gTW&5@tZK z`hI|xW_w!&jB7LF=V478)VKX0!Z>5p22MVXktJ3k?d3>3AwYp;(r8Ip6m3+xUa}x& zS13P9YBAu|3H-Ko2y+O1-PdZqd%q6YMb_dfzV7{Q-ec6CW}3{tQEFJpWPCQ-Vdkx z>2W@ltp4EFW81}Dwio+Vtn)fHHDTy+Et55obR26%17=<`1Y&m49#5k*zjo6dp1L_L zZ+;pwXt8-%uGHN>P}k{4^xSk4`MF$;2)})tYKtek{(QQkAgQ{ucduPsfsL;oaZV(&U?~b7;ND#~qz-B# zZ8ncCM|bl?zK8>{T{nTqZ>&QQ2FB8rGw5@=u!D+GaESOQFb?aGY@NqxR#L;p>okpb zu8RU1Oz#()W8lCVOG`eCO ze%0!V_`XclG$N-L@{$?1+4@12E3^$-4;OemNB(y5((+J4m5Y9tf>!*-^o~d2=YgU} zjPht4il+YFPx4;4Lj#?hC>yz~h(QgBK#`WO?{h_$J>3WO$NeP*st4cfp713w{*)2U zs959{6)Y!Oh`g;-&G&YglgJqO?$igBU<~$F=x1%kZOpM&$PVUon~Ti?HOV=@E*kjC zhwBNg;uFp`xOgSV&RaJsDpaBD{Ty;g?k{0aXynI?f^E>~3>%MFfEk!YM0E|O*}ZTL zpvL1UoZJVwp!*nzXf#x7wjRrU>Un7Vb0u{)OF?urs+?hRFA4QDUY138vzu-hsI==C z9#0rQowwjHDKD?*k>ZY;e=~YTrh75U-QM^5M-Ws|x^;&z{=0tOW<0zQl>c+U1b}`4 zrRQ($>YB057+_WI4WxJ;A4p;GK2C-(%X1v0JYsD4jb$(OM2!&Q2$J^jb`ZB|S0&hP zCUvwToPj)A2JpaN1J|1l1h)(dSpjYwVXb^qXZB0{C=YnER9DT4iqp)hw&5}Ghg3qo zDPiz6I?>%T4Y2rAhyoQfNzJemWkcrcjz0tvC!W7KSI?sJ@gXa!o~Z5jMHZwG91ySG z9g7)gLyCo$&jUFYgMvHvcN7sQW;)vuYLwvzOD=Gga4tU)Il9a01iT zPd+8K^NG32E0}wb4sAuU*@dc(O_{dMb(zO5K){Li27WI)52<;Ld73a@OrL0^4!Y-&N}j0wrXQV{r56yd2{?S6yAQyB3ZLWS@a?R0yG^HJ>UA zlrs@kOs4H)yZyk<3&57hbyIX}Ut133n8o_&8npO$Waab5t~2i#ih?{v92g}N9bmNG8Vua(sapWj6+SvC)3mw`hv#x##6kPzm`3(eQ`; zBTg%5HNxc_cQWzRcMeJzjL3h60>jJVhQerX$W)*n^@B_FN~}{TpM;ZFWX5^&)odo< zi?rZ?+s>H@)5_&)MmPuNb%o04S9}LBh4SPM6CL}}3qWbxG2-%4p_j*th{63Zv)g?E zEP}m--BlF=!s#>eF3m7U3HIr4r~5x{;5i56#)3^5UDF{e$`k)@jX=(Z7w2|uUe{KJ3{sg5_esLMK z5J(Qr5C|4T%$a=_*z(?-g|kS&;;{2u_ayrkG@2$G?O%0hIR?z&6Ad@hPT%Z_nARZVMBL=#_9T1 zwsq0}>MGdF9el?n&xM$zh?HPQX+pB1{^#^=!3r)?9j61TEfjaj<8@v$<~Dy3+IIeQ zCWI_nF8Ygu1c@tXo2*DsL{U6`TGnAb?Sex`p>@0e)?SKWeJIR=LspM2g2nIDoNbkG z{G|&UU0+WOUnfd|BUOw>SWneRDtAjX$wnvo6nP66oG!3l<_4y&)(l$o!NCkD<;tK? zPURN*G@86w{)@m)-i!R@0SNi?XDFE}{Sk=vMeLKPAgsHLwmQ1JZ>_ev(pBS;4P;ue z)8*CmegL{D+SmPyos^wpmZcWWlHUV>@?OkIt;TDayH1Wzx?PD`( zwsem4HoDP&dE8{1tU>K6!F9u&9@d_&qk?(Np%@%5z1sY$YNKBl{g zC~jv484x#mB!@4``5+Xr3QNh!JVlwN2b<+?AWw()sLM6?09p?zW@!R3_JT*Vz+GyBM7ldHeDS^<^`D zMp@&A5&rFV(9&dBeuHRVYTWUtFa>Hn4i4+hw?Zwzd?@qSZ5!f7cX#)7Z_W#6xXZLz1iXh`hJwRDC1Eyb`cjNW-Ce7{dvr@cF%T$ zE7VPMkxXmE*_-ZHQjW1G;ce%cdvMqmTJA+ZY%ThjjHD)4&bybr9`WAS<^F<7-Vd-c zyPI>M!;`g7j>f@WgNc^Q*%sV8DD^I@bPPJChESw3()(KP9@g| zi|=Vf+)=D35`H_aCzpO>r0N(xJLw2>G&aIX-g&`z?mj|!mGu7wS zucVK%tG5H&S4nov)+&-B^^HezZj%TS58mJo-Nfra=qVuiq#S}{xD0@1B;?~LPM z#>wibNuD3jID%_9#YumO~_Au!#Sh8$$D4b_{xgfuDbqm(jo%N{E z<5?+xTD@3VU+3^iW~o1za(HrYXMQtD^IJ!G7S6&$yCz1@IWbj+WpL$z9`P)=hLJ{Zw=_q-a;Iw6}|az?8I5JR{7qkzd^ zR}Sz~ur;49NcYh)ulB+BaA2b%DYEX4ua>h?rGj0=CU(pbY0-LMtzs=NyrYmDHKs9)BE)B@&;8)!hPtw)WlU3 z`VYCR6EyX8_lw(|8acN8Bprvx_!8r9H4zTtaVRZB^4V%E{Se5M(y0pzL3McP0CT<) zfmZ0ogYeek^J-}^t!hn6!%q$jv$xj9_-n6?JuD6ua&^05{>UvU1Ao%n@C~wND(b0w4_F`!9eL0N{WA|H2u;q(o zDN@(jY91-=U$a*{7X)IHl~%GkicZmYCb|pfSIj>tcD5+i`S6EUWPq)Y%TyGZu@iM- z?y4QCWNi)xXae?rEEMpj?WWl9=v%HA0pFhijLaK2tPC?{X!_}QZJlotHXjJJ4?(K> zDD}gF=RS2`vuu@N!Sp|(@s?+gFXBKY7GkAL+XGPQDbQf<6nM_}vr{G5GpDa8kx}ky zv7nRu+lP+x0u;~1YA`&7W<3z`^t|6ad7Ig?0R9BU<| z_h_zNRX5nG^wjtIwQR6y;YzRcq~rYH)mReoQ?Y|E$8$=u5J`8)sWHIfSh0BkhphbU zuhOY1*8~N2Qnoy*gU@jL) z^)O0y=Rt`$c{Fs`4oGj$a%QBQakYLyK#xI0{rP1ns7j^w2#wnuOL(R76;b_grTkFk z66he2B=UUR-0n1V{nDS(^vtnp3;6Q|Si)IvS4%1xr;&HM>cyYp9Iow_8t)elOdc9% zbMsXLum5)9hkP0TX7~4kgAUV3r@^r9^N$F@x0|wbtm1)88kTkJ@DxX`p9d{V68(8y zDp{UI{N?Jazs*~+Kc9?b(HUjXyNTjLu#qKn`YqXwFJw(;b7x5ABHzSz~NXRvBDl=Ke(GooFS5spo#hunSb zLv3sT(Xi=K=;?kK@gcHpT`zIrqUoop>(_$k31pkygLyN=F4wJ})21frWFy?#7Y%k} z4WGjrDT0YDB)0I)ZaJASSF6|x`OF{#?YlJz-jh#9q zZtHd?;lt!@x@TMi8_bCpnrJkyii|QRl6~~3aUan)X=t)@w2cr8+%^e3zZ%AI*| zJJzh-sGq!jcm$*87_)0M)-Q<7V39Q6R{z?UrxG!6>gxZ1Sqbrcy|wq>*4;}Z%!m(b z9e-?L!8OXg;a7T8z??4$wA0yUOCc7Y+QMD?`&qg${YEdq*wCH8=*sTGcomay5YxI`9s|MC5gn;mvB zLja1Q?<81%KmHO5?o^M-*pk92?Ql4(p_l6M(xMb}RK<7{Pc_$R)@IP(3$zU8e%rZ= zEcMLlAB|MWsXboNCivYq zhXAWu(Itd_u3Gpny&V+668+Lf+_7s$mDHbCTf-d{@Y(pAY*Q>XXEL(*O$P><8k(ny z+0s=W7x_3{mRqc;>TtXQEhkfko7U^J37}yJDi!mlBjuz+~?u zQAFF0tkxj|FlJSw5k{!VPrZE4(?zoT<6a^Qb^ZzROKa4bjTj_b-tX2Ftq>GP`iVzU zWfKD9`udcYql08})FohW9+2Aly<1I;diW%f5knW2tW6jOJ=@E;dYId@dbgmt+G0PK z-nKd@7Z)%o&*h2y8z&QAU9D%mXeDI+9Gm!+ATBmuz%8V{23l4rMXk20WA{1WlhA-k z*(^qEYkwEK1f2WKCF^L``6%5@C$_9kas_Hx1pN{8o z^yUz%0L|kn>=wTEt8g#1g$t@AmRAq|Y!26*_fzK(;y-2BA7zBwpO3_9hohWDkFZna z#R%)Gb{Ch78XOV&@~0YEXXTqSPUR{;qkQXU(eR$BqV@L#hh`do%1DCd34)Dv&UGG;Z?i|1cQpqIan*RLYc4Rxeo0%nnAjGAK z&#))&tM;`BJNZ4-qgT|8A)SKu+b?$|ZAaLCSYUN&qK^&#K$CTnH(^Ro!rmx^x-92@ zuW&q;MGBycLrqEhSo8LC)g*8_f8P&gwIrXzpP|lR2HCj671rq599fV6vY2e(A)QMq z*4N#AEHcA&z7+fQ$<4U;xAIAx5LB0*e(_-cC_q~8MgO8dtd7*R&bGOCLHE*Maq;|S z2UdAYqT8|ptteg|~pVV#(oedmWj&BUc61SPIOdnl1h z(J4h2f5>~%c$=`YPb@F6(uSVdlGyquinxL1uTXHPe(NFCNpqQ#LZ1Vh6*K*K(Y#@M z8q{(NHT2pKg<7YdK8}=^`!H#-5yz*_tv<+KuJwCD~?9BuLXCknQe<#_n7qy&G5s^vkkw{XjIFXy3qFM<}$4E9ObJyMz{ z-W;a;ev5ByjDd}FfHr)(m?%T=P65mC?pPp%M3l*mmN zu5Ojm)JvReFVu^kH1DnC9TEN&-c@V{Po8w~m8e_z-d;T)XeD$G<=ylYKe)`yP8Fo9 z6rd81`s|)MYGl|QiY=c~N@Jb|=!xxNDgw=I+I#+1QDm|cbOzgNCVp*ye|AxBM%4Z3 zDLv~p9!{`qim-{-%=1asdC_S0B(COfW)OaB$Z6uI{%NX#R=+k@{P)wDNV|)CF5sh+ z=}49pIMhtL_4wz}WU4S>hdW`9E`9PKQA!EcM5$qTZu|TFI}we$`9@U%4cfaRUzx|H zw|-=`!88HjSLt)^zi%-8qi1;C!F7W$r(UqiF;5e;1)Ovs&9zE9D%gC<9X|QrX_IOU zPqz?2>r!MsgI;hWOGg7`E6~>;-o}2sXf@Y)tMQ!FFxzL~u1<4gW<@S5^SqTZrzKr3 zO9$wxnp_zhLtqQA$a)c=ITor)Qf(?QVdN%5;!K*;%(}IP8~$m+rTye=1TaP#IlzS% z0cw4?;kX9_;^S_*A#+*&dA!{SzA|G)`T$TGfi3JTA1^KvZ1A)f$Q&I2L`ThO&Hg6QS0XwIm%&PM=7`RrfV*AY95#xisqTU~@EwkGWc6=M-(V#-UHUNaocGf}GdC4! z!&`#C1}Gr5{JZ9>1Dw6N=NMFJ5oynw^27CXlcL(px)!-x6N=6^D-QV^7_?o5 z5CS z;K*(Vd{4#N;8tLm=VcT{pJsk}$MJKhr>-i3Qr};q9yowq750qyJ1U zzA1RVG&QaB?mpc97`-nJhf^eZ+LeWGSc30fV{;?>4Z2pjsWEHiHchiu%h$1-U@yDU zskX~asq0)N)S|oBnfYP>k=uO@p;b*8dE6T5Z%dx9Ii=7Au%NBv*`|1N=zEj=*p_S% z$tfD$f!c7Dqmta;RUhmS$)4nDvV(<3hPe~*eEWL;-pfmUyqW<;^uuo=P^`(2Df^{9 z8PIpR-8oPHa@OK%Pn^eDA9l@*09snJlz< z+AbFqh2xk7`i2GPR(9K4S|*v4XqGA8%k)78Zm$zOk-KQKA??4$wNNLHudbkFjXr@M z!JpAh=Wi|PTy2U!89%HD-_;3U7Uixid;&C4men#6SARZC9g^#6iN(epgGgldkHbwi zNGR`+Gb54?6$m>x8X#aCu#?syLRlGm+7U|SS0iK04k(RLJ&G1p&Fg5@M#RoOd3EQ> zV&-v~o61p+lCVKDD8HA$Hbz|KYCW;q4fE_$b|1I}@L?2(HlkU*vh~J7U87}irwt(E z5=Ou{mJEc5+CoHg!n>@hK&E#2n$FoF@{Fid{-rhdY##N;*&4$h1zX(*YlE^FFOE;K zIWtD)V|!LQKrB#!3$B7D-VxhG;)@e|Jnr9bNnvGcD`w6(4Syw3Og*awk7Bx-qMK_#?kqtFhbC@o!Zdjf7`HV_NRHITR6sH;FzoHu zXTz+BK61x+4MbPdI#@ugUL7mhj@f<=@w|dy&g)YDMyZ9)q_z8`=f-60f!z}?QZ!hO zr?(wQtBF)mB!NVotQx`y9TB;aHv5))4 zqHIKF5&+QF38i%6PG#PVj3UPiALXJ5b(VOrtn1=ymo25d`(>WoaHM&lUTDlc!=irQ z&p)3HPlP)>9g_FY{8_?qYRb-7+rOz<8T1qIZ9mD?>tZNvT=0PM9<$x$weGfz5$vVA z!P8rT8ROoqR{AU9gr5g;EOG0HJ%u8>`*s!b8dY7HtI(+GjNij#*q+HtMRZOD3&Z}L z^hGl=C*SxWUiVB%{;SIxovnTxnqT1%r;>O(kde9g~8e9$k?Di!U8OC`Z zU;J-7uKl%Ux&_v*p3$2q53 z(m=xXTYqDMNmpQ;m6r2W6+-jFPyD&i^bsC(J5V&}=k0pm>jZei)5g z7j8cp?0kv2bX|}pGmy|MZb6+=dM!9cZ}5@}>%(ESD`7_0vO+wgU~a<>-}94mGANP! zAJ^{)|9$)a6QZ#U{Aiki1OTRq008uV7XP)3T&>LQ%^ClzXZ|mcMteFMmlLfQ^Gz_a zotGfKH{CNtuuwSG;;RVhz>FINxzxVa&skY1G-;jhr)FG}?hzM{bXZ z5RpL;y*?{Ym{_GrZK5l^_0NaoDkn*BDzSfAf-h7{j!>j$nbv{n8vH9dPO(&+JTp_JvSc4Bg7*jquhX3!(91agRm=n(UXDue+8n zi<*6PxGqTY_|!{wgse}4r9NAwJFU%uR{h_Lbhw`IG6c4|FEYLQ&u@?hA)vv&9(X0x!4%Qg<(t)XS*CKMeguTlrYI?(adxKe@c3}{@SCdkFqc(QuD7%3qH zN6bM$s3E2zJ@-z73No-W7dzR!*s4L`p|dn+&@ zYaH{ty}vFFJq$lz7gd2>(>qi^!aXEsCtz+cFIx(*NiI<0;*u;JWJvp``+*CVFuAGl zv$#=#BRx?Nh8|R=d)KI0M;*xT*?9N?Nsw%y9mp1(2$-i3^{JAD{-Qj)<}&R4SgYd9 zk@f~Yw4$Ah2WQtt{wT{;y}UW65ahHdi9UiK8ZpQptXyI<^Hnor@EqD^-2Pbs;J$^)n4*6Pfmb z09sJ&!F+Hme25D%|0Su2a6m9u>-dHRFyu;}>OO7h63PrYON!!VDtsd%#UhcP=NytL zrY6T3&8$BEHWN-68lj>tso)4f+5Qv?>#*aUKq9&Jx%D(Mm&%u zY^=0vT7-1C1-nH;((($ousa?psLUxCsqH-zUeR>$i24*aXo|*X%px0iE=GMR4k^7k ziN=90Zg>NFjr3p8CpB?2VLyvzXps+|%Nh%G96I=H#|%QB%Km z9Yipz&{P{2pDu=s-CoW$QBhT7Nd6?U)p*M_Y2Y72%c?9MT$a*G8}2}7_N2A`%hrspHeC>U-OVDM8r zgJ0XmdveglA9(tv9;+y@*S4IQzE`+2%f9j#yyzXO!1|ohUzRtoD$TRPa_w~auC3~= zI7Ih1feLBQo|Q8T9@#2@sHW|j+IRB`ili+a{%&afsug-e!vG9eeJp zP1_HtarVvl*&lDT=VolVMfGE+(%C=gluhvc6zgo}+R6OTtfNc8+z@X7Bu z237>gElhC>{i(2?B_zB`n5~n!@9C$J-pp>VZJ%)SDjRwygvPcxHG^ayUKOFgWUtUS z5>Ci;z|Zos91_om%vP>Kh*a0F(A8yZJw4Hq@K@BRav^Cer3q+eTGj7PlVb_G(l<|s z0-eHk;6~iE>0EfM+zehXe*&id7^Y{dy!=t9Uy~;LX)}K5)+M<>MFK6mWOfzwQLl(G z>G9kVaQ^AnM$`N|##-xF2LgkQ@#k01!C`MNl$Oqs@iZ>mtFI|Wo_Gj zxAZ*Gm9=v9f2XXQ%y-eS{`~-f3JLH(DN+}6S66EXOPBuyzb4J?_-rmDf5FqQ2&&n1 zct|*KH{C=+z1A}W#c7S_i@;#+z zX72(K76bkG`BYYdd76FhD)*IrMvqf6ocLcP>I$$1^XIpT6StkMz28qqNV1h^<>Sao zYAg(m=eGtGJb4P{EoJ11nq8&nfw~MbMLWtDrR2!Z^8DkT+Ni@E)kbGrZupc~9&8`p z8<8|uR21Zs;OV|qP0~}Xx@q%Qzgv-hueGvk-&IpvmrTW}3IlI~G=I$6tTm7P1Q(VdWZaMBiKEuayRZ9T$=ewk4DT!28O`A)9m|fU();}^`fYvB%8*&* z8-T5!UtG~b9(D1PPu*;?jh?~uVHRs0l-AQdU`Cn|h$DbZFi9M7SE8F%=};JNM;dA6 zBcZuqjn6!zJ8Y}uMYQ8`!b6{P`I9KK$V5LGE^Z@OQ|6{H^kJ$=68P~j^uZIKthyO{@r z4dQs^f>yrrLzEO9Ok$BDU?Ak0TZp#%o^tGYv%Zv?yfTRCvuIk5Vj*~`RSufLp%U-z zW0K?FBcs=WkVb8ZyfXD8Qp{XakMGWWiobIteSu9lZFKQcO}=w$Mx|fJKN6UHx@59F z3h*-;X5jr;V-#?yRj7<3gmB@s=3hDqtWfXg zjC)JBTIg?})Z=>c!W*X!klpMKm6U>$g7lTSU~3Labk+ zFVO4tt0E8@9T_*e@J_L3teYGKnY~R5b?U;gw5I2tdUa3Xn)x%{Lt$T;dnQ11)6F&T z%K?gvmiYDLKUF4MyQp4*j;^6T!ph-;H?k*w0HMm32 z!7V@r2oN+0PO#t!gS%^hJIS}t*?Z^gv+i2o?>n`oS9iVrJT+7EcFkK|PgjAl{09%X zpxcBNTB~D$6Izx7Z>VNJy7OU0KCgiyr?@7~@+h!z`S0PrqL&=?^G0Mreac-N@1Fu7ZdCFVPqg-9+M%}s*R#gaCW+NqhHU%%q~s{@IfRT5+E(d?#iinI(rB)5m6eNm7?=X;;>5 zYQ=tY_liw6Mg^q%L$7S5Us`9BX5>6xW5#WKZ=WN0Tx;8zuoU#PD7U?=gz1gMfJVzH zVS`wh(X{RNS^|+zSi)h_I)|CI*ctN*1`BOu5hE`{8?eP6D61~ zb$aJaC45(&-&JRFv#JCZ1KHrU33-gZ99e7FeJzbx&3`5|c}&Swr3fY?E`t`tO;a3% z9*TK9`nGQXqU+c}6|3K{(a*{VX!mxXCVO06lOiU@LKs`7z9_he#wHh7rmI{}=iMJX zzz-6e#G=2V$d5NWnQ|$-&mXv5E7`czYL|4HQ%DzY9%m!{as4Rue0O*@FJ(Lr;JJCJ zrjaX}2)~bS)JfQez!He`^}&MaR(vVGFaWiyg@pz}jyIa-9(* z?00&Q&&8arqIvA6Xp526m?GB$`MSYmRMzKbt&^#k+AuVVi<9?k$G=&=%vqWE8NtD; z#fKECr$16bL^=8suSrYN^iuk#=&g&NyhH6KVXijaSWFG>$}5&ud+KB2o8; zGMKg0#as2PZ_;(C+CF2)&JPk>46vxRsa3PtnVmi2#qpg)7C0M0mn`a`nG>CY6oKcu zbjE--klG%L0GGU(l{Q~ED1V&Q=hP#?P(W>6>5VOCvr|QBwQp|vYJ_@u_ z-e*b;TpfhRt8V)C3DcUgY0E{v$HJV=@(vZu9pb;T6Q?x^m0Z-``*NhSxj7G&6pf6vZ{g}WLTn+v791TCER}@Tz^1LD@6mjL_wV$&0AEH|ahyWtjJa)6 zh&}j@3-4upjF*y}xosbjEe1V}+uiu%(v|pYm-%xzi{)4uuCc?~Ogd~5YY44-B zFRnH8J^NX_I61?h-ey$vcP4^Djz!GaK7v1aY7#(r3q$%W(h+S_%0};Ejzj^`Mvizb zBuC_)z66+R=IwJBYUbZT)%8UAeLX3!r8rW9+^g&9`8-DuqNvF@j> zZ5{&;0gr(L84dp5iTrUGkV>NHtqJ_#eQNjO!l|U^hc0ZfNuA+)x84;~MK4}MWA?T; zpF@hr0#vZ=_;udZI|Xr6DgpIsH3|FqTN|;b8zpC&n6iIhayKVrf70fa(PbC}FQFFw zKy*Bm!=4VuqG z2N}46ta-p{7Hw{~*Rw7|Ojm<%0VLb~6W_oo#V^2xQLASct6QfdyEdB&(#G}t=Fv~D z`zH{`5`*v73m&en@8ygEOaNRasq8M@2vUXEBHa#QiP2tUlg1w`4XYK4!lZM_#O}Q9 z;>L}8wT9xD{qFrSz9cMe$F2Z?NEVe|W;JF^<{0#K7m-=(nqA?3B47SfVA4jcZ`yO) zOQZ1vIg*+u_$Po2)DC5=<>dZt3Tqw|sUhKEAlF8}+KH)SB7Z7BZSI!IkR_sP3?doA z3(%*o?>wWN0}V~p&^~+|o5q3YVu}P48+$m7=$B*(mBHnbgYP5K6y%ydbn8%2vl~3C zM;;SkSg~On=I>ofhg@ErnS1z%03w zc>lR;``li4FVU|k_>;!1oT3oHcO?lME6p`~-rLMMx%QHW_at^nsF&QjGy})+dv%VU zNPR$A6F$uNZysDb)&t+T*x)+ZA71*dyQc&Pf%###1WllRJ!h^;weHzJ! zXQ><-VzdjPyde&~&GSZIIt&|lX7bEWUk;TEX6BaDFg0Kl^}ao||E^U5v9zLhWTg1; z!>E@Aalo=ZrO?Hmp?-02^RT4u*jE1&HmO1zXY0x7LZTkW6LKr=gnY?&?BGE1uIS<@ za5MF!9Mv&5W`6LfUz6U$t;7B&Z#mXM?R%O$l_pYVzq1{KPtVy725TN`_J zVf-0X3erj4I}b$cNlAfmo;$vuJG|v?@&Z7Tc-s;ViisC`{6%hj_aFghSjHkO1ESY- z5A`M_jm9}X*ZcS;VE;fAe}_WxFjyJ7p#9t`fN?I{?kLG>Ptc-D?oH+VNd39RK#K=+ zQRKIX)!Y{OqU>1*QjO+4Va{9JEfR_}$c!)WL?4Qe?ifF7Vt%;JuctD;Jif*^>HdsA z<{6_G!G0=pY1a6~b4vD`fUoj!bx;nL6lp$=P}C6hNj*6hxwVC}#*mMKx_2-YX4=zs z8+vsA3NFZ(uw4lFLz-&_bHZU*_+zJ)d4l4n90j(knWGd z$AU|?zm+P#oJ1`u?qFiWwX5IF{KjPQ8VC$y0%y5Vq~yv9EO%Y^D%W!qN#AC=c}OaD zN8^DVTc4!hAUJ)7_;N#WyB(iG?;@=lDls*pcOsCgp9u~R&Cvq(Xm=!{UX(mJGtk?* zDbC-qcMyf#PljNdv*<&oU;WUhDHC^G*#SGy{emBQ zfTSo#<42x|y70;IrpQkP!x$cC*s1#UQ6wI{5TH3fCTNQrU}XJl_ddkR4JaRhQfw~s zN@L5NGZ&g+;v2*H%mlf)rH1Z!~qJUtE_QS6`ps$U2YDtz5%^nHoD5Jf6Waf zI39LWb~ec3RvWP(Z(o{S+S0xVBwgW(F>TX*e!I()lG2f`)6;3y($!tYTkuuEl44}} zl$9$(N8x)fQHq9-#r}lBz_f)B=Ri(YU;4V8g3{Lw%dU0n^!HYNGZO?Ye#;Y@yPiXh zD!ANw-m_b1Hct0%M~|sSeP%Y~9H0Z%3siPab89bcoQ#{8)4n~ynx01Zq$)H5*>8{j z=#;3DM2XDEBBW?z)Ji{(bP*jAKb=KQ$nO=f+s-1jIGeJinsj=pnb~$XgA6{v_3B6? z1oLl7Gme(uhF8_FLTNsZejmV|5IKO*i=xsA0l`n#_!YvLgcLWwu0?E7l`P_I*&>gj zL2Qxt-6k;RZ-MMltAsWF!w8Fbd0<3mv?g_WB>i%6YYbI(VK7#`f-_XJln!-1?;=<|E0Kp!tKO{|9_O{bZ^J4(dPV?_FGl{sMtlj zNn2yyjdn7wxjC> zuUkS1ud{q^qsB3rEr&9=z{P2TDeVC#E|Ygipk>I& zFhQSc`rKCh`9%VEop1}*xZ09d{I**LPpl0EsF%RcF)CslM{Gx45pCr)50$NQ62$)!ZD2puOtBc zpt*D!wiosnQ;OKXP1yMF%TajlwPjgdV5GFubV~AC*!cCoNb$O3?ctzm0%0d*6f(^k zHC3zz+QnqGSzOHFNWCR>AD*HdcSvNH+rX|TQ9ryA)DJ0BB(KUxy4W@IZa%2*Fm%wLiU`Qof3@Jy zj|kUw4M}r08(7o0E0ag$&#xq96s&%Zqft|r-D+Gv?6?H6Z%!nJZs;DESo%@v@Oo{ccp~VeHj_lctj@HC z`JKGpHxV6$kpn^lY9m6UyR{m~-dhy3z5+To0jz>LZ~}KO8CF?e;t< z#elxUjOoM__XQ7f*L4wByu@l_zO1M@;TO}p35r>FVP9^jrEU-v+dIEqjV8ZRFu0%p zSlqr6UOC%xrZ?+hpcGiPw(Hq6n*VH=K(_hEv7k1&X3`*9Pb$KdYmE zu*0EeQ<2Mk_ZzUw&BNw)I1tcdSFl}$rTPN8KbqO^Ey-Ju5;p?n9*wDFD;2_e7fe!h z_|)eNHksG@xbiek``toi`oYB)XDpQNNB4V3P4d)`p3HMs8#E8p(g#xciPUV0ve+;5 z_@ylbv**=kmo^YJCxXxQj~Uy@Xz+O**$a@0?u^Mp^719trJ2Obk~fQNl!1wy62S`o zI7UUXUra0QxF}SkVbcXl-3VWG0A)vCRoJ%rX5t;2#=5FSM!uSiqT$DruQ zycewqZBH?^7vmxrjec4W?&u2Rq7f-xQHy^34#MiE3Iuua9@_1!s)zHRCdV!{2s-3M zkQqo%KBerT7~~RUmeu(vg+6RZsI|^5;#o#jOCpgr_*qvk%05aq(z!r=3!mzDcBg>6vvJ)v#fK4^ zpkhleUR+{kn=&0S{Z=ft&jJP$anzZ>XFj_Aa+-uEok!$kiUDGuaNQRY1B%OU@53Vu zWCI_Cu!x#w_764Ud6QQ0P(Ix=_D4Tx0zY{RMk%gDQp!iAy$U)4abyMe)eSOG2<=#Q zx+!J$PTySZ>a}ZZm~yoH|LkW5R55pO5?5R}noz66Dwn)_;oja>XBiR13=Pm7Z6#dF zMfZwuiOTDYyx7>dYj?o!jsgoJ`75nh>;}8cUf`SLB zRZ?){0fi5yfof|dAXa+J)MU*q6r+O}K!O}od{V0D!K}|3hu0_OgZ}I!0B%jn^|Pui z%IXx`GPk;%K76v!3jQj)AJ+|AivvlgizSAIp>~F$v72vqt3V=LX4g2&lLcH1{g4Kd zN#V;*GQJHUPIfyNiK#7*Y*j0Gsy1`OyW_Daf>(Ekp(1lo{|)F#%w=2$u$vJbH&y^7_X1mU;A1%nqf#I4kQ#?I7X_(D8Ntr9Hj}eQ*w^G`_XQQp9y{xqJ zv`W(SE6!E{JR|&q_rU(LsrH7}Kztv8V-K zVE&UZt1D8zs2n!)w;9a(?*SES*aZFBX6BAozXnv&3|toIh+eF)>`Uqfy97mh=%iGt z7}()1^v>9gAwkK~Vv*M_4t+kT?o5G*EA7?ZrjgNp;|rAdmb1WV)1A$#Qkog@*^3GV zXqu(fkQ>oeP*5vx_6D0k#9Z^W9fVQzeO}O=``aucO)_e!kU|zJwcan8_5^Yi8DWe; zoa_v58ehjSBEGXHcPlY|Hcg!VUM7Nzwb+`p)D3_}8P^uZ5c|^ngzh+o(9H1RDLxAD zu_uVV1MI|A=u|Su&dfVOu`#SS0*#76eDqnAPPHLfpJapL(PZ4G!Z!f~?F2S;FFI-I z_Tf-69q-7BBk-li1S2(c0?|(GoZV8Aj^+?GGhTMt8IGR|h zM|=7yb7a!;I^gdjSrBwJW%a4)Kfo=z-QtMk2O3~hjK73OZ+!Bi4}2oFn z(s%mVGTn~w)g#$yl#Y)O+~Dr!kHamwjOpnGKyU2hO)uPNN10m4#p6(2_doXXG-A6& zr#?#85K`WC4HdR(58`JbLvh&3o@yo*fGj?*O)Bzc@JcIFD%ahX!>Zpv+kJ}A{>6=H zziFFVp&lBSzzC|s2JlsKdO8BMR6UPgIle4NW^~hxRVk2OM5HGFp~-3mZicXQOc%k% z^FA@gi3A1u`wKjY{Ny^-bx20uWz)CxNMO^Q(W6x}m34P5E}ou@OpJX-e#7e3P;R-F zC`KFeH94t|5vksx3FHc~0PWUp$|hZDQ`n9n;OfVEV?l5LgG;z!qalGLy-!*a*sA1M z{A|Qfe+sOozlA+LKO}1(nVk?pjl0z6J1gsCwwFaVDtjd5)jxjFmFSvqg&n8>g^T+yw}#Da|9@gtSk>hwuc-C#;C)8vwHM0I!XvlOG=7zN&w}N=F4>e4w zlj4XJd&qQ5Cz!fA2g_9}o*bE{qCkbeeqaaXXo|~IZhFF|zof+L!vW9-xs}Gzje@P? z1quxsQF4J3)Fj`~7!$tf^GtOxh$=prSK7&YOaU*?OGGCn8Vi+#W~-0P1hJBcM) z)S=)iGHL+hreB)srN46j`X=~+EM&A`CA%uZaK>0e%5T7S2f|J_cJ%lxX^FEdwCw_- z7O0Gk2UeC~)aI;77V5^riemG18m~2^>cuW#mN6B|j@=Df4j z5);oR2|+Ny(=w3)nub_A*RGxpiD{Zc^HSXqcw=(fNe`IXk~W~Ho52#bp6C0J;PUDe zPhFPdY0rR{pVS%qK#`~vH#F```jb4&4j@KverxR{#zCI{em6tT^J#%sLP!fT0qZQ? z?%Iab$02eC;N-5W#azHP`m)qvJP^-hvcP&qbWbElcUd^KY(hLLX6x}cgVsn^XJJbC z5Vyj3^G-t!5lY?_D$Cax72!q39PE`NWcL#R#c4qMx~n1`*9k9O08V~v$p<~-o^2X2 z^*U+jW4+I^_j(2=>SNKg^mCZ*v?Y?jW3wb>Rr;03cBZl_czOKz!nZ0%sITol7*wS6 zGu9@&84qXJq2*uo(+f<&*W8_1*FfRqOt+XL!}@v^CCyKPZKrk&ciKp`8(toXfRGU` z%i~KIxa}-VFX8Pe^doXkR7lL1tj5~g-DZ>TUVRC&tfa(gO<05zlBm|{c4;Rae14QX zEdNk+Tz{%rdcCN9?$n8-mv9&ro?a%ZV#RQUxI~}niE!s|IiBxuuUgXVPeWT%US~rb z)Ju~mq94935M2QKb^l)z4v)YI3mE@-<)B|q@R!X$tRtkZ^ml^4uWj-dEL;KXV*ATV zC%*%KU-aQmXctU<@*hh;{0{&7QxkuJ;ov%i{(%3NXDEJW`dv8xCl`|Ff05DuPVu|+ z@J|Y4=|3p`Dn$Gp{=4MnPq?!DAMn2kaDK=CJ|g-jeqZ@t#s7PJ^mp*@9f?1|r$&E( zf9qEK&hUGH`zJ%5$sY`V3wM9V|2>fS6AcIFWex}TA7RDs@V__ie}&Uo{RRF Date: Mon, 24 Nov 2025 14:46:58 -0800 Subject: [PATCH 03/13] moved legacy code --- 03_drug_biomarkers.Rmd | 1265 ---------------------------------------- 1 file changed, 1265 deletions(-) delete mode 100644 03_drug_biomarkers.Rmd diff --git a/03_drug_biomarkers.Rmd b/03_drug_biomarkers.Rmd deleted file mode 100644 index ae9d97c..0000000 --- a/03_drug_biomarkers.Rmd +++ /dev/null @@ -1,1265 +0,0 @@ ---- -title: "Evaluate drug and omics data for biomarker assessment" -author: "Sara gosline" -date: "2025-03-13" -output: html_document ---- - -This document is designed to be a working document where we can compare approaches to evaluate biomarkers of drug response across patient samples. We are collecting three types of data modalities: 1. RNA Sequencing 2. Global Proteomics 3. Phospho proteomics - -We also have drug sensitivity data (single dose viability, some curves) for many drugs. The question to ask is which molecules can predict drug response across patients? How robust/extendable is this? - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -library(synapser) -library(ggplot2) -library(dplyr) -library(tidyr) - -``` - -# Pull processed files from previous markdowns - -We have already run the previous scripts and stored data on Synapse - -```{r pull files} -Sys.setenv() - -source("cNF_helper_code.R") -traceback() - -source('cNF_helper_code.R') -##read in drug code -fits <- readr::read_tsv(synGet('syn69947322')$path) - - -##read in proteomic data -glong <- readr::read_csv(synGet('syn70078416')$path) -plong <- readr::read_csv(synGet('syn70078415')$path) - - -##read in transcrniptomic data -#TODO: process transcritpomic data into long format - - -``` - -## Format protein data to collect correlation values - -Do simple correlations to identify putative trends in the data. - -Get most efficacious, variable, and heatmap - -```{r} -# ensure an output folder - -outdir <- "figs" -if (!dir.exists(outdir)) dir.create(outdir, recursive = TRUE) - -shared <- intersect(fits$improve_sample_id, glong$Specimen) -message(sprintf("Found %d shared samples from %d drug experiments and %d proteomic experiments", -length(shared), length(unique(fits$improve_sample_id)), length(unique(glong$Specimen)))) - -glob_dat <- glong |> -ungroup() |> -subset(Specimen %in% shared) |> -dplyr::select(Specimen, Gene, correctedAbundance) |> -tidyr::pivot_wider( -names_from = "Gene", -values_from = "correctedAbundance", -values_fill = 0, -values_fn = mean -) |> -tibble::column_to_rownames("Specimen") - -phos_dat <- plong |> -ungroup() |> -subset(Specimen %in% shared) |> -dplyr::select(Specimen, site, correctedAbundance) |> -tidyr::pivot_wider( -names_from = "site", -values_from = "correctedAbundance", -values_fill = 0, -values_fn = mean -) |> -tibble::column_to_rownames("Specimen") - -## drug data to matrix here - -drug_dat <- fits |> -subset(dose_response_metric == "uM_viability") |> -dplyr::select(improve_sample_id, improve_drug_id, dose_response_value) |> -tidyr::pivot_wider( -names_from = "improve_drug_id", -values_from = "dose_response_value", -values_fn = mean -) |> -tibble::column_to_rownames("improve_sample_id") - -## summarize drugs - -drug_counts <- fits |> -subset(dose_response_metric == "uM_viability") |> -group_by(improve_drug_id) |> -distinct() |> -summarize( -meanResponse = mean(dose_response_value, na.rm = TRUE), -nMeasured = n_distinct(improve_sample_id), -variability = sd(dose_response_value, na.rm = TRUE), -.groups = "drop" -) - -# -------- Plot 1: most efficacious -------- - -p1 <- drug_counts |> -arrange(desc(meanResponse)) |> -subset(meanResponse < 0.5) |> -ggplot(aes(y = meanResponse, x = improve_drug_id, colour = nMeasured, size = variability)) + -geom_point() + -theme_minimal() + -theme( -axis.text.x = element_text(angle = 45, hjust = 1) # tilt x labels -) + -labs(title = "Most efficacious drugs", -y = "Mean cell viability (fraction)", -x = "Drug") - -ggsave(file.path("figs/most_efficacious.pdf"), p1, width = 12, height = 8, dpi = 300) - -# -------- Plot 2: most variable -------- - -p2 <- drug_counts |> -arrange(desc(variability)) |> -subset(variability > 0.15) |> -as.data.frame() |> -ggplot(aes(y = meanResponse, x = improve_drug_id, colour = nMeasured, size = variability)) + -geom_point() + -theme_minimal() + -theme( -axis.text.x = element_text(angle = 45, hjust = 1) # tilt x labels -) + -labs(title = "Most variable drugs", -y = "Mean cell viability (fraction)", -x = "Drug") - -ggsave(file.path("figs/most_variable.pdf"), p2, width = 12, height = 8, dpi = 300) - -# -------- Plot 3: heatmap of complete-measurement drugs -------- - -fulldrugs <- drug_counts |> -subset(nMeasured == nrow(drug_dat)) - -# Save large, rotate column labels, and shrink font to avoid overlap - -# pheatmap can write directly to a file via the `filename` arg. - -pheatmap::pheatmap( -as.matrix(drug_dat[, fulldrugs$improve_drug_id, drop = FALSE]), -filename = file.path(outdir, "drug_heatmap_large.pdf"), -width = 28, # inches: large so column labels are readable -height = 16, -dpi = 300, -angle_col = 45, # rotate column (drug) labels -fontsize_col = 6, # smaller font for many drugs -cluster_rows = TRUE, -cluster_cols = TRUE, -show_rownames = TRUE, -show_colnames = TRUE -) - -# Optional: also save a PDF (great for vector zooming) - -pheatmap::pheatmap( -as.matrix(drug_dat[, fulldrugs$improve_drug_id, drop = FALSE]), -filename = file.path("figs/drug_heatmap_large.pdf"), -width = 28, -height = 16, -angle_col = 45, -fontsize_col = 6, -cluster_rows = TRUE, -cluster_cols = TRUE, -show_rownames = TRUE, -show_colnames = TRUE -) - -# ---- PRINT plots in the document as well ---- - -print(p1) -print(p2) - -# Print the heatmap to the document (no filename draws it) - -pheatmap::pheatmap( -as.matrix(drug_dat[, fulldrugs$improve_drug_id, drop = FALSE]), -angle_col = 45, -fontsize_col = 6, -cluster_rows = TRUE, -cluster_cols = TRUE, -show_rownames = TRUE, -show_colnames = TRUE -) - -``` - - -# Basic correlation tests - -Can we simply find and rank proteins/psites/transcripts by correlation and do enrichment? - -We can define a simple test to correlate features and drugs and assess significance and correct: - -```{r correlation tests, warning=FALSE, error=FALSE, message = FALSE} - -#this function computes correlations between all columns for each drug/feature matrix, rows are the sample identifiers -#also coputes significance -computeCors <- function(drug_dat,feat_dat,shared){ - - cres <- cor(drug_dat[shared,],feat_dat[shared,],use='pairwise.complete.obs',method='spearman') |> - as.data.frame() |> - tibble::rownames_to_column('drug')|> - tidyr::pivot_longer(cols=2:(1+ncol(feat_dat)),names_to='gene',values_to='cor') |> - arrange(desc(cor)) - - ##now lets try to get significance - csig <- do.call(rbind,lapply(colnames(drug_dat),function(x){ - do.call(rbind,lapply(colnames(feat_dat),function(y){ - pval <- 1.0 - try(pval <- cor.test(drug_dat[shared,x], - feat_dat[shared,y], - use = 'pairwise.complete.obs', - method = 'spearman')$p.value,silent = TRUE) - - return(c(corp = pval,drug = x,gene = y)) - })) |> - as.data.frame() |> - mutate(fdr=p.adjust(unlist(corp),method='fdr')) - })) |> - as.data.frame() |> - mutate(drug = unlist(drug)) |> - mutate(gene = unlist(gene)) - - fullcors <- cres|>left_join(data.frame(csig)) |> - mutate(direction=ifelse(cor<0,'neg','pos')) - - return(fullcors) -} - -``` - -Now that we have a function we can compute correlations of each data type. - -```{r compute feature cors, warning=FALSE, error=FALSE, message = FALSE} - -gcor <- computeCors(drug_dat[,fulldrugs$improve_drug_id],glob_dat,shared) |> - mutate(data='proteins') -pcor <- computeCors(drug_dat[,fulldrugs$improve_drug_id],phos_dat,shared) |> - mutate(data = 'phosphosites') - -allcor <- rbind(gcor,pcor) - -corsummary<-allcor |> subset(fdr<0.25) |> - group_by(drug,data,direction) |> - summarize(features=n(),meanCor=mean(cor)) - - -p_features <- corsummary |> - subset(features > 1) |> - ggplot(aes(x = drug,y = features,fill = direction)) + - facet_grid(data~.) + - geom_bar(position='dodge',stat='identity') + - theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) - -ggsave(filename = file.path("figs/cor_features_by_drug.pdf"), - plot = p_features, width = 12, height = 6, units = "in") - -corsummary |> - arrange(desc(features)) |> - subset(features > 100) |> - dplyr::select(drug,data,direction,features,meanCor) - - - -``` - -Now we have the correlation values. what do we do with them? -## Correlation based enrichment -Do not run this everytime - it is extremely slow, so its setup to run once and save the data. The next steps load this data. -```{r functional enrichment} -# === Direction-aware leapR enrichment: run "top" (up) and "bottom" (down) separately === -# Requires: glob_dat (samples x proteins), phos_dat (samples x phosphosites), fits (drug responses) -# Outputs: -# prot_enrich[[drug]]$top / $bottom -# phos_enrich[[drug]]$top / $bottom -# Optional CSVs in folder "leapR_top_paths/dir_split/" - -library(dplyr) -library(tidyr) -library(stringr) -library(SummarizedExperiment) -library(leapR) - -# ---- choose drugs to run (use your two, or set to a larger list) ---- -# target_drugs <- c("THZ1", "Onalespib") -target_drugs <- unique(fits$improve_drug_id) - -# ---- genesets ---- -data(msigdb); geneset_db <- msigdb -data(kinasesubstrates) - -# ---- helpers ---- -extract_gene_from_site <- function(site_id) { - if (is.na(site_id) || site_id == "") return(NA_character_) - g <- str_split(as.character(site_id), "[:_\\-\\.]")[[1]][1] - toupper(stringr::str_extract(g, "^[A-Za-z0-9]+")) -} - -# Correlate response vector vs each feature column (Spearman) -col_spearman <- function(vec, mat) { - shared <- intersect(names(vec), rownames(mat)) - if (length(shared) < 3) return(setNames(rep(NA_real_, ncol(mat)), colnames(mat))) - v <- vec[shared] - m <- as.matrix(mat[shared, , drop = FALSE]) - apply(m, 2, function(col) { - if (all(is.na(col))) return(NA_real_) - if (sd(col, na.rm = TRUE) == 0 || sd(v, na.rm = TRUE) == 0) return(NA_real_) - suppressWarnings(cor(v, col, method = "spearman", use = "pairwise.complete.obs")) - }) -} - -# Build one-column SE; column name must match primary_columns -build_se_from_corvec <- function(cor_named_vec, features_all, col_label, map_to_gene = NULL, assay_label = "proteomics") { - v <- rep(NA_real_, length(features_all)); names(v) <- features_all - common <- intersect(names(cor_named_vec), features_all) - v[common] <- cor_named_vec[common] - mat <- matrix(v, nrow = length(v), ncol = 1, dimnames = list(features_all, col_label)) - rd <- DataFrame(feature_id = features_all) - rd$hgnc_id <- if (is.null(map_to_gene)) features_all else map_to_gene[features_all] - se <- SummarizedExperiment(assays = list(values = mat), rowData = rd, colData = DataFrame(sample = col_label)) - assayNames(se) <- assay_label - se -} - -safe_leapr <- function(...) tryCatch(leapR::leapR(...), error = function(e) { message("[leapR] ", conditionMessage(e)); NULL }) - -# ---- feature and mapping vectors ---- -prot_features <- colnames(glob_dat) -phos_features <- colnames(phos_dat) -phos_to_gene <- setNames(vapply(phos_features, extract_gene_from_site, FUN.VALUE = character(1)), - phos_features) - -# ---- results containers ---- -prot_enrich <- list() -phos_enrich <- list() - -# optional: write CSVs? -write_csvs <- TRUE -outdir <- "leapR_top_paths/dir_split" -if (write_csvs && !dir.exists(outdir)) dir.create(outdir, recursive = TRUE) - -# ---- per-drug workflow ---- -for (drug in target_drugs) { - message("=== ", drug, " ===") - # build response vector (mean per sample if repeats) - dv <- fits %>% - filter(improve_drug_id == !!drug, dose_response_metric == "uM_viability") %>% - group_by(improve_sample_id) %>% summarize(resp = mean(dose_response_value, na.rm = TRUE), .groups = "drop") - if (nrow(dv) == 0) { message(" no response rows; skipping"); next } - dv_vec <- setNames(dv$resp, dv$improve_sample_id) - - # correlations - prot_cor <- col_spearman(dv_vec, glob_dat) - phos_cor <- col_spearman(dv_vec, phos_dat) - - # split by sign - prot_pos <- prot_cor[!is.na(prot_cor) & prot_cor > 0] - prot_neg <- prot_cor[!is.na(prot_cor) & prot_cor < 0] - phos_pos <- phos_cor[!is.na(phos_cor) & phos_cor > 0] - phos_neg <- phos_cor[!is.na(phos_cor) & phos_cor < 0] - - # ---- global: TOP (positives as-is), BOTTOM (negatives flipped so they rank to the top) ---- - prot_enrich[[drug]] <- list(top = NULL, bottom = NULL) - - # TOP - if (length(prot_pos) >= 5) { - se_prot_top <- build_se_from_corvec(prot_pos, prot_features, col_label = paste0(drug, "_TOP"), assay_label = "proteomics") - prot_top <- safe_leapr(geneset = geneset_db, enrichment_method = "enrichment_in_order", - eset = se_prot_top, assay_name = "proteomics", - primary_columns = paste0(drug, "_TOP")) - prot_enrich[[drug]]$top <- prot_top - if (write_csvs && !is.null(prot_top)) { - write.csv(as.data.frame(prot_top), file = file.path(outdir, paste0(drug, "_global_TOP.csv"))) - } - } else message(" PROT top: too few positive features (", length(prot_pos), ")") - - # BOTTOM (flip sign so more negative = larger positive rank) - if (length(prot_neg) >= 5) { - se_prot_bot <- build_se_from_corvec(-prot_neg, prot_features, col_label = paste0(drug, "_BOTTOM"), assay_label = "proteomics") - prot_bot <- safe_leapr(geneset = geneset_db, enrichment_method = "enrichment_in_order", - eset = se_prot_bot, assay_name = "proteomics", - primary_columns = paste0(drug, "_BOTTOM")) - prot_enrich[[drug]]$bottom <- prot_bot - if (write_csvs && !is.null(prot_bot)) { - write.csv(as.data.frame(prot_bot), file = file.path(outdir, paste0(drug, "_global_BOTTOM.csv"))) - } - } else message(" PROT bottom: too few negative features (", length(prot_neg), ")") - - # ---- PHOSPHO: TOP/BOTTOM for pathways (gene mapping via hgnc_id) ---- - phos_enrich[[drug]] <- list(top = NULL, bottom = NULL) - - # TOP - if (length(phos_pos) >= 5) { - se_phos_top <- build_se_from_corvec(phos_pos, phos_features, col_label = paste0(drug, "_TOP"), - map_to_gene = phos_to_gene, assay_label = "phosphoproteomics") - phos_top <- safe_leapr(geneset = geneset_db, enrichment_method = "enrichment_in_order", - eset = se_phos_top, assay_name = "phosphoproteomics", - primary_columns = paste0(drug, "_TOP"), id_column = "hgnc_id") - phos_enrich[[drug]]$top <- phos_top - if (write_csvs && !is.null(phos_top)) { - write.csv(as.data.frame(phos_top), file = file.path(outdir, paste0(drug, "_phospho_TOP.csv"))) - } - } else message(" PHOS top: too few positive features (", length(phos_pos), ")") - - # BOTTOM (flip) - if (length(phos_neg) >= 5) { - se_phos_bot <- build_se_from_corvec(-phos_neg, phos_features, col_label = paste0(drug, "_BOTTOM"), - map_to_gene = phos_to_gene, assay_label = "phosphoproteomics") - phos_bot <- safe_leapr(geneset = geneset_db, enrichment_method = "enrichment_in_order", - eset = se_phos_bot, assay_name = "phosphoproteomics", - primary_columns = paste0(drug, "_BOTTOM"), id_column = "hgnc_id") - phos_enrich[[drug]]$bottom <- phos_bot - if (write_csvs && !is.null(phos_bot)) { - write.csv(as.data.frame(phos_bot), file = file.path(outdir, paste0(drug, "_phospho_BOTTOM.csv"))) - } - } else message(" PHOS bottom: too few negative features (", length(phos_neg), ")") - -} - -# Save all direction-split results for later reuse -save(prot_enrich, phos_enrich, file = "leapR_enrichment_direction_split.Rdata") - -message("Finished direction-aware enrichment. Results in lists prot_enrich / phos_enrich, and CSVs (if enabled).") - - -``` - - - -For each drug, how many terms do we see active? how many kinases? -```{r functional enrichment} -# ==== Load saved enrichment & build summaries (no list-casts, no count()) ==== -library(dplyr) -library(tidyr) -library(purrr) -library(tibble) -library(ggplot2) -library(forcats) -library(stringr) -library(scales) - -# Always load the precomputed enrichment lists here -load("leapR_enrichment_direction_split.Rdata") -if (!exists("prot_enrich")) stop("prot_enrich not found in leapR_enrichment_direction_split.Rdata") -if (!exists("phos_enrich")) stop("phos_enrich not found in leapR_enrichment_direction_split.Rdata") - -alpha <- 0.05 -topN <- 15 # <<< top 15 -dirs <- c("resistant","sensitive") - -# ---------- helpers ---------- -pick_pcol <- function(df) { - cols <- colnames(df) - if ("BH_pvalue" %in% cols) return(list(kind="adj", col="BH_pvalue")) - if ("SignedBH_pvalue" %in% cols) return(list(kind="adj", col="SignedBH_pvalue")) - if ("adj.P.Val" %in% cols) return(list(kind="adj", col="adj.P.Val")) - if ("padj" %in% cols) return(list(kind="adj", col="padj")) - if ("pvalue" %in% cols) return(list(kind="raw", col="pvalue")) - if ("P.Value" %in% cols) return(list(kind="raw", col="P.Value")) - NULL -} - -extract_term_col <- function(df) { - cands <- c("term","Term","pathway","Pathway","set","Set","geneset","gene_set","Category") - hit <- cands[cands %in% names(df)] - if (length(hit)) hit[[1]] else NULL -} - -tidy_one_result <- function(x) { - if (is.null(x)) return(tibble(pathway = character(), adj_p = numeric())) - df <- as.data.frame(x) - if (!nrow(df)) return(tibble(pathway = character(), adj_p = numeric())) - - term_col <- extract_term_col(df) - if (is.null(term_col)) { - df <- tibble::rownames_to_column(df, "pathway") - } else { - df <- dplyr::mutate(df, pathway = .data[[term_col]]) - } - df$pathway <- as.character(df$pathway) - - pk <- pick_pcol(df) - if (is.null(pk)) return(tibble(pathway = character(), adj_p = numeric())) - adj <- if (pk$kind == "adj") df[[pk$col]] else p.adjust(df[[pk$col]], method = "BH") - - tibble(pathway = df$pathway, adj_p = as.numeric(adj)) |> - filter(is.finite(adj_p), !is.na(adj_p)) -} - -flatten_by_direction <- function(lst, omic_label) { - if (!length(lst)) return(tibble()) - purrr::imap_dfr(lst, function(two, drug) { - bind_rows( - tidy_one_result(two$top) |> mutate(direction = "resistant"), - tidy_one_result(two$bottom) |> mutate(direction = "sensitive") - ) |> - mutate(drug = as.character(drug), omic = omic_label) - }) -} - -# ---------- long-format enrichment and significance filter ---------- -prot_long <- flatten_by_direction(prot_enrich, "global") -phos_long <- flatten_by_direction(phos_enrich, "phospho") -enrich_long <- bind_rows(prot_long, phos_long) |> as_tibble() -stopifnot(all(c("pathway","adj_p","direction","drug","omic") %in% names(enrich_long))) - -enrich_sig <- enrich_long |> - filter(is.finite(adj_p), !is.na(adj_p), adj_p < alpha) - -if (nrow(enrich_sig) == 0) { - message("No significant pathways at FDR < ", alpha, ".") - pathway_summary <- tibble() - drug_counts <- tibble() -} else { - pathway_summary <- enrich_sig |> - group_by(omic, direction, pathway) |> - summarise(n_drugs = n_distinct(drug), .groups = "drop") |> - arrange(desc(n_drugs)) - - all_drugs <- sort(unique(enrich_long$drug)) - drug_counts <- enrich_sig |> - group_by(drug, direction) |> - summarise(n_pathways = n_distinct(pathway), .groups = "drop") |> - complete(drug = all_drugs, direction = dirs, fill = list(n_pathways = 0L)) |> - arrange(drug, direction) - - # ---------- summary figures ---------- - dir.create("figs", showWarnings = FALSE) - - reorder_within <- function(x, by, within, sep = "___") { - x2 <- paste(x, within, sep = sep); stats::reorder(x2, by) - } - scale_y_reordered_wrap <- function(width = 32, sep = "___") { - ggplot2::scale_y_discrete( - labels = function(x) stringr::str_wrap(gsub(paste0(sep, ".*$"), "", x), width = width) - ) - } - - pathway_summary_top <- pathway_summary |> - group_by(omic, direction) |> - slice_max(order_by = n_drugs, n = topN, with_ties = FALSE) |> - ungroup() |> - mutate(pathway_in_omic = reorder_within(pathway, n_drugs, omic)) - - p_pathways <- ggplot(pathway_summary_top, - aes(y = pathway_in_omic, x = n_drugs, fill = direction)) + - geom_col(position = position_dodge(width = 0.85), width = 0.85) + - facet_wrap(~ omic, scales = "free_y") + - scale_y_reordered_wrap(width = 36) + - scale_x_continuous(expand = expansion(mult = c(0, 0.05))) + - labs(title = paste0("Top ", topN, " pathways enriched across drugs"), - y = "Pathway", x = "# Drugs") + - theme_minimal(base_size = 11) + - theme( - legend.position = "top", - strip.text = element_text(face = "bold"), - axis.text.y = element_text(size = 7), - panel.grid.major.x = element_blank(), - plot.margin = margin(5.5, 30, 5.5, 5.5) - ) + - coord_cartesian(clip = "off") - - # --- PDF saves (summary figs) --- - ggsave("figs/pathways_across_drugs_top15.pdf", p_pathways, - width = 12, height = 10, units = "in", device = cairo_pdf) - print(p_pathways) - - drug_counts_full <- drug_counts |> - group_by(drug) |> - mutate(total = sum(n_pathways)) |> - ungroup() |> - mutate(drug = forcats::fct_reorder(drug, total)) - - p_counts <- ggplot(drug_counts_full, aes(x = drug, y = n_pathways, fill = direction)) + - geom_col(position = position_dodge(width = 0.9), width = 0.85) + - labs(title = "Number of enriched pathways per drug", - x = "Drug", y = "# Pathways") + - theme_minimal(base_size = 11) + - theme( - legend.position = "top", - axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 6), - panel.grid.major.x = element_blank() - ) - - ggsave("figs/enriched_pathways_per_drug_wide.pdf", p_counts, - width = 18, height = 7, units = "in", device = cairo_pdf) - print(p_counts) -} - -# ---------- Top-2 most efficacious & most variable drugs ---------- -top2_efficacious <- character(0) -top2_variable <- character(0) - -if (exists("fits")) { - eff_tbl <- fits |> - filter(dose_response_metric == "uM_viability") |> - group_by(improve_drug_id) |> - summarise( - meanResponse = mean(dose_response_value, na.rm = TRUE), - variability = sd(dose_response_value, na.rm = TRUE), - nMeasured = dplyr::n_distinct(improve_sample_id), - .groups = "drop" - ) - top2_efficacious <- eff_tbl |> - arrange(meanResponse, desc(nMeasured)) |> - slice_head(n = 2) |> - pull(improve_drug_id) - top2_variable <- eff_tbl |> - arrange(desc(variability), desc(nMeasured)) |> - slice_head(n = 2) |> - pull(improve_drug_id) -} else { - some <- unique(enrich_long$drug) - top2_efficacious <- head(some, 2) - top2_variable <- head(rev(some), 2) -} - -# ---- Force-include Onalespib in top_interest (case-insensitive) ---- -ona_matches <- unique(enrich_long$drug[grepl("^onalespib$", enrich_long$drug, ignore.case = TRUE)]) -if (length(ona_matches) == 0) ona_matches <- "Onalespib" - -top_interest <- unique(c(top2_efficacious, top2_variable, ona_matches)) -message("Top-2 most efficacious (lowest mean viability): ", paste(top2_efficacious, collapse = ", ")) -message("Top-2 most variable (highest SD): ", paste(top2_variable, collapse = ", ")) -message("Force-included: ", paste(ona_matches, collapse = ", ")) - -# ---------- per-drug pathway barplots: always top 15, star-annotate significance ---------- -pick_pcol_plot <- function(df) { - cols <- colnames(df) - if ("BH_pvalue" %in% cols) return("BH_pvalue") - if ("SignedBH_pvalue" %in% cols) return("SignedBH_pvalue") - if ("adj.P.Val" %in% cols) return("adj.P.Val") - if ("padj" %in% cols) return("padj") - if ("pvalue" %in% cols) return("pvalue") - if ("P.Value" %in% cols) return("P.Value") - NA_character_ -} - -sig_stars <- function(p) dplyr::case_when( - is.na(p) ~ "", - p < 0.001 ~ "***", - p < 0.01 ~ "**", - p < 0.05 ~ "*", - TRUE ~ "" -) - -prep_plot_df <- function(res_df, n = 15) { - if (is.null(res_df)) return(NULL) - df <- as.data.frame(res_df); if (!nrow(df)) return(NULL) - if (!("feature" %in% names(df))) df <- tibble::rownames_to_column(df, "feature") - col <- pick_pcol_plot(df); if (is.na(col)) return(NULL) - - # Always compute BH adj p; then take top 15 by smallest adj_p (no significance filter) - adj_p <- if (col %in% c("pvalue","P.Value")) p.adjust(df[[col]], method = "BH") else df[[col]] - - df |> - mutate( - adj_p = as.numeric(adj_p), - score = -log10(pmax(adj_p, 1e-300)), - stars = sig_stars(adj_p), - signif = !is.na(adj_p) & adj_p < 0.05, - feature = stringr::str_wrap(as.character(feature), width = 40) - ) |> - filter(is.finite(adj_p)) |> - arrange(adj_p, desc(score)) |> - slice_head(n = n) -} - -plot_bar <- function(df_plot, title_text, label_type) { - if (is.null(df_plot) || !nrow(df_plot)) return(NULL) - lbl <- ifelse(label_type == "top", "Resistant pathways", "Sensitive pathways") - fillc <- ifelse(label_type == "top", "#D7191C", "#2C7BB6") - star_pad <- 0.15 - - ggplot(df_plot, aes(x = reorder(feature, score), y = score)) + - geom_col(aes(alpha = signif), fill = fillc, width = 0.85) + - scale_alpha_manual(values = c(`FALSE` = 0.5, `TRUE` = 1), guide = "none") + - geom_text(aes(y = score + star_pad, label = stars), - size = 3, hjust = 0) + - coord_flip(clip = "off") + - scale_y_continuous(expand = expansion(mult = c(0, 0.15))) + - labs(title = paste0(title_text, ", ", lbl), - x = NULL, y = expression(-log[10]("FDR"))) + - theme_minimal(base_size = 9) + - theme( - plot.title = element_text(size = 11), - axis.text.y = element_text(size = 7), - axis.text.x = element_text(size = 8), - plot.margin = margin(5.5, 30, 5.5, 5.5) - ) -} - -safelabel <- function(x) gsub("[^A-Za-z0-9_.-]", "_", x) - -dir.create("figs", showWarnings = FALSE) -for (drug in top_interest) { - pt <- if (drug %in% names(prot_enrich)) prot_enrich[[drug]]$top else NULL - pb <- if (drug %in% names(prot_enrich)) prot_enrich[[drug]]$bottom else NULL - ft <- if (drug %in% names(phos_enrich)) phos_enrich[[drug]]$top else NULL - fb <- if (drug %in% names(phos_enrich)) phos_enrich[[drug]]$bottom else NULL - - p1 <- plot_bar(prep_plot_df(pt, n = 15), paste0(drug, " — global"), "top") - p2 <- plot_bar(prep_plot_df(pb, n = 15), paste0(drug, " — global"), "bottom") - p3 <- plot_bar(prep_plot_df(ft, n = 15), paste0(drug, " — Phospho"), "top") - p4 <- plot_bar(prep_plot_df(fb, n = 15), paste0(drug, " — Phospho"), "bottom") - - if (!is.null(p1)) { - ggsave(file.path("figs", paste0("pathways_", safelabel(drug), "_global_resistant_top15.pdf")), - p1, width = 7, height = 5, device = cairo_pdf); print(p1) - } - if (!is.null(p2)) { - ggsave(file.path("figs", paste0("pathways_", safelabel(drug), "_global_sensitive_top15.pdf")), - p2, width = 7, height = 5, device = cairo_pdf); print(p2) - } - if (!is.null(p3)) { - ggsave(file.path("figs", paste0("pathways_", safelabel(drug), "_phospho_resistant_top15.pdf")), - p3, width = 7, height = 5, device = cairo_pdf); print(p3) - } - if (!is.null(p4)) { - ggsave(file.path("figs", paste0("pathways_", safelabel(drug), "_phospho_sensitive_top15.pdf")), - p4, width = 7, height = 5, device = cairo_pdf); print(p4) - } -} - - - -``` - - - - -# Print siginficant results for all drugs if we want. -```{r} -library(dplyr) -library(ggplot2) -library(tibble) -library(rlang) - -alpha <- 0.05 # significance threshold -top_n_to_show <- 15 - -# Prefer adjusted p if available; fall back to raw and adjust per-run -pick_pcol <- function(df) { - cols <- colnames(df) - if ("BH_pvalue" %in% cols) return(list(kind="adj", col="BH_pvalue")) - if ("SignedBH_pvalue" %in% cols) return(list(kind="adj", col="SignedBH_pvalue")) - if ("pvalue" %in% cols) return(list(kind="raw", col="pvalue")) - return(NULL) -} - -prep_plot_df <- function(res_df, n = top_n_to_show) { - if (is.null(res_df)) return(NULL) - df <- as.data.frame(res_df) - if (!nrow(df)) return(NULL) - - pick <- pick_pcol(df) - if (is.null(pick)) return(NULL) - - # unify to adj p - if (pick$kind == "adj") { - df <- df %>% mutate(adj_p = !!sym(pick$col)) - } else { # raw p → adjust within this run - df <- df %>% mutate(adj_p = p.adjust(!!sym(pick$col), method = "BH")) - } - - df %>% - rownames_to_column("feature") %>% - arrange(adj_p) %>% - # keep only significant ones; if none, return empty (caller will message) - filter(is.finite(adj_p), !is.na(adj_p), adj_p < alpha) %>% - head(n) %>% - mutate(score = -log10(pmax(adj_p, 1e-300))) -} - -plot_bar <- function(df_plot, title_text, label_type) { - if (is.null(df_plot) || !nrow(df_plot)) { - message(" No significant pathways for ", title_text, " (FDR<", alpha, ").") - return(NULL) - } - # Correct labels & colors for uM_viability convention: - # TOP -> resistant (red), BOTTOM -> sensitive (blue) - lbl <- ifelse(label_type == "top", "Resistant pathways", "Sensitive pathways") - fillc <- ifelse(label_type == "top", "#D7191C", "#2C7BB6") - - p <- ggplot(df_plot, aes(x = reorder(feature, score), y = score)) + - geom_col(fill = fillc) + - coord_flip() + - labs(title = paste0(title_text, ", ", lbl), - x = NULL, y = expression(-log[10]("FDR"))) + - theme_minimal(base_size = 9) + - theme( - plot.title = element_text(size = 11), - axis.text.y = element_text(size = 6), - axis.text.x = element_text(size = 8) - ) - print(p); invisible(p) -} - -plot_drug_panels <- function(drug) { - message("\n=== ", drug, " ===") - - # global - pt <- if (drug %in% names(prot_enrich)) prot_enrich[[drug]]$top else NULL - pb <- if (drug %in% names(prot_enrich)) prot_enrich[[drug]]$bottom else NULL - plot_bar(prep_plot_df(pt), paste0(drug, ", Global Proteomics"), "top") - plot_bar(prep_plot_df(pb), paste0(drug, ", Global Proteomics"), "bottom") - - # Phospho pathways - ft <- if (drug %in% names(phos_enrich)) phos_enrich[[drug]]$top else NULL - fb <- if (drug %in% names(phos_enrich)) phos_enrich[[drug]]$bottom else NULL - plot_bar(prep_plot_df(ft), paste0(drug, ", Phosphoproteomics"), "top") - plot_bar(prep_plot_df(fb), paste0(drug, ", Phosphoproteomics"), "bottom") -} - - -# --- run for your drugs --- -for (d in target_drugs) plot_drug_panels(d) - -``` - - - - -# Basic drug list -```{r} -drug_counts <- fits %>% - filter(dose_response_metric == "uM_viability") %>% - group_by(improve_drug_id) %>% - summarise( - n_rows = dplyr::n(), # total rows/measurements - n_specimens = n_distinct(improve_sample_id), # unique samples tested - meanResponse = mean(dose_response_value, na.rm = TRUE), - sdResponse = sd(dose_response_value, na.rm = TRUE), - .groups = "drop" - ) %>% - arrange(desc(n_specimens), improve_drug_id) - -# Plain list of drugs + total count -drug_list <- sort(unique(drug_counts$improve_drug_id)) -n_drugs <- length(drug_list) - -message(sprintf("Total unique drugs: %d", n_drugs)) -print((drug_list)) - -``` - - - - - - - - - - - - - - - - - - - - - -## Visualization -How should we visualize? Here is some older code -```{r plot cors, eval=FALSE} - -plotCors <- function(features,druglist,dataType='proteins'){ - ##subset a list of features and drugs and plot those in a graph - require(ggplot2) - if(dataType=='proteins'){ - ptab<-glong|>dplyr::rename(feature='Gene') - }else{ - ptab<-plong|>dplyr::rename(feature='site') - } - dtab<-fits|> - subset(dose_response_metric=='uM_viability')|> - dplyr::rename(Specimen='improve_sample_id',Drug='improve_drug_id')|> - subset(Drug%in%druglist) - - - ftab<-features|>left_join(ptab)|>left_join(dtab)|> - subset(!is.na(Drug)) - - feats <- unique(features$feature) - plots <- lapply(feats,function(x){ - corval <- ftab[ftab$feature==x,'cor'] - #corval <- ftab[ftab$feature==x,'pCor'] - - ftab|>subset(feature==x)|> - ggplot(aes(x=correctedAbundance,y=dose_response_value, - col=Patient,size=1))+ - geom_point()+ - facet_grid(~Drug)+ - ggtitle(paste(x,'Drug correlation'))+ - scale_color_manual(values=pcols) - }) - cowplot::plot_grid(plotlist= plots,ncol=2) - -} - -druglist<-c('Onalespib') -features<-subset(allcor,drug%in%druglist)|> - subset(fdr<0.25)|> - subset(abs(cor)>0.7)|> - subset(data=='proteins')|> - arrange(desc(abs(cor))) - -plotCors(rename(features[1:10,],feature='gene'),druglist) - -ggsave('onalespibFDR0.25Cors.pdf',height=20) - -``` - -# Random forest predictor - -Here we try to use random forest to extract predictive features. First we need to assess if the model can accurately predict drug response from the data. From those predictive models, we can extract features/biomarkers. - -First we build the data frames needed - I've included cohort as a covariate but may remove it. - -```{r random forest} -## separate out cohorts for prediction -cohorts <- meta |> - select(Specimen,cohort) |> - distinct() |> - tibble::column_to_rownames('Specimen') - -##for each drug, build model of cohort + protein ~ drug response -#removed cohort for now -gdf <- as.data.frame(glob_dat)#|>mutate(cohort=cohorts[rownames(glob_dat),'cohort']) -gnas <- which(apply(gdf,2,function(x) any(is.na(x)))) - -pdf <- as.data.frame(phos_dat)#|> mutate(cohort=cohorts[rownames(phos_dat),'cohort']) -pnas <- which(apply(pdf,2,function(x) any(is.na(x)))) - -mdf <- meta[-c(2,5),]|> ##have duplication here - tibble::column_to_rownames('Specimen') - - -``` - -Now we can loop through every drug, build model, and assess accuracy. - -```{r evaluate predictivty} - -#trying ou tthis function tos ee how it goes -rfFeatures <- function(drug_dat,fdf, mdf){ - complete_drugs <- which(apply(drug_dat,2, - function(x) length(which(!is.na(x)))==length(x))) - print(paste("Evaluating random forest for ",length(complete_drugs),'drugs')) - all_preds <- do.call(rbind,lapply(names(complete_drugs),function(drug){ - - dg <- fdf#[,-gnas] - - ##create the metadata df with the drug of interest - dmdf <- mdf |> - mutate(drug=drug_dat[rownames(mdf),drug]) - - rf <- randomForest::randomForest(x=dg, - y=drug_dat[rownames(dg),drug], - importance=TRUE,ntree=500) - - im <- randomForest::importance(rf)|> - as.data.frame() |> - mutate(drug=drug) - pord <- intersect(rownames(mdf)[order(drug_dat[rownames(mdf),drug])],rownames(glob_dat)) - - #pheatmap::pheatmap(t(glob_dat[pord, - # rownames(im)]),annotation_col=dmdf, - # cellheight=10,cluster_cols = TRUE) - return(im) - ##what do we return?x - })) - return(all_preds) -} - - -``` - -Now what do we do with the importance features? - -```{r rf processing} - -##get importance for global -gimp <- rfFeatures(drug_dat=drug_dat,fdf=gdf,mdf=mdf) - -##get importance for phospho -pimp <- rfFeatures(drug_dat=drug_dat,fdf=pdf,mdf=mdf) - -``` - - - - - - - - - - - - - - - - - -# Old correlation code, dont run - -now we can visualize correlations - -```{r check out HSP90s, eval=FALSE} - -hsps<-unique(glong$Genes[grep('^HSP',glong$Genes)]) - -cor_hsps<-subset(allcor,gene%in%hsps)|>subset(drug=='Onalespib')|>subset(corp<0.2) -#print(paste('measured',length(hsps),'HSPs in global data of which', nrow(cor_hsps),' are correlated with Onalespib')) - -plotCors(rename(cor_hsps,feature='gene'),c('Onalespib')) - -ggsave('hspCorsOna.pdf',height=nrow(cor_hsps)*3) - -hspp<-unique(plong$site[grep('^HSP',plong$site)]) -cor_hspps<-subset(allcor,gene%in%hspp)|>subset(drug=='Onalespib')|>subset(corp<0.2) -#print(paste('measured',length(hspp),'HSPs in phospho data of which',nrow(cor_hspps),' are correlated with Onalespib')) - -plotCors(rename(cor_hspps,feature='gene'),c('Onalespib'),'phospho') - -ggsave('hspPhosphoCorsOna.pdf',height=nrow(cor_hspps)*3) - - -``` - -### Now lets look only at IC50 values - -There are a few drugs for which we have IC50 values - -```{r check ic50 cors,warning=FALSE,error=FALSE, eval=FALSE} - -ifits<-subset(fits,dose_response_metric=='fit_ic50') - -shared<-intersect(ifits$improve_sample_id,glong$Specimen) -print(paste('Found',length(shared),'shared samples')) - -## a full join might be a challenge, maybe just take two matrices -drug_dat <- ifits|> - dplyr::select(improve_sample_id,improve_drug_id,dose_response_value)|> - tidyr::pivot_wider(names_from='improve_drug_id',values_from='dose_response_value',values_fn=mean)|> - tibble::column_to_rownames('improve_sample_id') - -gres<-cor(drug_dat[shared,],glob_dat[shared,],use='pairwise.complete.obs',method='pearson')|> - as.data.frame()|> - tibble::rownames_to_column('drug')|> - tidyr::pivot_longer(cols=2:(1+ncol(glob_dat)),names_to='gene',values_to='pCor')|> - arrange(desc(pCor)) - -##now lets try to get significance - -gsig<-do.call(rbind,lapply(colnames(drug_dat),function(x){ - do.call(rbind,lapply(colnames(glob_dat),function(y){ - pval<-1.0 - try(pval<-cor.test(drug_dat[shared,x],glob_dat[shared,y],use='pairwise.complete.obs',method='pearson')$p.value,silent=TRUE) - return(c(corp=pval,drug=x,gene=y)) - } - ))|>as.data.frame()|> - mutate(fdr=p.adjust(unlist(corp),method='fdr')) - }))|> - as.data.frame()|> - mutate(drug=unlist(drug))|> - mutate(gene=unlist(gene)) - -fullcors<-gres|>left_join(data.frame(gsig))|>mutate(data='global') - -pres<-cor(drug_dat[shared,],phos_dat[shared,],use='pairwise.complete.obs',method='pearson')|> - as.data.frame()|> - tibble::rownames_to_column('drug')|> - tidyr::pivot_longer(cols=2:(1+ncol(phos_dat)),names_to='site',values_to='pCor')|> - arrange(desc(pCor)) - -##now lets look at correlations - -psig<-do.call(rbind,lapply(colnames(drug_dat),function(x){ - do.call(rbind,lapply(colnames(phos_dat),function(y){ - pval<-1.0 - try(pval<-cor.test(drug_dat[shared,x],phos_dat[shared,y],use='pairwise.complete.obs',method='pearson')$p.value,silent=TRUE) - return(c(corp=pval,drug=x,gene=y)) - } - ))|>as.data.frame()|> - mutate(fdr=p.adjust(unlist(corp),method='fdr')) - }))|>as.data.frame() - -fullpcors<-pres|>rename(gene='site')|>left_join(data.frame(psig))|>mutate(data='phospho') - -#combine all correlations -allcor<-rbind(fullcors,fullpcors)|> - mutate(direction=ifelse(pCor<0,'neg','pos')) - - -##lets count the correlations and plot - -corsummary<-allcor|>subset(fdr<0.1)|> - group_by(drug,data,direction)|> - summarize(features=n(),meanCor=mean(pCor)) - -corsummary|> - #subset(features>1)|> - ggplot(aes(x=data,y=features,fill=drug))+ - facet_grid(~direction)+ - geom_bar(position='dodge',stat='identity') - - - -``` - -Again we have onalespib with numerous significantly correlated proteins, and one phosphosite for digoxin showing up . - -```{r plot individual sites, eval=FALSE} - -druglist<-c('Onalespib') -features<-subset(allcor,drug%in%druglist)|> - subset(fdr<0.05)|> - subset(data=='global') - -plotCors(rename(features,feature='gene'),druglist) - - -druglist<-c('Digoxin') -features<-subset(allcor,drug%in%druglist)|> - subset(fdr<0.1)|> - subset(data=='phospho') - -#plotCors(rename(features,feature='gene'),druglist,data='phospho') - - - -``` - -Now we can check the HSP proteins directly - -```{r HSP correlation, eval=FALSE} -hsps<-unique(glong$Genes[grep('^HSP',glong$Genes)]) - -cor_hsps<-subset(allcor,gene%in%hsps)|>subset(drug=='Onalespib')|>subset(corp<0.05) - -print(paste('measured',length(hsps),'HSPs in global data of which', nrow(cor_hsps),' are correlated with Onalespib')) - -plotCors(rename(cor_hsps,feature='gene'),'Onalespib') - -hspp<-unique(plong$site[grep('^HSP',plong$site)]) -cor_hspps<-subset(allcor,gene%in%hspp)|>subset(drug=='Onalespib')|>subset(corp<0.1) - -print(paste('measured',length(hspp),'HSPs in phospho data of which',nrow(cor_hspps),' are correlated with Onalespib')) -``` - -The IC50 result is similar to the viability. Now we can check AUC - -```{r auc hsp check, error=FALSE, warning=FALSE, eval=FALSE} - -ifits<-subset(fits,dose_response_metric=='auc') - -shared<-intersect(ifits$improve_sample_id,glong$Specimen) -print(paste('Found',length(shared),'shared samples')) - -## a full join might be a challenge, maybe just take two matrices -drug_dat <- ifits|> - dplyr::select(improve_sample_id,improve_drug_id,dose_response_value)|> - tidyr::pivot_wider(names_from='improve_drug_id',values_from='dose_response_value',values_fn=mean)|> - tibble::column_to_rownames('improve_sample_id') - -gres<-cor(drug_dat[shared,],glob_dat[shared,],use='pairwise.complete.obs',method='pearson')|> - as.data.frame()|> - tibble::rownames_to_column('drug')|> - tidyr::pivot_longer(cols=2:(1+ncol(glob_dat)),names_to='gene',values_to='pCor')|> - arrange(desc(pCor)) - -##now lets try to get significance - -gsig<-do.call(rbind,lapply(colnames(drug_dat),function(x){ - do.call(rbind,lapply(colnames(glob_dat),function(y){ - pval<-1.0 - try(pval<-cor.test(drug_dat[shared,x],glob_dat[shared,y],use='pairwise.complete.obs',method='pearson')$p.value,silent=TRUE) - return(c(corp=pval,drug=x,gene=y)) - } - ))|>as.data.frame()|> - mutate(bh_p=p.adjust(unlist(corp),method='BH')) - }))|> - as.data.frame()|> - mutate(drug=unlist(drug))|> - mutate(gene=unlist(gene)) - -fullcors<-gres|>left_join(data.frame(gsig))|>mutate(data='global') - -pres<-cor(drug_dat[shared,],phos_dat[shared,],use='pairwise.complete.obs',method='pearson')|> - as.data.frame()|> - tibble::rownames_to_column('drug')|> - tidyr::pivot_longer(cols=2:(1+ncol(phos_dat)),names_to='site',values_to='pCor')|> - arrange(desc(pCor)) - -##now lets look at correlations - -psig<-do.call(rbind,lapply(colnames(drug_dat),function(x){ - do.call(rbind,lapply(colnames(phos_dat),function(y){ - pval<-1.0 - try(pval<-cor.test(drug_dat[shared,x],phos_dat[shared,y],use='pairwise.complete.obs',method='pearson')$p.value,silent=TRUE) - return(c(corp=pval,drug=x,gene=y)) - } - ))|>as.data.frame()|> - mutate(bh_p=p.adjust(unlist(corp),method='BH')) - }))|>as.data.frame() - -fullpcors<-pres|>rename(gene='site')|>left_join(data.frame(psig))|>mutate(data='phospho') - -#combine all correlations -allcor<-rbind(fullcors,fullpcors)|> - mutate(direction=ifelse(pCor<0,'neg','pos')) - - -##lets count the correlations and plot - -corsummary<-allcor|>subset(bh_p<0.1)|> - group_by(drug,data,direction)|> - summarize(features=n(),meanCor=mean(pCor)) - -corsummary|> - #subset(features>1)|> - ggplot(aes(x=data,y=features,fill=drug))+facet_grid(~direction)+geom_bar(position='dodge',stat='identity') - -print(corsummary) - -``` - -```{r HSP correlation again, eval=FALSE} -hsps<-unique(glong$Genes[grep('^HSP',glong$Genes)]) - -cor_hsps<-subset(allcor,gene%in%hsps)|>s -ubset(drug=='Onalespib')|>subset(corp<0.05) -print(paste('measured',length(hsps),'HSPs in global data of which', nrow(cor_hsps),' are correlated with Onalespib')) -cor_hsps - -plotCors(rename(cor_hsps, feature='gene'),'Onalespib') - -hspp<-unique(plong$site[grep('^HSP',plong$site)]) -cor_hspps<-subset(allcor,gene%in%hspp)|>subset(drug=='Onalespib')|>subset(corp<0.1) -print(paste('measured',length(hspp),'HSPs in phospho data of which',nrow(cor_hspps),' are correlated with Onalespib')) -``` From 6196676b35795d68fa2ffcd1aa62db898d46faee Mon Sep 17 00:00:00 2001 From: Jeremy Date: Mon, 1 Dec 2025 15:20:09 -0800 Subject: [PATCH 04/13] Updated documentation for run_modality() in 02_run_normalize_omics.Rmd --- 02_run_normalize_omics.Rmd | 84 ++++++++++++++++---------------------- 1 file changed, 35 insertions(+), 49 deletions(-) diff --git a/02_run_normalize_omics.Rmd b/02_run_normalize_omics.Rmd index 5c852ec..bf77c93 100644 --- a/02_run_normalize_omics.Rmd +++ b/02_run_normalize_omics.Rmd @@ -26,66 +26,52 @@ source("02_normalize_batchcorrect_omics.R") # Run batch correction / normalization across phospho, global, and rna samples. ```{r} - # --------------------------------------------------------------------------- # run_modality() — quick reference for args & expected batch structure # --------------------------------------------------------------------------- # Args: -# modality : Character. One of "Global", "Phospho", or "rna". -# Controls feature-ID parsing and normalization rules. -# -# batches : List of per-cohort lists describing each input table. -# See “Batch spec” below. -# -# meta : Data frame with sample metadata. Must contain at least: -# - cohort (int or factor; matches batches[[i]]$cohort) -# - Specimen, Patient, Tumor (optional but used if present) -# For RNA, if aliquot joins fail, headers are matched to -# meta$Specimen using a normalized form. Generated by cNF_helper_code.R -# -# syn : An initialized synapser client (e.g., syn <- synapser::synLogin()). -# -# drop_name_substrings: Character vector of substrings. Any sample column whose -# name contains one of these (regex OR) is dropped. -# Use NULL or character(0) to keep all samples. -# -# out_dir : Directory where outputs/QC are written. Created if missing. # -# out_prefix : String used in filenames (e.g., "global", "phospho", "rna"). -# If save_basename is NULL, this is also used as the basename. +# - modality : "global" | "phospho" | "rna" (case-insensitive). +# - batches : list of cohort specs; each has: +# syn_id, cohort, fname_aliquot_index, (optional) value_start_col +# (auto-detects; fallback = 5 for global/phospho, 3 for rna). +# - meta : data.frame joined by (aliquot, cohort). Uses Specimen/Patient/Tumor +# if present. For RNA, falls back to normalized Specimen matching. +# - syn : synapser client (e.g., syn <- synLogin()). +# - drop_name_substrings : character vector; OR-regex to drop sample cols (NULL = keep all). +# - out_dir : output dir (auto-created). +# - out_prefix: filename stem (used unless save_basename set). +# - upload_parent_id : Synapse folder ID for uploads (NULL = no upload). +# - pcols : named colors for Patient in PCA (optional). +# - write_outputs : TRUE = write CSV/PDF (+upload if parent set); FALSE = in-memory only. +# - save_basename : override file stem (else out_prefix). +# - do_batch_correct : TRUE = ComBat by cohort; FALSE = skip (adds *_noBatchCorrect). # -# upload_parent_id : Synapse folder/entity ID to upload written files. -# If NULL, no uploads occur. -# -# pcols : Optional named color vector for plotting (e.g., by patient). -# If NULL, ggplot defaults are used. -# -# write_outputs : Logical. If TRUE, write CSVs/PDFs to out_dir (and upload -# when upload_parent_id is set). If FALSE, nothing is written/ -# uploaded; results are returned in-memory only. -# -# save_basename : Optional string to control the root of output filenames. -# If NULL, falls back to out_prefix. Useful when you want the -# directory name (out_dir) and the file basename to differ. -# -# do_batch_correct : Logical. If TRUE, applies ComBat across cohorts (batch-only). -# If FALSE, skips ComBat; filenames will include “_noBatchCorrect”. -# -# Returns: -# A list with: -# - se_batches : List of per-batch SummarizedExperiment objects (normalized). -# - se_combined : Combined SE after feature intersection. -# - se_corrected : Post-ComBat SE (or the same as combined if do_batch_correct=FALSE). -# - long_pre : Long-format data frame pre-ComBat (finite values only). -# - long_post : Long-format data frame post-ComBat (or pre if no ComBat). -# - pca_df_pre / pca_df_post : Data frames used for PCA scatter plots. -# - plots : ggplot objects for PCA & histograms. -# - files : Paths of any written files (if write_outputs=TRUE). +# Per-modality normalization +# - phospho : 0→NA → drop >50% missing → log2(x+0.01) → per-sample modified z. +# - global : log2(x) → per-sample modified z. +# - rna : drop >50% missing → log2(TPM+1) → per-sample modified z. # +# Returns (list) +# - se_batches : per-batch normalized SEs. +# - se_combined: intersection-features combined SE. +# - se_corrected: post-ComBat SE (NULL if do_batch_correct=FALSE). +# - se_post : SE used “post” (se_corrected or se_combined). +# - did_combat: TRUE/FALSE. +# - long_pre : long table from se_combined (finite only). +# - long_post : long table from se_post. +# - pca_df_pre, pca_df_post : PCA inputs (complete-case features). +# - plots : pre_pca, pre_hist, pca, hist (ggplot). +# - files : if write_outputs=TRUE, $queued = written file paths. # +# Notes +# - ComBat drops samples with NA cohort; requires colData(se)$cohort. +# - Global: splits multi-symbol Genes by ';'. RNA aliquot may be NA (Specimen fallback). +# - Sample headers auto-detected if path-like (*.raw, *.mzML, paths). # --------------------------------------------------------------------------- + # Substrings to drop (These were the protocol optimization samples) drop_subs <- c( "cNF_organoid_DIA_G_02_11Feb25", From 56f25659577aa8c05857220a741fb5d6497a11e8 Mon Sep 17 00:00:00 2001 From: Jeremy Date: Wed, 3 Dec 2025 10:04:23 -0800 Subject: [PATCH 05/13] cleaned up comments --- 02_normalize_batchcorrect_omics.R | 26 ++++++++++++-------------- 03_analyze_modality_correlations.R | 20 ++++++-------------- 2 files changed, 18 insertions(+), 28 deletions(-) diff --git a/02_normalize_batchcorrect_omics.R b/02_normalize_batchcorrect_omics.R index a5dad15..5fe3611 100644 --- a/02_normalize_batchcorrect_omics.R +++ b/02_normalize_batchcorrect_omics.R @@ -17,7 +17,6 @@ suppressPackageStartupMessages({ }) # Small helpers - modified_zscore <- function(x, na.rm = TRUE) { m <- suppressWarnings(stats::median(x, na.rm = na.rm)) md <- suppressWarnings(stats::mad(x, constant = 1, na.rm = na.rm)) @@ -55,7 +54,6 @@ make_dropper <- function(substrings) { } # Functions to clean up irregular names - basename_only <- function(x) sub("^.*[\\\\/]", "", x) basename_no_ext <- function(x) sub("\\.[^.]+$", "", basename_only(x)) @@ -121,7 +119,6 @@ parse_rna_header_triplet <- function(fnames) { } # Functions to get data from Synapse - read_wide_from_synapse <- function(syn, syn_id) { message(" Reading Synapse file: ", syn_id) df <- read.table( @@ -179,8 +176,9 @@ parse_fnames <- function(fnames, aliquot_field_index, cohort) { out } -# -------------------------- Feature ID builders ------------------------------ - +##### +#Feature ID builders +##### build_phospho_ids <- function(df) { lsite <- tolower(df$Residue) paste0(df$`Gene.Names`, "-", df$Residue, df$Site, lsite) @@ -587,9 +585,9 @@ perform_uploads <- function(paths, syn, parent_id) { message("Uploads complete.") } - - -# ------------------------------- Main entry ---------------------------------- +##### +# Main entry +##### # This is how we call the function / pipeline run_modality <- function( @@ -672,7 +670,7 @@ run_modality <- function( } } - # ---- Combine (INTERSECTION) & pre-QC --------------------------------------- + # Combine & pre-QC message("--------------------------------------------------") se_combined <- combine_batches_intersection(se_list) @@ -688,7 +686,7 @@ run_modality <- function( upload_queue <- c(upload_queue, pre_pca_pdf, pre_hist_pdf) } - # ---- Optional ComBat (batch-only) ------------------------------------------ + # ComBat if (isTRUE(do_batch_correct)) { message("--------------------------------------------------") se_post <- combat_by_cohort(se_combined) @@ -702,7 +700,7 @@ run_modality <- function( post_title <- paste0("Combined ", modality, " samples (no ComBat)") } - # ---- Exports ---------------------------------------------------------------- + # Exports message(" Building long tables") long_pre <- se_to_long(se_combined, modality) |> dplyr::filter(is.finite(correctedAbundance)) @@ -716,7 +714,7 @@ run_modality <- function( upload_queue <- c(upload_queue, path_pre, path_post) } - # ---- Post (either ComBat or not) QC ---------------------------------------- + # Post ComBat/QC message(" Post-QC plots (PCA & histogram)") pc_df <- pca_df_present_in_all(se_post) gpca <- plot_pca(pc_df, post_title, pcols = pcols) @@ -729,7 +727,7 @@ run_modality <- function( upload_queue <- c(upload_queue, post_pca_pdf, post_hist_pdf) } - # ---- Pack results for return ------------------------------------------------ + # Pack results for return results <- list( se_batches = se_list, se_combined = se_combined, @@ -744,7 +742,7 @@ run_modality <- function( files = if (write_outputs) list(queued = upload_queue) else list() ) - # ---- FINAL STEP: Uploads ---------------------------------------------------- + # FINAL STEP: Uploads if (write_outputs && !is.null(upload_parent_id)) { perform_uploads(upload_queue, syn, upload_parent_id) } else if (!write_outputs) { diff --git a/03_analyze_modality_correlations.R b/03_analyze_modality_correlations.R index 10fa56e..0ec90c1 100644 --- a/03_analyze_modality_correlations.R +++ b/03_analyze_modality_correlations.R @@ -13,12 +13,9 @@ dir.create("figs", showWarnings = FALSE) # Helpers make_feature_matrix <- function(df_long, shared_ids, sample_col, feature_col, value_col) { - # Strict cleaning: trim to character, drop NA/blank IDs, then pivot df <- df_long %>% ungroup() %>% - # keep only shared sample IDs first (as before) dplyr::filter(.data[[sample_col]] %in% shared_ids) %>% - # coerce and trim IDs mutate( !!sample_col := trimws(as.character(.data[[sample_col]])), !!feature_col := trimws(as.character(.data[[feature_col]])) @@ -27,8 +24,6 @@ make_feature_matrix <- function(df_long, shared_ids, sample_col, feature_col, va # Drop rows with NA/blank sample/feature IDs bad_sample <- is.na(df[[sample_col]]) | df[[sample_col]] == "" bad_feature <- is.na(df[[feature_col]]) | df[[feature_col]] == "" - if (any(bad_sample)) message("[make_feature_matrix] Dropping ", sum(bad_sample), " rows with NA/blank ", sample_col) - if (any(bad_feature)) message("[make_feature_matrix] Dropping ", sum(bad_feature), " rows with NA/blank ", feature_col) df <- df[!(bad_sample | bad_feature), , drop = FALSE] if (!nrow(df)) { @@ -48,7 +43,6 @@ make_feature_matrix <- function(df_long, shared_ids, sample_col, feature_col, va ) %>% as.data.frame(check.names = FALSE) - # Guard against NA/blank rownames after pivot rn <- wide[[sample_col]] bad_rn <- is.na(rn) | rn == "" if (any(bad_rn)) { @@ -63,7 +57,7 @@ make_feature_matrix <- function(df_long, shared_ids, sample_col, feature_col, va return(out) } - # Finalize rownames (must be unique, non-empty) + # Finalize rownames rownames(wide) <- make.unique(as.character(rn), sep = "_dup") wide[[sample_col]] <- NULL wide @@ -200,22 +194,20 @@ summarize_correlated_features <- function(cor_tbl, fdr_thresh = 0.25, outdir = " # --------------------------- # Main wrapper # --------------------------- -# Returns: list(drug_mat, feat_mat, shared_ids, cor_tbl, cor_summary, cor_plot, drug_summary) analyze_modality <- function( fits, df_long, sample_col, # e.g., "Specimen" feature_col, # e.g., "feature_id" | "Gene" | "site" value_col, # e.g., "correctedAbundance" - metric = "uM_viability", + metric = "uM_viability", # Or fit_auc outdir = "figs", heatmap_filename = "drug_heatmap_large.pdf", fdr_thresh = 0.25 ) { - # Pre-intersection like original (all metrics) + shared_ids <- base::intersect(unique(fits$improve_sample_id), unique(df_long[[sample_col]])) - # Feature matrix over shared IDs feat_mat <- make_feature_matrix( df_long = df_long, shared_ids = shared_ids, @@ -224,7 +216,7 @@ analyze_modality <- function( value_col = value_col ) - # Drug matrix for the exact metric (no normalization/fallbacks) + # Drug matrix for the metric drug_mat <- make_drug_matrix( fits = fits, metric = metric, @@ -234,7 +226,7 @@ analyze_modality <- function( metric_col = "dose_response_metric" ) - # Summaries & heatmap (original style) + # Summaries & heatmap dsum <- summarize_drugs( fits, metric = metric, metric_col = "dose_response_metric", outdir = outdir ) @@ -257,7 +249,7 @@ analyze_modality <- function( } } - # Correlations (only if overlap) + # Correlations shared_after <- base::intersect(rownames(drug_mat), rownames(feat_mat)) cor_tbl <- if (length(shared_after) > 0L) { compute_cors(drug_mat, feat_mat, shared_samples = shared_after) From 020b9088c1acb86a1b4ebfc2b6623f45420dbe69 Mon Sep 17 00:00:00 2001 From: Jeremy Date: Fri, 5 Dec 2025 09:14:19 -0800 Subject: [PATCH 06/13] Added README to clarify run order. Moved all old code to leagcy code directory --- 02_run_normalize_omics.Rmd | 13 +- README.md | 13 + cNF_helper_code.R | 3 +- .../02_normalize_harmonize_proteomics.rmd | 527 +++++++ .../03_analyze_modality_correlations.R | 0 legacy_code/03_drug_biomarkers.Rmd | 1261 +++++++++++++++++ .../03_drug_biomarkers_legacy_code.Rmd | 0 7 files changed, 1809 insertions(+), 8 deletions(-) create mode 100644 legacy_code/02_normalize_harmonize_proteomics.rmd rename 03_analyze_modality_correlations.R => legacy_code/03_analyze_modality_correlations.R (100%) create mode 100644 legacy_code/03_drug_biomarkers.Rmd rename 03_drug_biomarkers_legacy_code.Rmd => legacy_code/03_drug_biomarkers_legacy_code.Rmd (100%) diff --git a/02_run_normalize_omics.Rmd b/02_run_normalize_omics.Rmd index bf77c93..1dbe95e 100644 --- a/02_run_normalize_omics.Rmd +++ b/02_run_normalize_omics.Rmd @@ -48,9 +48,9 @@ source("02_normalize_batchcorrect_omics.R") # - do_batch_correct : TRUE = ComBat by cohort; FALSE = skip (adds *_noBatchCorrect). # # Per-modality normalization -# - phospho : 0→NA → drop >50% missing → log2(x+0.01) → per-sample modified z. -# - global : log2(x) → per-sample modified z. -# - rna : drop >50% missing → log2(TPM+1) → per-sample modified z. +# - phospho : 0 toNA to drop >50% missing to log2(x+0.01) to per-sample modified z. +# - global : log2(x) to per-sample modified z. +# - rna : drop >50% missing to log2(TPM+1) to per-sample modified z. # # Returns (list) # - se_batches : per-batch normalized SEs. @@ -100,7 +100,7 @@ phospho <- run_modality( upload_parent_id = "syn70078365", pcols = pcols, write_outputs = FALSE, - save_basename = "Phospho_batch_corrected", + save_basename = "phospho_batch12_corrected", do_batch_correct = TRUE ) @@ -121,7 +121,7 @@ global <- run_modality( upload_parent_id = "syn70078365", pcols = pcols, write_outputs = FALSE, - save_basename = "Global_batch_corrected", + save_basename = "global_batch12_corrected", do_batch_correct = TRUE ) @@ -143,9 +143,8 @@ rna <- run_modality( upload_parent_id = "syn71099587", pcols = pcols, write_outputs = FALSE, - save_basename = "RNA_Matrix_no_batch_correct", + save_basename = "RNA_12_no_batch_correct", do_batch_correct = FALSE #Note this is set to false right now. Not needed for RNA ) - ``` diff --git a/README.md b/README.md index 7c6c957..69f7152 100644 --- a/README.md +++ b/README.md @@ -6,3 +6,16 @@ The data for this code is hosted on Synapse at http://synapse.org/cnfDrugRespons ## Current analysis We are collecting omics measurements from cNF organoid samples together with drug response data to identify potential biomarkers of drug response. + +## Quick run order (and what each file sources) +1) **01_harmonize_drug_data.Rmd** + - *Sources:* `cNF_helper_code.R` + - Download & quick QC of drug fits. + +2) **02_run_normalize_omics.Rmd** + - *Sources:* `cNF_helper_code.R`, `02_normalize_batchcorrect_omics.R` + - Runs per-modality normalization and ComBat; writes long tables/plots if enabled. + +3) **04_analyze_modality_and_pathway_enrich.Rmd** + - *Sources:* `cNF_helper_code.R`, `03_analyze_modality_correlations.R`, `04_leapr_biomarker.R` + - Builds drug/feature matrices, modality correlations, leapR pathway enrichment and plots for all. diff --git a/cNF_helper_code.R b/cNF_helper_code.R index 711858a..9ae26c5 100644 --- a/cNF_helper_code.R +++ b/cNF_helper_code.R @@ -6,7 +6,8 @@ library(synapser) synLogin() syn <- list(get = synapser::synGet, store = synapser::synStore) library(readxl) - +library(tidyr) +library(dplyr) meta1 <- readxl::read_xlsx(syn$get('syn65595365')$path) |> tidyr::separate(Specimen,into=c('Patient','Tumor'),sep='_',remove = FALSE)|> diff --git a/legacy_code/02_normalize_harmonize_proteomics.rmd b/legacy_code/02_normalize_harmonize_proteomics.rmd new file mode 100644 index 0000000..aa2f322 --- /dev/null +++ b/legacy_code/02_normalize_harmonize_proteomics.rmd @@ -0,0 +1,527 @@ +--- +title: "Reformat and process proteomics" +author: "Sara Gosline" +date: "2025-09-28" +output: html_document +--- + + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +library(ggplot2) +library(dplyr) +library(tidyr) +library(stringr) +library(synapser) +library(grid) +source('cNF_helper_code.R') +``` + +# Drop sample list. These were for protocol optimization +```{r} +remove_fname_substrings <- c( + "cNF_organoid_DIA_G_02_11Feb25", + "cNF_organoid_DIA_G_05_11Feb25", + "cNF_organoid_DIA_G_06_11Feb25", + "cNF_organoid_DIA_P_02_29Jan25", + "cNF_organoid_DIA_P_05_11Feb25", + "cNF_organoid_DIA_P_06_11Feb25" +) + +is_unwanted_fname <- function(x) { + vapply(x, function(s) + any(vapply(remove_fname_substrings, function(p) grepl(p, s, fixed = TRUE), logical(1))), + logical(1) + ) +} + + +``` + + + +## Normalize phospho-proteomics + +We now have phosphoproteomics from two cohorts. Here I'm trying to collect data from both and normalize but am clearly missing something. I do the following: + +1. replace all zero values with NA to avoid skewing normalization +2. remove any features that are absent from >50% of the samples +3. take the log of the data, then take a modified z score + +Each dataset is done individually then combined at the end. There is a clear batch effect. + +### Cohort 1 phospho + +We start with the cohort 1 phospho data here. + +```{r compare to proteomics,warning=FALSE} + +##cohort 1 phospho +##first we read in file, and get site info +phospho1<- read.table(syn$get('syn69963552')$path,sep='\t',fill=NA,header=T,quote='"') |> + subset(!is.na(`Gene.Names`)) |> + subset(Gene.Names!='') |> + mutate(lsite=tolower(Residue)) |> + tidyr::unite(c(Residue,Site,lsite),col='site',sep='') |> + tidyr::unite(c(`Gene.Names`,site),col='site',sep='-') |> + as.data.frame() + +phospho1[which(phospho1==0,arr.ind=TRUE)]<-NA + +pfnames1 <- data.frame(fname=colnames(phospho1)[5:ncol(phospho1)])|> + mutate(aliquot=sapply(fname,function(x) unlist(strsplit(x,split='_'))[8]))|> + mutate(aliquot=as.double(aliquot))|> + mutate(cohort=1) |> + dplyr::filter(!is_unwanted_fname(fname)) + + + +##logtransform##median transform +pzeros<-which(apply(phospho1[,5:ncol(phospho1)],1,function(x) + length(which(is.na(x)))/length(x) < 0.5)) + +pmat1<-apply(0.01+log2(phospho1[pzeros,5:ncol(phospho1)]),2, + function(x) {0.6745 *(x-median(x,na.rm=T))/mad(x,na.rm=T)}) |> + as.data.frame() |> + mutate(site=phospho1$site[pzeros]) + +##move to long form, upload +plong1<-pmat1|> + tidyr::pivot_longer(1:(ncol(pmat1)-1),names_to='fname',values_to='abundance')|> + left_join(pfnames1) |> + group_by(site,fname,aliquot,cohort) |> + summarize(meanAbundance=mean(abundance,na.rm=T))|> + subset(!is.na(meanAbundance))|> + left_join(meta) + +readr::write_csv(plong1,file='log2normMedCenteredPhospho.csv') +syn$store(File('log2normMedCenteredPhospho.csv',parentId='syn70078365')) + +``` +The file is uploaded to synapse. + +### Cohort 2 phospho + +Now on October 7 we can process the second batch of phospho. + +```{r cohort 2 phospho} + +##cohort 2 phospho +##1 read in data +phospho2 <- read.table(syn$get('syn69947351')$path,sep='\t',fill=NA,header=T,quote='"') |> + subset(!is.na(`Gene.Names`)) |> + subset(Gene.Names != '') |> + mutate(lsite = tolower(Residue)) |> + tidyr::unite(c(Residue,Site,lsite),col = 'site',sep='') |> + tidyr::unite(c(`Gene.Names`,site),col = 'site',sep='-') + + +phospho2[which(phospho2==0,arr.ind=TRUE)] <- NA + +pfnames2 <- data.frame(fname=colnames(phospho2)[5:ncol(phospho2)]) |> + mutate(aliquot=sapply(fname,function(x) unlist(strsplit(x,split='_'))[9])) |> + mutate(aliquot=as.double(aliquot))|> + mutate(cohort=2)|> + dplyr::filter(!is_unwanted_fname(fname)) + +##remove missingness +tm <- which(apply(phospho2[,5:ncol(phospho2)],1,function(x) length(which(is.na(x)))/length(x) < 0.5)) + +##log2 adjusted z score +pmat2<-apply(log2(0.01+phospho2[tm,5:ncol(phospho2)]),2, + function(x) {0.6745 *(x-median(x,na.rm=T))/mad(x,na.rm=T)}) |> + as.data.frame() |> + mutate(site=phospho2$site[tm]) + + + +plong2<-pmat2|> + tidyr::pivot_longer(1:(ncol(pmat2)-1),names_to='fname',values_to='abundance') |> + left_join(pfnames2)|> + group_by(site,fname,aliquot,cohort) |> + summarize(meanAbundance=mean(abundance,na.rm=T)) |> + subset(!is.na(meanAbundance))|> + left_join(meta) + +##save to file +readr::write_csv(plong2,file='log2normMedCenteredPhospho_cohort2.csv') +syn$store(File('log2normMedCenteredPhospho_cohort2.csv',parentId='syn70078365')) + + +``` + +Now that we have two cohorts we can try to combine without batch correction. + +### Combined phospho +Combining the phoshpo data here. + +```{r combined phospho} + +##now we move back to long form +# plong <- rbind(plong1,plong2) + +plong1 <- plong1 |> dplyr::filter(!is_unwanted_fname(fname)) +plong2 <- plong2 |> dplyr::filter(!is_unwanted_fname(fname)) +plong <- rbind(plong1, plong2) |> dplyr::filter(!is_unwanted_fname(fname)) + + #pmat |> +# as.data.frame()|> +# tibble::rownames_to_column('site')|> +# pivot_longer(-site,names_to='fname',values_to='abundance')|> +# left_join(rbind(pfnames1,pfnames2))|> +# group_by(site,fname,aliquot,cohort) |> +# summarize(meanAbundance=mean(abundance,na.rm=T)) |> +# left_join(meta) + + +compsites <- plong|> +# subset(meanAbundance>(-5))|> + group_by(site)|> + summarize(spec = n_distinct(Specimen))|> + subset(spec==31) + +#plong$meanAbundance[which(!is.finite(plong$meanAbundance))]<-0 + +ppcs<-plong|>ungroup()|> + dplyr::select(Specimen,meanAbundance,site)|> + unique()|> + subset(site%in%compsites$site)|> + #subset(!is.na(site))|> + #subset(!is.na(meanAbundance))|> + tidyr::pivot_wider(names_from='Specimen',values_from='meanAbundance', + values_fn=mean,values_fill=0)|> + tibble::column_to_rownames('site')|> + t()|> + prcomp() + +pplot<-ppcs$x|> + as.data.frame()|> + dplyr::select(PC1,PC2,PC3)|> + tibble::rownames_to_column('Specimen')|> + left_join(meta)|> + dplyr::select(PC2,PC1,Specimen,Patient,cohort)|> + mutate(cohort=as.factor(cohort))|> + distinct()|> + ggplot(aes(x=PC1,y=PC2,label=Specimen,col=Patient,shape=cohort))+ + geom_point()+ + #ggrepel::geom_label_repel()+ + ggtitle("Phospho samples")+ + ggplot2::scale_color_manual(values=pcols) + +ph<- plong |>ungroup()|> + subset(site%in%compsites$site)|> + ggplot(aes(x=meanAbundance,fill=as.factor(cohort)))+geom_histogram() + +cowplot::plot_grid(ph,pplot) +ggsave('cNFPhosphoQC.png',width=10) + +pplot +ggsave('phosphoPCA.pdf') +``` + +Clearly there is a strong batch effect. +## Normalize golbal proteomics +Now we can move onto the global data + +### Cohort 1 global +Global proteomics in cohort 1 here. + +```{r global} +####now process global +#global1<-readr::read_tsv(syn$get('syn64906445')$path) +global1 <- read.table(syn$get('syn69947355')$path,sep='\t',header=T,quote='"') |> + tidyr::separate_rows(Genes,sep=';') +##logtransform, medina transform + +#global1[which(global1==0,arr.ind=TRUE)]<-NA + +gmat1<-apply(log2(global1[,5:ncol(global1)]),2,function(x) 0.6745 *(x-median(x,na.rm=T))/mad(x,na.rm=T)) + +gmat1<-gmat1|> + as.data.frame()|> + mutate(Genes=global1$Genes) + +##extract aliquot info from file name +gfnames1 <- data.frame(fname=colnames(global1)[5:ncol(global1)]) |> + mutate(aliquot=sapply(fname,function(x) unlist(strsplit(x,split='_'))[6])) |> + mutate(aliquot=as.double(aliquot))|> + mutate(cohort=1)|> + dplyr::filter(!is_unwanted_fname(fname)) + +glong1<-gmat1|> + tidyr::pivot_longer(1:(ncol(gmat1)-1),names_to='fname',values_to='abundance')|> + left_join(gfnames1)|> + group_by(Genes,fname,aliquot,cohort)|> + summarize(meanAbundance=mean(abundance))|> + subset(is.finite(meanAbundance))|> + left_join(meta) + + +readr::write_csv(glong1,file='log2normMedCenteredGlobal.csv') +syn$store(File('log2normMedCenteredGlobal.csv',parentId='syn70078365')) +``` + +### Cohort 2 global +October 7 we process the second cohort. +```{r batch 2 global} +global2<-read.table(syn$get('syn69947352')$path,header=T,sep='\t',quote='"')|> + tidyr::separate_rows(Genes,sep=';') + +#global2[which(global2==0,arr.ind=TRUE)]<-NA + +gmat2<-apply(log2(global2[,5:ncol(global2)]),2,function(x) + 0.6745 *(x-median(x,na.rm=T))/mad(x,na.rm=T)) +rownames(gmat2)<-global2$Genes + +gmat2<-gmat2|> + as.data.frame()|> + mutate(Genes=global2$Genes) + +gfnames2 <- data.frame(fname=colnames(global2)[5:ncol(global2)]) |> + mutate(aliquot=sapply(fname,function(x) unlist(strsplit(x,split='_'))[7])) |> + mutate(aliquot=as.double(aliquot))|> + mutate(cohort=2)|> + dplyr::filter(!is_unwanted_fname(fname)) +gfnames1 + +glong2<-gmat2|> + tidyr::pivot_longer(1:(ncol(gmat2)-1),names_to='fname',values_to='abundance')|> + left_join(gfnames2)|> + group_by(Genes,fname,aliquot,cohort)|> + summarize(meanAbundance=mean(abundance))|> + subset(is.finite(meanAbundance))|> + left_join(meta) + +#dupes<-global|>group_by(Genes)|>summarize(numIso=n())|> +# subset(numIso>1) + + +readr::write_csv(glong2,file='log2normMedCenteredGlobal_cohort2.csv') +syn$store(File('log2normMedCenteredGlobal_cohort2.csv',parentId='syn70078365')) +``` + +### Global combined without batch correction +Now we can combine the global withot batch correction. + +```{r combined global test} +#ma<-mean(glong$abundance,na.rm=T) +#glong$meanAbundance[which(!is.finite(glong$meanAbundance))]<-0 +glong1 <- glong1 |> dplyr::filter(!is_unwanted_fname(fname)) +glong2 <- glong2 |> dplyr::filter(!is_unwanted_fname(fname)) +glong <- rbind(glong1, glong2) |> dplyr::filter(!is_unwanted_fname(fname)) |> + subset(Genes!="") + +n_spec <- meta |> dplyr::distinct(Specimen) |> nrow() +compsites <- glong|> +# subset(meanAbundance>(-5))|> + group_by(Genes)|> + summarize(spec = n_distinct(Specimen))|> + subset(spec==n_spec) + +gpcs<-glong|>ungroup()|> + dplyr::select(Specimen,meanAbundance,Genes)|> + subset(!is.na(Genes))|> + subset(Genes!="")|> + subset(Genes%in%compsites$Genes)|> + subset(!is.na(meanAbundance))|> + tidyr::pivot_wider(names_from='Specimen',values_from='meanAbundance',values_fn=mean,values_fill=0)|> + tibble::column_to_rownames('Genes')|>t()|> + prcomp() + +gplot<-gpcs$x|> + as.data.frame()|> + dplyr::select(PC1,PC2)|> + tibble::rownames_to_column('Specimen')|> + left_join(meta)|> + dplyr::select(PC1,PC2,Specimen,Patient,cohort)|> + mutate(cohort=as.factor(cohort))|> + distinct()|> + ggplot(aes(x=PC1,y=PC2,label=Specimen,col=Patient,shape=cohort))+ + geom_point()+ggrepel::geom_label_repel()+ggtitle("Global samples")+ + scale_color_manual(values=pcols) + + +hplot <- ggplot(glong,aes(x=meanAbundance,fill=as.factor(cohort)))+geom_histogram() + + +cowplot::plot_grid(hplot,gplot) +ggsave('cNFGlobalQC.png',width=10) + + +gplot + +ggsave('globalPCA.pdf') + +``` + + +## Evaluate batch correction + +Now we have two separate long tables with metadata, but we would like to combine into a single one and batch correct.We can update this with each cohort. + +```{r combine and correct} + +##phospho +##TODO: ideally we should use the long tables and reconvert +pmat <- merge(as.data.frame(pmat1),as.data.frame(pmat2)) + +gmat <- merge(gmat1,gmat2) + +##remove duplicated sites +dsites<- unique(pmat$site[which(duplicated(pmat$site))]) +mvals<-sapply(dsites,function(x) colSums(pmat[pmat$site==x,2:ncol(pmat)])) |> + t() |> + as.data.frame() |> + tibble::rownames_to_column('site') + +pmat <- pmat |> + subset(!site %in% dsites) |> + rbind(mvals) + +##now convert to matrix +pmat <- pmat |> + tibble::remove_rownames() |> + tibble::column_to_rownames('site') |> + as.matrix() + +gmat <- gmat |> + subset(Genes!='')|> + tibble::remove_rownames() |> + tibble::column_to_rownames('Genes') |> + as.matrix() +##sigh, batch correct? +library(sva) +# +# pmat[which(!is.finite(pmat),arr.ind=T)] <- 0.0 +# cbmat<-sva::ComBat(pmat,batch=meta$cohort,mean.only = FALSE) +# +# gmat[which(!is.finite(gmat),arr.ind=T)] <- 0.0 +# cgmat <- sva::ComBat(gmat,batch=meta$cohort,mean.only = FALSE) + + +pmat[which(!is.finite(pmat),arr.ind=T)] <- 0.0 +## align ComBat batch vector to pmat columns via filename→cohort maps +sample_meta_p <- data.frame(fname = colnames(pmat)) |> + dplyr::left_join(rbind(pfnames1, pfnames2), by = "fname") |> + dplyr::select(fname, cohort) +keep_p <- !is.na(sample_meta_p$cohort) +pmat <- pmat[, keep_p, drop = FALSE] +pbatch <- sample_meta_p$cohort[keep_p] +cbmat <- sva::ComBat(dat = pmat, batch = pbatch, mean.only = FALSE) + +gmat[which(!is.finite(gmat),arr.ind=T)] <- 0.0 +## align ComBat batch vector to gmat columns via filename→cohort maps +sample_meta_g <- data.frame(fname = colnames(gmat)) |> + dplyr::left_join(rbind(gfnames1, gfnames2), by = "fname") |> + dplyr::select(fname, cohort) +keep_g <- !is.na(sample_meta_g$cohort) +gmat <- gmat[, keep_g, drop = FALSE] +gbatch <- sample_meta_g$cohort[keep_g] +cgmat <- sva::ComBat(dat = gmat, batch = gbatch, mean.only = FALSE) + + +ppcs<-prcomp(t(cbmat)) +gpcs<-prcomp(t(cgmat)) + +``` + +# plot batch corrected data +```{r} + +# Reusable theme: white background + light grey gridlines +my_theme <- theme_bw() + + theme( + panel.grid.major = element_line(color = "grey85"), + panel.grid.minor = element_line(color = "grey92"), + panel.background = element_rect(fill = "white", color = NA), + plot.background = element_rect(fill = "white", color = NA), + legend.title = element_text(size = 9), + legend.text = element_text(size = 7), + legend.key.size = unit(0.5, "cm"), + legend.spacing.y = unit(0.2, "cm"), + legend.box.spacing= unit(0.3, "cm") + ) + + +# --- Corrected phospho plot (white bg + grey grid) --- +pplot <- ppcs$x |> + as.data.frame() |> + dplyr::select(PC1, PC2) |> + tibble::rownames_to_column("fname") |> + left_join(rbind(pfnames1, pfnames2)) |> + left_join(meta) |> + dplyr::select(PC1, PC2, Specimen, Patient, cohort) |> + mutate( + Tumor = str_extract(Specimen, "_T\\d+") |> str_remove("_"), + Tumor = factor(Tumor), + cohort = as.factor(cohort) + ) |> + distinct() |> + dplyr::filter(!is.na(Patient), !is.na(Tumor)) |> + ggplot(aes(x = PC1, y = PC2, col = Patient, shape = Tumor)) + + geom_point(size = 3) + + ggtitle("Corrected phospho samples") + + scale_color_manual(values = pcols) + + scale_shape_discrete(na.translate = FALSE) + + my_theme + +pplot + +# --- Corrected global plot (white bg + grey grid) --- +gplot <- gpcs$x |> + as.data.frame() |> + dplyr::select(PC1, PC2) |> + tibble::rownames_to_column("fname") |> + left_join(rbind(gfnames1, gfnames2)) |> + left_join(meta) |> + dplyr::select(PC1, PC2, Specimen, Patient, cohort) |> + mutate( + Tumor = str_extract(Specimen, "_T\\d+") |> str_remove("_"), + Tumor = factor(Tumor), + cohort = as.factor(cohort) + ) |> + distinct() |> + ggplot(aes(x = PC1, y = PC2, col = Patient, shape = Tumor)) + + geom_point(size = 3) + + ggtitle("Corrected global samples") + + scale_color_manual(values = pcols) + + my_theme + +gplot + + +ggsave("phosphoCorrectedPCA.pdf", plot = pplot, width = 7, height = 4.5, units = "in", device = cairo_pdf) +ggsave("globalCorrectedPCA.pdf", plot = gplot, width = 7, height = 4.5, units = "in", device = cairo_pdf) + +``` + +## Upload batch-corrected data to synapse + +Now we can reformat the batch-corrected data and upload to syanps + +```{r upload batch corrected} + +pc_long <- cbmat |> + as.data.frame() |> + tibble::rownames_to_column('site') |> + pivot_longer(-site,names_to = 'fname',values_to = 'correctedAbundance') |> + left_join(rbind(pfnames1,pfnames2)) |> + left_join(meta) |> + distinct() + +gc_long <- cgmat |> + as.data.frame() |> + tibble::rownames_to_column('Gene') |> + pivot_longer(-Gene,names_to = 'fname',values_to = 'correctedAbundance') |> + left_join(rbind(gfnames1,gfnames2)) |> + left_join(meta) |> + distinct() + + +readr::write_csv(pc_long,file = 'batch12_correctedPhospho.csv') +readr::write_csv(gc_long,file = 'batch12_correctedGlobal.csv') + +syn$store(File('batch12_correctedPhospho.csv',parentId = 'syn70078365')) +syn$store(File('batch12_correctedGlobal.csv',parentId = 'syn70078365')) + +``` diff --git a/03_analyze_modality_correlations.R b/legacy_code/03_analyze_modality_correlations.R similarity index 100% rename from 03_analyze_modality_correlations.R rename to legacy_code/03_analyze_modality_correlations.R diff --git a/legacy_code/03_drug_biomarkers.Rmd b/legacy_code/03_drug_biomarkers.Rmd new file mode 100644 index 0000000..2fa4b21 --- /dev/null +++ b/legacy_code/03_drug_biomarkers.Rmd @@ -0,0 +1,1261 @@ +--- +title: "Evaluate drug and omics data for biomarker assessment" +author: "Sara gosline" +date: "2025-03-13" +output: html_document +--- + +This document is designed to be a working document where we can compare approaches to evaluate biomarkers of drug response across patient samples. We are collecting three types of data modalities: 1. RNA Sequencing 2. Global Proteomics 3. Phospho proteomics + +We also have drug sensitivity data (single dose viability, some curves) for many drugs. The question to ask is which molecules can predict drug response across patients? How robust/extendable is this? + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +library(synapser) +library(ggplot2) +library(dplyr) +library(tidyr) + +``` + +# Pull processed files from previous markdowns + +We have already run the previous scripts and stored data on Synapse + +```{r pull files} +Sys.setenv(SYNAPSE_AUTH_TOKEN="eyJ0eXAiOiJKV1QiLCJraWQiOiJXN05OOldMSlQ6SjVSSzpMN1RMOlQ3TDc6M1ZYNjpKRU9VOjY0NFI6VTNJWDo1S1oyOjdaQ0s6RlBUSCIsImFsZyI6IlJTMjU2In0.eyJhY2Nlc3MiOnsic2NvcGUiOlsidmlldyIsImRvd25sb2FkIiwibW9kaWZ5Il0sIm9pZGNfY2xhaW1zIjp7fX0sInRva2VuX3R5cGUiOiJQRVJTT05BTF9BQ0NFU1NfVE9LRU4iLCJpc3MiOiJodHRwczovL3JlcG8tcHJvZC5wcm9kLnNhZ2ViYXNlLm9yZy9hdXRoL3YxIiwiYXVkIjoiMCIsIm5iZiI6MTc1ODMyNTgwMiwiaWF0IjoxNzU4MzI1ODAyLCJqdGkiOiIyNjE1MSIsInN1YiI6IjM0NTM5NTUifQ.VcGcVRv0P50Cb6mm7B7hGxzcWdxG4TvMhq8lRZDNEgktWNxdhMA0zacJ1jeOEilfEI-9RRpA7jE2WIM3zjIgYTL-l-UobBMKnvL_gu6itQuf2DyKR6K9OBQJER4cy7N0o6_4qwq5YPflpF6uWuvgAfskuPmQH8Yz9Z80UjeLxFw2yKUJcvtanghAWwFerOEJxxb-PDxHHC6gM_VK-HGGprPQy9_Z33dCZYcmrDbCgV5rWUV5AdTyCHhDBrx4YMw43J2U7os88SPpEEmbvxVpfSFTusOsP1FzYn7ifnpw2t6Ip5ZwLmwShBViShMlTe9X0tgjO5htY5UdXyJBMVK0Qw") + +source("cNF_helper_code.R") +traceback() + +source('cNF_helper_code.R') +##read in drug code +fits <- readr::read_tsv(synGet('syn69947322')$path) + + +##read in proteomic data +glong <- readr::read_csv(synGet('syn70078416')$path) +plong <- readr::read_csv(synGet('syn70078415')$path) + + +##read in transcrniptomic data +#TODO: process transcritpomic data into long format + + +``` + +## Format protein data to collect correlation values + +Do simple correlations to identify putative trends in the data. + +Get most efficacious, variable, and heatmap + +```{r} +# ensure an output folder + +outdir <- "figs" +if (!dir.exists(outdir)) dir.create(outdir, recursive = TRUE) + +shared <- intersect(fits$improve_sample_id, glong$Specimen) +message(sprintf("Found %d shared samples from %d drug experiments and %d proteomic experiments", +length(shared), length(unique(fits$improve_sample_id)), length(unique(glong$Specimen)))) + +glob_dat <- glong |> +ungroup() |> +subset(Specimen %in% shared) |> +dplyr::select(Specimen, Gene, correctedAbundance) |> +tidyr::pivot_wider( +names_from = "Gene", +values_from = "correctedAbundance", +values_fill = 0, +values_fn = mean +) |> +tibble::column_to_rownames("Specimen") + +phos_dat <- plong |> +ungroup() |> +subset(Specimen %in% shared) |> +dplyr::select(Specimen, site, correctedAbundance) |> +tidyr::pivot_wider( +names_from = "site", +values_from = "correctedAbundance", +values_fill = 0, +values_fn = mean +) |> +tibble::column_to_rownames("Specimen") + +## drug data to matrix here + +drug_dat <- fits |> +subset(dose_response_metric == "uM_viability") |> +dplyr::select(improve_sample_id, improve_drug_id, dose_response_value) |> +tidyr::pivot_wider( +names_from = "improve_drug_id", +values_from = "dose_response_value", +values_fn = mean +) |> +tibble::column_to_rownames("improve_sample_id") + +## summarize drugs + +drug_counts <- fits |> +subset(dose_response_metric == "uM_viability") |> +group_by(improve_drug_id) |> +distinct() |> +summarize( +meanResponse = mean(dose_response_value, na.rm = TRUE), +nMeasured = n_distinct(improve_sample_id), +variability = sd(dose_response_value, na.rm = TRUE), +.groups = "drop" +) + +# -------- Plot 1: most efficacious -------- + +p1 <- drug_counts |> +arrange(desc(meanResponse)) |> +subset(meanResponse < 0.5) |> +ggplot(aes(y = meanResponse, x = improve_drug_id, colour = nMeasured, size = variability)) + +geom_point() + +theme_minimal() + +theme( +axis.text.x = element_text(angle = 45, hjust = 1) # tilt x labels +) + +labs(title = "Most efficacious drugs", +y = "Mean cell viability (fraction)", +x = "Drug") + +ggsave(file.path("figs/most_efficacious.pdf"), p1, width = 12, height = 8, dpi = 300) + +# -------- Plot 2: most variable -------- + +p2 <- drug_counts |> +arrange(desc(variability)) |> +subset(variability > 0.15) |> +as.data.frame() |> +ggplot(aes(y = meanResponse, x = improve_drug_id, colour = nMeasured, size = variability)) + +geom_point() + +theme_minimal() + +theme( +axis.text.x = element_text(angle = 45, hjust = 1) # tilt x labels +) + +labs(title = "Most variable drugs", +y = "Mean cell viability (fraction)", +x = "Drug") + +ggsave(file.path("figs/most_variable.pdf"), p2, width = 12, height = 8, dpi = 300) + +# -------- Plot 3: heatmap of complete-measurement drugs -------- + +fulldrugs <- drug_counts |> +subset(nMeasured == nrow(drug_dat)) + +# Save large, rotate column labels, and shrink font to avoid overlap + +# pheatmap can write directly to a file via the `filename` arg. +# 1. Sort samples alphanumerically +drug_dat_alpha <- drug_dat[order(rownames(drug_dat)), , drop = FALSE] + +# 2. Get patient prefix (e.g. "NF0017" from "NF0017_T1") +samples <- rownames(drug_dat_alpha) +prefixes <- sub("_.*$", "", samples) + +# 3. Indices where the prefix changes (group boundaries) +gap_idx <- which(prefixes[-1] != prefixes[-length(prefixes)]) + +# 4. Heatmap with thin black lines + group separators, saved to file +pheatmap::pheatmap( + as.matrix(drug_dat_alpha[, fulldrugs$improve_drug_id, drop = FALSE]), + filename = file.path("figs/drug_heatmap_grouped.pdf"), + width = 28, + height = 16, + angle_col = 45, + fontsize_col = 6, + cluster_rows = FALSE, # keep alphabetical order + cluster_cols = TRUE, + show_rownames = TRUE, + show_colnames = TRUE, + gaps_row = gap_idx, # group separators + border_color = "black" # thin black grid lines +) + + + +# ---- PRINT plots in the document as well ---- + +print(p1) +print(p2) + +pheatmap::pheatmap( +as.matrix(drug_dat_alpha[, fulldrugs$improve_drug_id, drop = FALSE]), +angle_col = 45, +fontsize_col = 6, +cluster_rows = FALSE, +cluster_cols = TRUE, +show_rownames = TRUE, +show_colnames = TRUE +) + +``` + + +# Basic correlation tests + +Can we simply find and rank proteins/psites/transcripts by correlation and do enrichment? + +We can define a simple test to correlate features and drugs and assess significance and correct: + +```{r correlation tests, warning=FALSE, error=FALSE, message = FALSE} + +#this function computes correlations between all columns for each drug/feature matrix, rows are the sample identifiers +#also coputes significance +computeCors <- function(drug_dat,feat_dat,shared){ + + cres <- cor(drug_dat[shared,],feat_dat[shared,],use='pairwise.complete.obs',method='spearman') |> + as.data.frame() |> + tibble::rownames_to_column('drug')|> + tidyr::pivot_longer(cols=2:(1+ncol(feat_dat)),names_to='gene',values_to='cor') |> + arrange(desc(cor)) + + ##now lets try to get significance + csig <- do.call(rbind,lapply(colnames(drug_dat),function(x){ + do.call(rbind,lapply(colnames(feat_dat),function(y){ + pval <- 1.0 + try(pval <- cor.test(drug_dat[shared,x], + feat_dat[shared,y], + use = 'pairwise.complete.obs', + method = 'spearman')$p.value,silent = TRUE) + + return(c(corp = pval,drug = x,gene = y)) + })) |> + as.data.frame() |> + mutate(fdr=p.adjust(unlist(corp),method='fdr')) + })) |> + as.data.frame() |> + mutate(drug = unlist(drug)) |> + mutate(gene = unlist(gene)) + + fullcors <- cres|>left_join(data.frame(csig)) |> + mutate(direction=ifelse(cor<0,'neg','pos')) + + return(fullcors) +} + +``` + +Now that we have a function we can compute correlations of each data type. + +```{r compute feature cors, warning=FALSE, error=FALSE, message = FALSE} + +gcor <- computeCors(drug_dat[,fulldrugs$improve_drug_id],glob_dat,shared) |> + mutate(data='proteins') +pcor <- computeCors(drug_dat[,fulldrugs$improve_drug_id],phos_dat,shared) |> + mutate(data = 'phosphosites') + +allcor <- rbind(gcor,pcor) + +corsummary<-allcor |> subset(fdr<0.25) |> + group_by(drug,data,direction) |> + summarize(features=n(),meanCor=mean(cor)) + + +p_features <- corsummary |> + subset(features > 1) |> + ggplot(aes(x = drug,y = features,fill = direction)) + + facet_grid(data~.) + + geom_bar(position='dodge',stat='identity') + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) + +ggsave(filename = file.path("figs/cor_features_by_drug.pdf"), + plot = p_features, width = 12, height = 6, units = "in") + +corsummary |> + arrange(desc(features)) |> + subset(features > 100) |> + dplyr::select(drug,data,direction,features,meanCor) + + + +``` + +Now we have the correlation values. what do we do with them? +## Correlation based enrichment +Do not run this everytime - it is extremely slow, so its setup to run once and save the data. The next steps load this data. +```{r functional enrichment} +# === Direction-aware leapR enrichment: run "top" (up) and "bottom" (down) separately === +# Requires: glob_dat (samples x proteins), phos_dat (samples x phosphosites), fits (drug responses) +# Outputs: +# prot_enrich[[drug]]$top / $bottom +# phos_enrich[[drug]]$top / $bottom +# Optional CSVs in folder "leapR_top_paths/dir_split/" + +library(dplyr) +library(tidyr) +library(stringr) +library(SummarizedExperiment) +library(leapR) + +# ---- choose drugs to run (use your two, or set to a larger list) ---- +# target_drugs <- c("THZ1", "Onalespib") +target_drugs <- unique(fits$improve_drug_id) + +# ---- genesets ---- +data(msigdb); geneset_db <- msigdb # or ncipid +data(kinasesubstrates) + +# ---- helpers ---- +extract_gene_from_site <- function(site_id) { + if (is.na(site_id) || site_id == "") return(NA_character_) + g <- str_split(as.character(site_id), "[:_\\-\\.]")[[1]][1] + toupper(stringr::str_extract(g, "^[A-Za-z0-9]+")) +} + +# Correlate response vector vs each feature column (Spearman) +col_spearman <- function(vec, mat) { + shared <- intersect(names(vec), rownames(mat)) + if (length(shared) < 3) return(setNames(rep(NA_real_, ncol(mat)), colnames(mat))) + v <- vec[shared] + m <- as.matrix(mat[shared, , drop = FALSE]) + apply(m, 2, function(col) { + if (all(is.na(col))) return(NA_real_) + if (sd(col, na.rm = TRUE) == 0 || sd(v, na.rm = TRUE) == 0) return(NA_real_) + suppressWarnings(cor(v, col, method = "spearman", use = "pairwise.complete.obs")) + }) +} + +# Build one-column SE; column name must match primary_columns +build_se_from_corvec <- function(cor_named_vec, features_all, col_label, map_to_gene = NULL, assay_label = "proteomics") { + v <- rep(NA_real_, length(features_all)); names(v) <- features_all + common <- intersect(names(cor_named_vec), features_all) + v[common] <- cor_named_vec[common] + mat <- matrix(v, nrow = length(v), ncol = 1, dimnames = list(features_all, col_label)) + rd <- DataFrame(feature_id = features_all) + rd$hgnc_id <- if (is.null(map_to_gene)) features_all else map_to_gene[features_all] + se <- SummarizedExperiment(assays = list(values = mat), rowData = rd, colData = DataFrame(sample = col_label)) + assayNames(se) <- assay_label + se +} + +safe_leapr <- function(...) tryCatch(leapR::leapR(...), error = function(e) { message("[leapR] ", conditionMessage(e)); NULL }) + +# ---- feature and mapping vectors ---- +prot_features <- colnames(glob_dat) +phos_features <- colnames(phos_dat) +phos_to_gene <- setNames(vapply(phos_features, extract_gene_from_site, FUN.VALUE = character(1)), + phos_features) + +# ---- results containers ---- +prot_enrich <- list() +phos_enrich <- list() + +# optional: write CSVs? +write_csvs <- TRUE +outdir <- "leapR_top_paths/dir_split" +if (write_csvs && !dir.exists(outdir)) dir.create(outdir, recursive = TRUE) + +# ---- per-drug workflow ---- +for (drug in target_drugs) { + message("=== ", drug, " ===") + # build response vector (mean per sample if repeats) + dv <- fits %>% + filter(improve_drug_id == !!drug, dose_response_metric == "uM_viability") %>% + group_by(improve_sample_id) %>% summarize(resp = mean(dose_response_value, na.rm = TRUE), .groups = "drop") + if (nrow(dv) == 0) { message(" no response rows; skipping"); next } + dv_vec <- setNames(dv$resp, dv$improve_sample_id) + + # correlations + prot_cor <- col_spearman(dv_vec, glob_dat) + phos_cor <- col_spearman(dv_vec, phos_dat) + + # split by sign + prot_pos <- prot_cor[!is.na(prot_cor) & prot_cor > 0] + prot_neg <- prot_cor[!is.na(prot_cor) & prot_cor < 0] + phos_pos <- phos_cor[!is.na(phos_cor) & phos_cor > 0] + phos_neg <- phos_cor[!is.na(phos_cor) & phos_cor < 0] + + # ---- global: TOP (positives as-is), BOTTOM (negatives flipped so they rank to the top) ---- + prot_enrich[[drug]] <- list(top = NULL, bottom = NULL) + + # TOP + if (length(prot_pos) >= 5) { + se_prot_top <- build_se_from_corvec(prot_pos, prot_features, col_label = paste0(drug, "_TOP"), assay_label = "proteomics") + prot_top <- safe_leapr(geneset = geneset_db, enrichment_method = "enrichment_in_order", + eset = se_prot_top, assay_name = "proteomics", + primary_columns = paste0(drug, "_TOP")) + prot_enrich[[drug]]$top <- prot_top + if (write_csvs && !is.null(prot_top)) { + write.csv(as.data.frame(prot_top), file = file.path(outdir, paste0(drug, "_global_TOP.csv"))) + } + } else message(" PROT top: too few positive features (", length(prot_pos), ")") + + # BOTTOM (flip sign so more negative = larger positive rank) + if (length(prot_neg) >= 5) { + se_prot_bot <- build_se_from_corvec(-prot_neg, prot_features, col_label = paste0(drug, "_BOTTOM"), assay_label = "proteomics") + prot_bot <- safe_leapr(geneset = geneset_db, enrichment_method = "enrichment_in_order", + eset = se_prot_bot, assay_name = "proteomics", + primary_columns = paste0(drug, "_BOTTOM")) + prot_enrich[[drug]]$bottom <- prot_bot + if (write_csvs && !is.null(prot_bot)) { + write.csv(as.data.frame(prot_bot), file = file.path(outdir, paste0(drug, "_global_BOTTOM.csv"))) + } + } else message(" PROT bottom: too few negative features (", length(prot_neg), ")") + + # ---- PHOSPHO: TOP/BOTTOM for pathways (gene mapping via hgnc_id) ---- + phos_enrich[[drug]] <- list(top = NULL, bottom = NULL) + + # TOP + if (length(phos_pos) >= 5) { + se_phos_top <- build_se_from_corvec(phos_pos, phos_features, col_label = paste0(drug, "_TOP"), + map_to_gene = phos_to_gene, assay_label = "phosphoproteomics") + phos_top <- safe_leapr(geneset = geneset_db, enrichment_method = "enrichment_in_order", + eset = se_phos_top, assay_name = "phosphoproteomics", + primary_columns = paste0(drug, "_TOP"), id_column = "hgnc_id") + phos_enrich[[drug]]$top <- phos_top + if (write_csvs && !is.null(phos_top)) { + write.csv(as.data.frame(phos_top), file = file.path(outdir, paste0(drug, "_phospho_TOP.csv"))) + } + } else message(" PHOS top: too few positive features (", length(phos_pos), ")") + + # BOTTOM (flip) + if (length(phos_neg) >= 5) { + se_phos_bot <- build_se_from_corvec(-phos_neg, phos_features, col_label = paste0(drug, "_BOTTOM"), + map_to_gene = phos_to_gene, assay_label = "phosphoproteomics") + phos_bot <- safe_leapr(geneset = geneset_db, enrichment_method = "enrichment_in_order", + eset = se_phos_bot, assay_name = "phosphoproteomics", + primary_columns = paste0(drug, "_BOTTOM"), id_column = "hgnc_id") + phos_enrich[[drug]]$bottom <- phos_bot + if (write_csvs && !is.null(phos_bot)) { + write.csv(as.data.frame(phos_bot), file = file.path(outdir, paste0(drug, "_phospho_BOTTOM.csv"))) + } + } else message(" PHOS bottom: too few negative features (", length(phos_neg), ")") + +} + +# Save all direction-split results for later reuse +save(prot_enrich, phos_enrich, file = "leapR_enrichment_direction_split.Rdata") + +message("Finished direction-aware enrichment. Results in lists prot_enrich / phos_enrich, and CSVs (if enabled).") + + +``` + + + +For each drug, how many terms do we see active? how many kinases? +```{r functional enrichment} +# ==== Load saved enrichment & build summaries (no list-casts, no count()) ==== +library(dplyr) +library(tidyr) +library(purrr) +library(tibble) +library(ggplot2) +library(forcats) +library(stringr) +library(scales) + +# Always load the precomputed enrichment lists here +load("leapR_enrichment_direction_split.Rdata") +if (!exists("prot_enrich")) stop("prot_enrich not found in leapR_enrichment_direction_split.Rdata") +if (!exists("phos_enrich")) stop("phos_enrich not found in leapR_enrichment_direction_split.Rdata") + +alpha <- 0.05 +topN <- 15 # <<< top 15 +dirs <- c("resistant","sensitive") + +# ---------- helpers ---------- +pick_pcol <- function(df) { + cols <- colnames(df) + if ("BH_pvalue" %in% cols) return(list(kind="adj", col="BH_pvalue")) + if ("SignedBH_pvalue" %in% cols) return(list(kind="adj", col="SignedBH_pvalue")) + if ("adj.P.Val" %in% cols) return(list(kind="adj", col="adj.P.Val")) + if ("padj" %in% cols) return(list(kind="adj", col="padj")) + if ("pvalue" %in% cols) return(list(kind="raw", col="pvalue")) + if ("P.Value" %in% cols) return(list(kind="raw", col="P.Value")) + NULL +} + +extract_term_col <- function(df) { + cands <- c("term","Term","pathway","Pathway","set","Set","geneset","gene_set","Category") + hit <- cands[cands %in% names(df)] + if (length(hit)) hit[[1]] else NULL +} + +tidy_one_result <- function(x) { + if (is.null(x)) return(tibble(pathway = character(), adj_p = numeric())) + df <- as.data.frame(x) + if (!nrow(df)) return(tibble(pathway = character(), adj_p = numeric())) + + term_col <- extract_term_col(df) + if (is.null(term_col)) { + df <- tibble::rownames_to_column(df, "pathway") + } else { + df <- dplyr::mutate(df, pathway = .data[[term_col]]) + } + df$pathway <- as.character(df$pathway) + + pk <- pick_pcol(df) + if (is.null(pk)) return(tibble(pathway = character(), adj_p = numeric())) + adj <- if (pk$kind == "adj") df[[pk$col]] else p.adjust(df[[pk$col]], method = "BH") + + tibble(pathway = df$pathway, adj_p = as.numeric(adj)) |> + filter(is.finite(adj_p), !is.na(adj_p)) +} + +flatten_by_direction <- function(lst, omic_label) { + if (!length(lst)) return(tibble()) + purrr::imap_dfr(lst, function(two, drug) { + bind_rows( + tidy_one_result(two$top) |> mutate(direction = "resistant"), + tidy_one_result(two$bottom) |> mutate(direction = "sensitive") + ) |> + mutate(drug = as.character(drug), omic = omic_label) + }) +} + +# ---------- long-format enrichment and significance filter ---------- +prot_long <- flatten_by_direction(prot_enrich, "global") +phos_long <- flatten_by_direction(phos_enrich, "phospho") +enrich_long <- bind_rows(prot_long, phos_long) |> as_tibble() +stopifnot(all(c("pathway","adj_p","direction","drug","omic") %in% names(enrich_long))) + +enrich_sig <- enrich_long |> + filter(is.finite(adj_p), !is.na(adj_p), adj_p < alpha) + +if (nrow(enrich_sig) == 0) { + message("No significant pathways at FDR < ", alpha, ".") + pathway_summary <- tibble() + drug_counts <- tibble() +} else { + pathway_summary <- enrich_sig |> + group_by(omic, direction, pathway) |> + summarise(n_drugs = n_distinct(drug), .groups = "drop") |> + arrange(desc(n_drugs)) + + all_drugs <- sort(unique(enrich_long$drug)) + drug_counts <- enrich_sig |> + group_by(drug, direction) |> + summarise(n_pathways = n_distinct(pathway), .groups = "drop") |> + complete(drug = all_drugs, direction = dirs, fill = list(n_pathways = 0L)) |> + arrange(drug, direction) + + # ---------- summary figures ---------- + dir.create("figs", showWarnings = FALSE) + + reorder_within <- function(x, by, within, sep = "___") { + x2 <- paste(x, within, sep = sep); stats::reorder(x2, by) + } + scale_y_reordered_wrap <- function(width = 32, sep = "___") { + ggplot2::scale_y_discrete( + labels = function(x) stringr::str_wrap(gsub(paste0(sep, ".*$"), "", x), width = width) + ) + } + + pathway_summary_top <- pathway_summary |> + group_by(omic, direction) |> + slice_max(order_by = n_drugs, n = topN, with_ties = FALSE) |> + ungroup() |> + mutate(pathway_in_omic = reorder_within(pathway, n_drugs, omic)) + + p_pathways <- ggplot(pathway_summary_top, + aes(y = pathway_in_omic, x = n_drugs, fill = direction)) + + geom_col(position = position_dodge(width = 0.85), width = 0.85) + + facet_wrap(~ omic, scales = "free_y") + + scale_y_reordered_wrap(width = 36) + + scale_x_continuous(expand = expansion(mult = c(0, 0.05))) + + labs(title = paste0("Top ", topN, " pathways enriched across drugs"), + y = "Pathway", x = "# Drugs") + + theme_minimal(base_size = 11) + + theme( + legend.position = "top", + strip.text = element_text(face = "bold"), + axis.text.y = element_text(size = 7), + panel.grid.major.x = element_blank(), + plot.margin = margin(5.5, 30, 5.5, 5.5) + ) + + coord_cartesian(clip = "off") + + # --- PDF saves (summary figs) --- + ggsave("figs/pathways_across_drugs_top15.pdf", p_pathways, + width = 12, height = 10, units = "in", device = cairo_pdf) + print(p_pathways) + + drug_counts_full <- drug_counts |> + group_by(drug) |> + mutate(total = sum(n_pathways)) |> + ungroup() |> + mutate(drug = forcats::fct_reorder(drug, total)) + + p_counts <- ggplot(drug_counts_full, aes(x = drug, y = n_pathways, fill = direction)) + + geom_col(position = position_dodge(width = 0.9), width = 0.85) + + labs(title = "Number of enriched pathways per drug", + x = "Drug", y = "# Pathways") + + theme_minimal(base_size = 11) + + theme( + legend.position = "top", + axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 6), + panel.grid.major.x = element_blank() + ) + + ggsave("figs/enriched_pathways_per_drug_wide.pdf", p_counts, + width = 18, height = 7, units = "in", device = cairo_pdf) + print(p_counts) +} + +# ---------- Top-2 most efficacious & most variable drugs ---------- +top2_efficacious <- character(0) +top2_variable <- character(0) + +if (exists("fits")) { + eff_tbl <- fits |> + filter(dose_response_metric == "uM_viability") |> + group_by(improve_drug_id) |> + summarise( + meanResponse = mean(dose_response_value, na.rm = TRUE), + variability = sd(dose_response_value, na.rm = TRUE), + nMeasured = dplyr::n_distinct(improve_sample_id), + .groups = "drop" + ) + top2_efficacious <- eff_tbl |> + arrange(meanResponse, desc(nMeasured)) |> + slice_head(n = 2) |> + pull(improve_drug_id) + top2_variable <- eff_tbl |> + arrange(desc(variability), desc(nMeasured)) |> + slice_head(n = 2) |> + pull(improve_drug_id) +} else { + some <- unique(enrich_long$drug) + top2_efficacious <- head(some, 2) + top2_variable <- head(rev(some), 2) +} + +# ---- Force-include Onalespib in top_interest (case-insensitive) ---- +ona_matches <- unique(enrich_long$drug[grepl("^onalespib$", enrich_long$drug, ignore.case = TRUE)]) +if (length(ona_matches) == 0) ona_matches <- "Onalespib" + +top_interest <- unique(c(top2_efficacious, top2_variable, ona_matches)) +message("Top-2 most efficacious (lowest mean viability): ", paste(top2_efficacious, collapse = ", ")) +message("Top-2 most variable (highest SD): ", paste(top2_variable, collapse = ", ")) +message("Force-included: ", paste(ona_matches, collapse = ", ")) + +# ---------- per-drug pathway barplots: always top 15, star-annotate significance ---------- +pick_pcol_plot <- function(df) { + cols <- colnames(df) + if ("BH_pvalue" %in% cols) return("BH_pvalue") + if ("SignedBH_pvalue" %in% cols) return("SignedBH_pvalue") + if ("adj.P.Val" %in% cols) return("adj.P.Val") + if ("padj" %in% cols) return("padj") + if ("pvalue" %in% cols) return("pvalue") + if ("P.Value" %in% cols) return("P.Value") + NA_character_ +} + +sig_stars <- function(p) dplyr::case_when( + is.na(p) ~ "", + p < 0.001 ~ "***", + p < 0.01 ~ "**", + p < 0.05 ~ "*", + TRUE ~ "" +) + +prep_plot_df <- function(res_df, n = 15) { + if (is.null(res_df)) return(NULL) + df <- as.data.frame(res_df); if (!nrow(df)) return(NULL) + if (!("feature" %in% names(df))) df <- tibble::rownames_to_column(df, "feature") + col <- pick_pcol_plot(df); if (is.na(col)) return(NULL) + + # Always compute BH adj p; then take top 15 by smallest adj_p (no significance filter) + adj_p <- if (col %in% c("pvalue","P.Value")) p.adjust(df[[col]], method = "BH") else df[[col]] + + df |> + mutate( + adj_p = as.numeric(adj_p), + score = -log10(pmax(adj_p, 1e-300)), + stars = sig_stars(adj_p), + signif = !is.na(adj_p) & adj_p < 0.05, + feature = stringr::str_wrap(as.character(feature), width = 40) + ) |> + filter(is.finite(adj_p)) |> + arrange(adj_p, desc(score)) |> + slice_head(n = n) +} + +plot_bar <- function(df_plot, title_text, label_type) { + if (is.null(df_plot) || !nrow(df_plot)) return(NULL) + lbl <- ifelse(label_type == "top", "Resistant pathways", "Sensitive pathways") + fillc <- ifelse(label_type == "top", "#D7191C", "#2C7BB6") + star_pad <- 0.15 + + ggplot(df_plot, aes(x = reorder(feature, score), y = score)) + + geom_col(aes(alpha = signif), fill = fillc, width = 0.85) + + scale_alpha_manual(values = c(`FALSE` = 0.5, `TRUE` = 1), guide = "none") + + geom_text(aes(y = score + star_pad, label = stars), + size = 3, hjust = 0) + + coord_flip(clip = "off") + + scale_y_continuous(expand = expansion(mult = c(0, 0.15))) + + labs(title = paste0(title_text, ", ", lbl), + x = NULL, y = expression(-log[10]("FDR"))) + + theme_minimal(base_size = 9) + + theme( + plot.title = element_text(size = 11), + axis.text.y = element_text(size = 7), + axis.text.x = element_text(size = 8), + plot.margin = margin(5.5, 30, 5.5, 5.5) + ) +} + +safelabel <- function(x) gsub("[^A-Za-z0-9_.-]", "_", x) + +dir.create("figs", showWarnings = FALSE) +for (drug in top_interest) { + pt <- if (drug %in% names(prot_enrich)) prot_enrich[[drug]]$top else NULL + pb <- if (drug %in% names(prot_enrich)) prot_enrich[[drug]]$bottom else NULL + ft <- if (drug %in% names(phos_enrich)) phos_enrich[[drug]]$top else NULL + fb <- if (drug %in% names(phos_enrich)) phos_enrich[[drug]]$bottom else NULL + + p1 <- plot_bar(prep_plot_df(pt, n = 15), paste0(drug, " - global"), "top") + p2 <- plot_bar(prep_plot_df(pb, n = 15), paste0(drug, " - global"), "bottom") + p3 <- plot_bar(prep_plot_df(ft, n = 15), paste0(drug, " - Phospho"), "top") + p4 <- plot_bar(prep_plot_df(fb, n = 15), paste0(drug, " - Phospho"), "bottom") + + if (!is.null(p1)) { + ggsave(file.path("figs", paste0("pathways_", safelabel(drug), "_global_resistant_top15.pdf")), + p1, width = 7, height = 5, device = cairo_pdf); print(p1) + } + if (!is.null(p2)) { + ggsave(file.path("figs", paste0("pathways_", safelabel(drug), "_global_sensitive_top15.pdf")), + p2, width = 7, height = 5, device = cairo_pdf); print(p2) + } + if (!is.null(p3)) { + ggsave(file.path("figs", paste0("pathways_", safelabel(drug), "_phospho_resistant_top15.pdf")), + p3, width = 7, height = 5, device = cairo_pdf); print(p3) + } + if (!is.null(p4)) { + ggsave(file.path("figs", paste0("pathways_", safelabel(drug), "_phospho_sensitive_top15.pdf")), + p4, width = 7, height = 5, device = cairo_pdf); print(p4) + } +} + + + +``` + + + + +# Print siginficant results for all drugs if we want. +```{r} +library(dplyr) +library(ggplot2) +library(tibble) +library(rlang) + +alpha <- 0.05 # significance threshold +top_n_to_show <- 15 + +# Prefer adjusted p if available; fall back to raw and adjust per-run +pick_pcol <- function(df) { + cols <- colnames(df) + if ("BH_pvalue" %in% cols) return(list(kind="adj", col="BH_pvalue")) + if ("SignedBH_pvalue" %in% cols) return(list(kind="adj", col="SignedBH_pvalue")) + if ("pvalue" %in% cols) return(list(kind="raw", col="pvalue")) + return(NULL) +} + +prep_plot_df <- function(res_df, n = top_n_to_show) { + if (is.null(res_df)) return(NULL) + df <- as.data.frame(res_df) + if (!nrow(df)) return(NULL) + + pick <- pick_pcol(df) + if (is.null(pick)) return(NULL) + + # unify to adj p + if (pick$kind == "adj") { + df <- df %>% mutate(adj_p = !!sym(pick$col)) + } else { # raw p → adjust within this run + df <- df %>% mutate(adj_p = p.adjust(!!sym(pick$col), method = "BH")) + } + + df %>% + rownames_to_column("feature") %>% + arrange(adj_p) %>% + # keep only significant ones; if none, return empty (caller will message) + filter(is.finite(adj_p), !is.na(adj_p), adj_p < alpha) %>% + head(n) %>% + mutate(score = -log10(pmax(adj_p, 1e-300))) +} + +plot_bar <- function(df_plot, title_text, label_type) { + if (is.null(df_plot) || !nrow(df_plot)) { + message(" No significant pathways for ", title_text, " (FDR<", alpha, ").") + return(NULL) + } + # Correct labels & colors for uM_viability convention: + # TOP -> resistant (red), BOTTOM -> sensitive (blue) + lbl <- ifelse(label_type == "top", "Resistant pathways", "Sensitive pathways") + fillc <- ifelse(label_type == "top", "#D7191C", "#2C7BB6") + + p <- ggplot(df_plot, aes(x = reorder(feature, score), y = score)) + + geom_col(fill = fillc) + + coord_flip() + + labs(title = paste0(title_text, ", ", lbl), + x = NULL, y = expression(-log[10]("FDR"))) + + theme_minimal(base_size = 9) + + theme( + plot.title = element_text(size = 11), + axis.text.y = element_text(size = 6), + axis.text.x = element_text(size = 8) + ) + print(p); invisible(p) +} + +plot_drug_panels <- function(drug) { + message("\n=== ", drug, " ===") + + # global + pt <- if (drug %in% names(prot_enrich)) prot_enrich[[drug]]$top else NULL + pb <- if (drug %in% names(prot_enrich)) prot_enrich[[drug]]$bottom else NULL + plot_bar(prep_plot_df(pt), paste0(drug, ", Global Proteomics"), "top") + plot_bar(prep_plot_df(pb), paste0(drug, ", Global Proteomics"), "bottom") + + # Phospho pathways + ft <- if (drug %in% names(phos_enrich)) phos_enrich[[drug]]$top else NULL + fb <- if (drug %in% names(phos_enrich)) phos_enrich[[drug]]$bottom else NULL + plot_bar(prep_plot_df(ft), paste0(drug, ", Phosphoproteomics"), "top") + plot_bar(prep_plot_df(fb), paste0(drug, ", Phosphoproteomics"), "bottom") +} + + +# --- run for your drugs --- +for (d in target_drugs) plot_drug_panels(d) + +``` + + + + +# Basic drug list +```{r} +drug_counts <- fits %>% + filter(dose_response_metric == "uM_viability") %>% + group_by(improve_drug_id) %>% + summarise( + n_rows = dplyr::n(), # total rows/measurements + n_specimens = n_distinct(improve_sample_id), # unique samples tested + meanResponse = mean(dose_response_value, na.rm = TRUE), + sdResponse = sd(dose_response_value, na.rm = TRUE), + .groups = "drop" + ) %>% + arrange(desc(n_specimens), improve_drug_id) + +# Plain list of drugs + total count +drug_list <- sort(unique(drug_counts$improve_drug_id)) +n_drugs <- length(drug_list) + +message(sprintf("Total unique drugs: %d", n_drugs)) +print((drug_list)) + +``` + + + + + + + + + + + + + + + + + + + + + +## Visualization +How should we visualize? Here is some older code +```{r plot cors, eval=FALSE} + +plotCors <- function(features,druglist,dataType='proteins'){ + ##subset a list of features and drugs and plot those in a graph + require(ggplot2) + if(dataType=='proteins'){ + ptab<-glong|>dplyr::rename(feature='Gene') + }else{ + ptab<-plong|>dplyr::rename(feature='site') + } + dtab<-fits|> + subset(dose_response_metric=='uM_viability')|> + dplyr::rename(Specimen='improve_sample_id',Drug='improve_drug_id')|> + subset(Drug%in%druglist) + + + ftab<-features|>left_join(ptab)|>left_join(dtab)|> + subset(!is.na(Drug)) + + feats <- unique(features$feature) + plots <- lapply(feats,function(x){ + corval <- ftab[ftab$feature==x,'cor'] + #corval <- ftab[ftab$feature==x,'pCor'] + + ftab|>subset(feature==x)|> + ggplot(aes(x=correctedAbundance,y=dose_response_value, + col=Patient,size=1))+ + geom_point()+ + facet_grid(~Drug)+ + ggtitle(paste(x,'Drug correlation'))+ + scale_color_manual(values=pcols) + }) + cowplot::plot_grid(plotlist= plots,ncol=2) + +} + +druglist<-c('Onalespib') +features<-subset(allcor,drug%in%druglist)|> + subset(fdr<0.25)|> + subset(abs(cor)>0.7)|> + subset(data=='proteins')|> + arrange(desc(abs(cor))) + +plotCors(rename(features[1:10,],feature='gene'),druglist) + +ggsave('onalespibFDR0.25Cors.pdf',height=20) + +``` + +# Random forest predictor + +Here we try to use random forest to extract predictive features. First we need to assess if the model can accurately predict drug response from the data. From those predictive models, we can extract features/biomarkers. + +First we build the data frames needed - I've included cohort as a covariate but may remove it. + +```{r random forest} +## separate out cohorts for prediction +cohorts <- meta |> + select(Specimen,cohort) |> + distinct() |> + tibble::column_to_rownames('Specimen') + +##for each drug, build model of cohort + protein ~ drug response +#removed cohort for now +gdf <- as.data.frame(glob_dat)#|>mutate(cohort=cohorts[rownames(glob_dat),'cohort']) +gnas <- which(apply(gdf,2,function(x) any(is.na(x)))) + +pdf <- as.data.frame(phos_dat)#|> mutate(cohort=cohorts[rownames(phos_dat),'cohort']) +pnas <- which(apply(pdf,2,function(x) any(is.na(x)))) + +mdf <- meta[-c(2,5),]|> ##have duplication here + tibble::column_to_rownames('Specimen') + + +``` + +Now we can loop through every drug, build model, and assess accuracy. + +```{r evaluate predictivty} + +#trying ou tthis function tos ee how it goes +rfFeatures <- function(drug_dat,fdf, mdf){ + complete_drugs <- which(apply(drug_dat,2, + function(x) length(which(!is.na(x)))==length(x))) + print(paste("Evaluating random forest for ",length(complete_drugs),'drugs')) + all_preds <- do.call(rbind,lapply(names(complete_drugs),function(drug){ + + dg <- fdf#[,-gnas] + + ##create the metadata df with the drug of interest + dmdf <- mdf |> + mutate(drug=drug_dat[rownames(mdf),drug]) + + rf <- randomForest::randomForest(x=dg, + y=drug_dat[rownames(dg),drug], + importance=TRUE,ntree=500) + + im <- randomForest::importance(rf)|> + as.data.frame() |> + mutate(drug=drug) + pord <- intersect(rownames(mdf)[order(drug_dat[rownames(mdf),drug])],rownames(glob_dat)) + + #pheatmap::pheatmap(t(glob_dat[pord, + # rownames(im)]),annotation_col=dmdf, + # cellheight=10,cluster_cols = TRUE) + return(im) + ##what do we return?x + })) + return(all_preds) +} + + +``` + +Now what do we do with the importance features? + +```{r rf processing} + +##get importance for global +gimp <- rfFeatures(drug_dat=drug_dat,fdf=gdf,mdf=mdf) + +##get importance for phospho +pimp <- rfFeatures(drug_dat=drug_dat,fdf=pdf,mdf=mdf) + +``` + + + + + + + + + + + + + + + + + +# Old correlation code, dont run + +now we can visualize correlations + +```{r check out HSP90s, eval=FALSE} + +hsps<-unique(glong$Genes[grep('^HSP',glong$Genes)]) + +cor_hsps<-subset(allcor,gene%in%hsps)|>subset(drug=='Onalespib')|>subset(corp<0.2) +#print(paste('measured',length(hsps),'HSPs in global data of which', nrow(cor_hsps),' are correlated with Onalespib')) + +plotCors(rename(cor_hsps,feature='gene'),c('Onalespib')) + +ggsave('hspCorsOna.pdf',height=nrow(cor_hsps)*3) + +hspp<-unique(plong$site[grep('^HSP',plong$site)]) +cor_hspps<-subset(allcor,gene%in%hspp)|>subset(drug=='Onalespib')|>subset(corp<0.2) +#print(paste('measured',length(hspp),'HSPs in phospho data of which',nrow(cor_hspps),' are correlated with Onalespib')) + +plotCors(rename(cor_hspps,feature='gene'),c('Onalespib'),'phospho') + +ggsave('hspPhosphoCorsOna.pdf',height=nrow(cor_hspps)*3) + + +``` + +### Now lets look only at IC50 values + +There are a few drugs for which we have IC50 values + +```{r check ic50 cors,warning=FALSE,error=FALSE, eval=FALSE} + +ifits<-subset(fits,dose_response_metric=='fit_ic50') + +shared<-intersect(ifits$improve_sample_id,glong$Specimen) +print(paste('Found',length(shared),'shared samples')) + +## a full join might be a challenge, maybe just take two matrices +drug_dat <- ifits|> + dplyr::select(improve_sample_id,improve_drug_id,dose_response_value)|> + tidyr::pivot_wider(names_from='improve_drug_id',values_from='dose_response_value',values_fn=mean)|> + tibble::column_to_rownames('improve_sample_id') + +gres<-cor(drug_dat[shared,],glob_dat[shared,],use='pairwise.complete.obs',method='pearson')|> + as.data.frame()|> + tibble::rownames_to_column('drug')|> + tidyr::pivot_longer(cols=2:(1+ncol(glob_dat)),names_to='gene',values_to='pCor')|> + arrange(desc(pCor)) + +##now lets try to get significance + +gsig<-do.call(rbind,lapply(colnames(drug_dat),function(x){ + do.call(rbind,lapply(colnames(glob_dat),function(y){ + pval<-1.0 + try(pval<-cor.test(drug_dat[shared,x],glob_dat[shared,y],use='pairwise.complete.obs',method='pearson')$p.value,silent=TRUE) + return(c(corp=pval,drug=x,gene=y)) + } + ))|>as.data.frame()|> + mutate(fdr=p.adjust(unlist(corp),method='fdr')) + }))|> + as.data.frame()|> + mutate(drug=unlist(drug))|> + mutate(gene=unlist(gene)) + +fullcors<-gres|>left_join(data.frame(gsig))|>mutate(data='global') + +pres<-cor(drug_dat[shared,],phos_dat[shared,],use='pairwise.complete.obs',method='pearson')|> + as.data.frame()|> + tibble::rownames_to_column('drug')|> + tidyr::pivot_longer(cols=2:(1+ncol(phos_dat)),names_to='site',values_to='pCor')|> + arrange(desc(pCor)) + +##now lets look at correlations + +psig<-do.call(rbind,lapply(colnames(drug_dat),function(x){ + do.call(rbind,lapply(colnames(phos_dat),function(y){ + pval<-1.0 + try(pval<-cor.test(drug_dat[shared,x],phos_dat[shared,y],use='pairwise.complete.obs',method='pearson')$p.value,silent=TRUE) + return(c(corp=pval,drug=x,gene=y)) + } + ))|>as.data.frame()|> + mutate(fdr=p.adjust(unlist(corp),method='fdr')) + }))|>as.data.frame() + +fullpcors<-pres|>rename(gene='site')|>left_join(data.frame(psig))|>mutate(data='phospho') + +#combine all correlations +allcor<-rbind(fullcors,fullpcors)|> + mutate(direction=ifelse(pCor<0,'neg','pos')) + + +##lets count the correlations and plot + +corsummary<-allcor|>subset(fdr<0.1)|> + group_by(drug,data,direction)|> + summarize(features=n(),meanCor=mean(pCor)) + +corsummary|> + #subset(features>1)|> + ggplot(aes(x=data,y=features,fill=drug))+ + facet_grid(~direction)+ + geom_bar(position='dodge',stat='identity') + + + +``` + +Again we have onalespib with numerous significantly correlated proteins, and one phosphosite for digoxin showing up . + +```{r plot individual sites, eval=FALSE} + +druglist<-c('Onalespib') +features<-subset(allcor,drug%in%druglist)|> + subset(fdr<0.05)|> + subset(data=='global') + +plotCors(rename(features,feature='gene'),druglist) + + +druglist<-c('Digoxin') +features<-subset(allcor,drug%in%druglist)|> + subset(fdr<0.1)|> + subset(data=='phospho') + +#plotCors(rename(features,feature='gene'),druglist,data='phospho') + + + +``` + +Now we can check the HSP proteins directly + +```{r HSP correlation, eval=FALSE} +hsps<-unique(glong$Genes[grep('^HSP',glong$Genes)]) + +cor_hsps<-subset(allcor,gene%in%hsps)|>subset(drug=='Onalespib')|>subset(corp<0.05) + +print(paste('measured',length(hsps),'HSPs in global data of which', nrow(cor_hsps),' are correlated with Onalespib')) + +plotCors(rename(cor_hsps,feature='gene'),'Onalespib') + +hspp<-unique(plong$site[grep('^HSP',plong$site)]) +cor_hspps<-subset(allcor,gene%in%hspp)|>subset(drug=='Onalespib')|>subset(corp<0.1) + +print(paste('measured',length(hspp),'HSPs in phospho data of which',nrow(cor_hspps),' are correlated with Onalespib')) +``` + +The IC50 result is similar to the viability. Now we can check AUC + +```{r auc hsp check, error=FALSE, warning=FALSE, eval=FALSE} + +ifits<-subset(fits,dose_response_metric=='auc') + +shared<-intersect(ifits$improve_sample_id,glong$Specimen) +print(paste('Found',length(shared),'shared samples')) + +## a full join might be a challenge, maybe just take two matrices +drug_dat <- ifits|> + dplyr::select(improve_sample_id,improve_drug_id,dose_response_value)|> + tidyr::pivot_wider(names_from='improve_drug_id',values_from='dose_response_value',values_fn=mean)|> + tibble::column_to_rownames('improve_sample_id') + +gres<-cor(drug_dat[shared,],glob_dat[shared,],use='pairwise.complete.obs',method='pearson')|> + as.data.frame()|> + tibble::rownames_to_column('drug')|> + tidyr::pivot_longer(cols=2:(1+ncol(glob_dat)),names_to='gene',values_to='pCor')|> + arrange(desc(pCor)) + +##now lets try to get significance + +gsig<-do.call(rbind,lapply(colnames(drug_dat),function(x){ + do.call(rbind,lapply(colnames(glob_dat),function(y){ + pval<-1.0 + try(pval<-cor.test(drug_dat[shared,x],glob_dat[shared,y],use='pairwise.complete.obs',method='pearson')$p.value,silent=TRUE) + return(c(corp=pval,drug=x,gene=y)) + } + ))|>as.data.frame()|> + mutate(bh_p=p.adjust(unlist(corp),method='BH')) + }))|> + as.data.frame()|> + mutate(drug=unlist(drug))|> + mutate(gene=unlist(gene)) + +fullcors<-gres|>left_join(data.frame(gsig))|>mutate(data='global') + +pres<-cor(drug_dat[shared,],phos_dat[shared,],use='pairwise.complete.obs',method='pearson')|> + as.data.frame()|> + tibble::rownames_to_column('drug')|> + tidyr::pivot_longer(cols=2:(1+ncol(phos_dat)),names_to='site',values_to='pCor')|> + arrange(desc(pCor)) + +##now lets look at correlations + +psig<-do.call(rbind,lapply(colnames(drug_dat),function(x){ + do.call(rbind,lapply(colnames(phos_dat),function(y){ + pval<-1.0 + try(pval<-cor.test(drug_dat[shared,x],phos_dat[shared,y],use='pairwise.complete.obs',method='pearson')$p.value,silent=TRUE) + return(c(corp=pval,drug=x,gene=y)) + } + ))|>as.data.frame()|> + mutate(bh_p=p.adjust(unlist(corp),method='BH')) + }))|>as.data.frame() + +fullpcors<-pres|>rename(gene='site')|>left_join(data.frame(psig))|>mutate(data='phospho') + +#combine all correlations +allcor<-rbind(fullcors,fullpcors)|> + mutate(direction=ifelse(pCor<0,'neg','pos')) + + +##lets count the correlations and plot + +corsummary<-allcor|>subset(bh_p<0.1)|> + group_by(drug,data,direction)|> + summarize(features=n(),meanCor=mean(pCor)) + +corsummary|> + #subset(features>1)|> + ggplot(aes(x=data,y=features,fill=drug))+facet_grid(~direction)+geom_bar(position='dodge',stat='identity') + +print(corsummary) + +``` + +```{r HSP correlation again, eval=FALSE} +hsps<-unique(glong$Genes[grep('^HSP',glong$Genes)]) + +cor_hsps<-subset(allcor,gene%in%hsps)|>s +ubset(drug=='Onalespib')|>subset(corp<0.05) +print(paste('measured',length(hsps),'HSPs in global data of which', nrow(cor_hsps),' are correlated with Onalespib')) +cor_hsps + +plotCors(rename(cor_hsps, feature='gene'),'Onalespib') + +hspp<-unique(plong$site[grep('^HSP',plong$site)]) +cor_hspps<-subset(allcor,gene%in%hspp)|>subset(drug=='Onalespib')|>subset(corp<0.1) +print(paste('measured',length(hspp),'HSPs in phospho data of which',nrow(cor_hspps),' are correlated with Onalespib')) +``` diff --git a/03_drug_biomarkers_legacy_code.Rmd b/legacy_code/03_drug_biomarkers_legacy_code.Rmd similarity index 100% rename from 03_drug_biomarkers_legacy_code.Rmd rename to legacy_code/03_drug_biomarkers_legacy_code.Rmd From 08f7fe7e6baa3876e57557969641c0aa360aacdf Mon Sep 17 00:00:00 2001 From: Jeremy Date: Fri, 5 Dec 2025 09:16:50 -0800 Subject: [PATCH 07/13] moved 03_analyze_modality_correlations.R out of legacy code --- ..._modality_correlations.R => 03_analyze_modality_correlations.R | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename legacy_code/03_analyze_modality_correlations.R => 03_analyze_modality_correlations.R (100%) diff --git a/legacy_code/03_analyze_modality_correlations.R b/03_analyze_modality_correlations.R similarity index 100% rename from legacy_code/03_analyze_modality_correlations.R rename to 03_analyze_modality_correlations.R From f03c897e9e927deee538e1da0cafd22acefb5ad4 Mon Sep 17 00:00:00 2001 From: Jeremy Date: Mon, 12 Jan 2026 13:56:18 -0800 Subject: [PATCH 08/13] Most of the requested changes have been addressed. Code is currently being tested so there may be more updates --- cNF_helper_code.R => 00_cNF_helper_code.R | 2 +- ...ics.R => 01_normalize_batchcorrect_omics.R | 67 ++- ...ns.R => 02_analyze_modality_correlations.R | 32 +- 04_leapr_biomarker.R => 03_leapr_biomarker.R | 141 ++++- README.md | 12 - .../01_run_normalize_omics.Rmd | 136 +++-- analysis/02_analyze_modality.Rmd | 78 +++ .../03_pathway_enrichment.Rmd | 71 +-- analysis/info.md | 489 ++++++++++++++++++ 9 files changed, 855 insertions(+), 173 deletions(-) rename cNF_helper_code.R => 00_cNF_helper_code.R (56%) rename 02_normalize_batchcorrect_omics.R => 01_normalize_batchcorrect_omics.R (89%) rename 03_analyze_modality_correlations.R => 02_analyze_modality_correlations.R (85%) rename 04_leapr_biomarker.R => 03_leapr_biomarker.R (74%) rename 02_run_normalize_omics.Rmd => analysis/01_run_normalize_omics.Rmd (53%) create mode 100644 analysis/02_analyze_modality.Rmd rename 04_analyze_modality_and_pathway_enrich.Rmd => analysis/03_pathway_enrichment.Rmd (56%) create mode 100644 analysis/info.md diff --git a/cNF_helper_code.R b/00_cNF_helper_code.R similarity index 56% rename from cNF_helper_code.R rename to 00_cNF_helper_code.R index 9ae26c5..4f4eaf9 100644 --- a/cNF_helper_code.R +++ b/00_cNF_helper_code.R @@ -3,7 +3,7 @@ library(synapser) -synLogin() +synLogin(authToken = "eyJ0eXAiOiJKV1QiLCJraWQiOiJXN05OOldMSlQ6SjVSSzpMN1RMOlQ3TDc6M1ZYNjpKRU9VOjY0NFI6VTNJWDo1S1oyOjdaQ0s6RlBUSCIsImFsZyI6IlJTMjU2In0.eyJhY2Nlc3MiOnsic2NvcGUiOlsidmlldyIsImRvd25sb2FkIl0sIm9pZGNfY2xhaW1zIjp7fX0sInRva2VuX3R5cGUiOiJQRVJTT05BTF9BQ0NFU1NfVE9LRU4iLCJpc3MiOiJodHRwczovL3JlcG8tcHJvZC5wcm9kLnNhZ2ViYXNlLm9yZy9hdXRoL3YxIiwiYXVkIjoiMCIsIm5iZiI6MTc2MTgzOTMwNCwiaWF0IjoxNzYxODM5MzA0LCJqdGkiOiIyNzg5OSIsInN1YiI6IjM0NTM5NTUifQ.OkzJaWKY7po87mP_iKDNP8ah--BsZpb-mH8U1wP2kj2yqnxi65yi1irAgNNITXB-mQ84ndpdV-9SfNCpkc-pUIenHADpLTMPbh2dMLAiCS77picR-bLlb79c7e005L-7EWuMBPh7RXAXFOcEwY3823Z-RsjKatGYZXCiGaC_I0OPuawh1c3y_c2VVsZUFMdfXJaQH4iaMB5wK9OA_OlI_mnwaYrL1C3ID6ZCuGdO8cR9-fwgu5NzZPq5OOyn11BSblb7NytwaEhhuteHediXBnhGSmfSCUCpQvdpppb03_-Q8DK21oyAgUSygn_h7QFIqtdfzyb8mg300D49al-yAg") syn <- list(get = synapser::synGet, store = synapser::synStore) library(readxl) library(tidyr) diff --git a/02_normalize_batchcorrect_omics.R b/01_normalize_batchcorrect_omics.R similarity index 89% rename from 02_normalize_batchcorrect_omics.R rename to 01_normalize_batchcorrect_omics.R index 5fe3611..aa00a06 100644 --- a/02_normalize_batchcorrect_omics.R +++ b/01_normalize_batchcorrect_omics.R @@ -1,9 +1,38 @@ # ============================================================================= # normalize_omics_pipeline.R -# Arguments: -# - write_outputs: master on/off for writing CSVs, PDFs, and uploading -# - save_basename: override base name used in filenames -# - do_batch_correct: skip ComBat when FALSE; use combined matrix instead +# +# Main entry point: +# run_modality(modality, batches, meta, syn, ...) +# +# Key inputs (with examples): +# +# Example batches value: +# batches <- list( +# list(syn_id = "syn69963552", cohort = 1, value_start_col = 5, fname_aliquot_index = 8), +# list(syn_id = "syn69947351", cohort = 2, value_start_col = 5, fname_aliquot_index = 9) +# ) +# +# Arguments: +# - modality: Which data type to run: "phospho", "global", or "rna" +# - batches: List of batch configs; each element should include at least: +# * syn_id: Synapse file ID for the wide feature×sample table +# * cohort: Cohort/batch label used for joining meta and for ComBat batching +# Optional per-batch fields: +# * value_start_col: Column index where sample measurement columns begin (auto-detected if NULL) +# * fname_aliquot_index: Token index (split on "_") used to parse aliquot number from sample filenames +# - meta: Sample metadata table used to join batch sample IDs to Patient/Tumor/Specimen +# (cnF_helper_code.R creates this.) +# - syn: Synapse client object used for Synapse reading/upload (synapser) +# - drop_name_substrings: Regex pattern(s); sample columns matching any pattern will be removed (QC/blank runs) +# - out_dir: Output directory for generated CSV/PDF files +# - out_prefix: Default base name for output files; if NULL (recommended), derived from modality (sanitized) +# - upload_parent_id: Synapse project/folder ID to upload outputs into +# (ignored if NULL or write_outputs=FALSE) +# - pcols: Color vector for PCA plotting (names should match Patient IDs) +# (cnF_helper_code.R creates this.) +# - write_outputs: Master toggle to write CSV/PDF outputs and perform uploads +# - save_basename: Override base name used in output filenames (supersedes out_prefix) +# - do_batch_correct: If FALSE, skip ComBat and use combined matrix instead # ============================================================================= suppressPackageStartupMessages({ @@ -590,19 +619,25 @@ perform_uploads <- function(paths, syn, parent_id) { ##### # This is how we call the function / pipeline +# Example batches value: +# batches <- list( +# list(syn_id = "syn69963552", cohort = 1, value_start_col = 5, fname_aliquot_index = 8), +# list(syn_id = "syn69947351", cohort = 2, value_start_col = 5, fname_aliquot_index = 9) +# ) + run_modality <- function( - modality, - batches, - meta, - syn, - drop_name_substrings = NULL, - out_dir = ".", - out_prefix = NULL, - upload_parent_id = NULL, - pcols = NULL, - write_outputs = TRUE, # master toggle to write CSV/PDF & upload - save_basename = NULL, # override base name used in output files - do_batch_correct = TRUE # if FALSE, skip ComBat and use combined matrix + modality, # Which data type to run: "phospho", "global", or "rna" + batches, # List of batch configs (each element should include at least: syn_id, cohort; optionally: value_start_col, fname_aliquot_index) + meta, # Sample metadata table used to join batch sample IDs to Patient/Tumor/Specimen - cnF_helper_code.R creates this. + syn, # Synapse client object used for synapse reading/upload (synapser) + drop_name_substrings = NULL, # Regex pattern - sample columns matching any pattern will be removed (QC/blank runs) + out_dir = ".", # Output directory for generated CSV/PDF files + out_prefix = NULL, # Default base name for output files; if NULL (recommended), derived from modality (sanitized) + upload_parent_id = NULL, # Synapse project ID to upload outputs into (ignored if NULL or write_outputs=FALSE) + pcols = NULL, # Color vector for PCA plotting (names should match Patient IDs) - cnF_helper_code.R creates this. + write_outputs = TRUE, # master toggle to write CSV/PDF & upload + save_basename = NULL, # override base name used in output files + do_batch_correct = TRUE # if FALSE, skip ComBat and use combined matrix ) { message("==================================================") message("Starting run_modality(): ", modality) diff --git a/03_analyze_modality_correlations.R b/02_analyze_modality_correlations.R similarity index 85% rename from 03_analyze_modality_correlations.R rename to 02_analyze_modality_correlations.R index 0ec90c1..ff3d602 100644 --- a/03_analyze_modality_correlations.R +++ b/02_analyze_modality_correlations.R @@ -1,4 +1,34 @@ -# analyze_modality.R +# --------------------------------------------------------------------------- +# 02_analyze_modality.R +# --------------------------------------------------------------------------- +# Purpose +# - Given (1) drug response fits and (2) a long-format omics table for one modality, +# this script builds sample x drug and sample x feature matrices, makes a few summary +# plots (drug efficacy/variability + optional heatmap), and computes Spearman +# correlations between drug response and molecular features. +# +# Main entry +# - analyze_modality(fits, df_long, sample_col, feature_col, value_col, ...) +# +# Inputs +# - fits: long drug response table with improve_sample_id, improve_drug_id, +# dose_response_metric, dose_response_value +# - df_long: long omics table with sample IDs + feature IDs + values +# - sample_col / feature_col / value_col: column names in df_long that identify +# the sample, the molecular feature, and the measurement to analyze +# +# Outputs (written to outdir) +# - most_efficacious.pdf, most_variable.pdf +# - drug_heatmap_large.pdf (optional; only for drugs measured in all samples) +# - cor_features_by_drug.pdf (counts of significant correlated features per drug) +# +# Returns (as a list) +# - drug_mat, feat_mat: wide matrices used for analysis +# - cor_tbl: per drug-feature correlations (Spearman) + p-values + FDR +# - cor_summary / cor_plot: summary of significant correlations +# - drug_summary: per-drug mean response, # measured, and variability +# --------------------------------------------------------------------------- + suppressPackageStartupMessages({ library(dplyr) library(tidyr) diff --git a/04_leapr_biomarker.R b/03_leapr_biomarker.R similarity index 74% rename from 04_leapr_biomarker.R rename to 03_leapr_biomarker.R index f679d82..9658046 100644 --- a/04_leapr_biomarker.R +++ b/03_leapr_biomarker.R @@ -1,4 +1,39 @@ -# leapr_biomarker.R +# --------------------------------------------------------------------------- +# 03_leapr_biomarker.R +# --------------------------------------------------------------------------- +# Purpose +# - For each drug, correlate drug response (uM_viability) with omics features across +# samples, then use leapR to find enriched pathways/genesets among: +# * TOP = features positively correlated with viability (more resistant) +# * BOTTOM = features negatively correlated with viability (more sensitive) +# +# Main entry +# - run_leapr_directional_one_cached(drugs, df_long, sample_col, feature_col, value_col, omic_label, cache_path, ...) +# +# Inputs +# - drugs: long drug-response table with improve_drug_id, improve_sample_id, +# dose_response_metric, dose_response_value +# - df_long: long omics table (sample/feature/value columns) +# - sample_col / feature_col / value_col: column names in df_long that identify +# the sample ID, feature ID, and numeric measurement +# - omic_label: label used for reporting/assay naming (e.g., "global", "rna", "phospho") +# - cache_path: .RData file path used to save/load results (skips recompute unless always_rerun=TRUE) +# +# Options +# - geneset_name / geneset_object: choose the leapR geneset DB (defaults depend on omic_label) +# - min_features: minimum number of correlated features required to run leapR for TOP/BOTTOM +# - write_csvs: write per-drug leapR tables to CSV +# - test_one: run only the first drug - make sure things are actually working. +# +# Outputs +# - Cached results saved to cache_path (if provided) +# - Optional CSVs: leapR_top_paths/dir_split/*_{TOP|BOTTOM}.csv (if write_csvs=TRUE) +# - Plots: save_leapr_plots() writes pathway barplots to figs/pathways_*.pdf +# +# Returns +# - Named list by drug: res_list[[drug]]$top and res_list[[drug]]$bottom (leapR result tables) +# --------------------------------------------------------------------------- + suppressPackageStartupMessages({ library(dplyr) @@ -219,19 +254,21 @@ long_to_matrix <- function(df_long, sample_col, feature_col, value_col) { # ----------------------------- # Main # ----------------------------- -run_leapr_directional_one_cached <- function(drugs, - df_long, - sample_col, - feature_col, - value_col, - omic_label, - cache_path, - write_csvs = FALSE, - always_rerun = FALSE, - min_features = 5, - test_one = FALSE, - geneset_name = NULL, - geneset_object = NULL) { +run_leapr_directional_one_cached <- function( + drugs, # Character vector of drugs to test (IDs/names used by your fits/model) + df_long, # Long-format omics table (one row per sample x feature) + sample_col, # Column name in df_long containing sample IDs + feature_col, # Column name in df_long containing feature IDs (e.g., gene/site) + value_col, # Column name in df_long containing numeric values to analyze + omic_label, # Short label for this modality (used in logs/output names), e.g. "RNA" + cache_path, # File path to cache (read/write) computed results + write_csvs = FALSE,# If TRUE, write result/intermediate CSVs to disk + always_rerun = FALSE,# If TRUE, ignore cache and recompute even if cache exists + min_features = 5, # Minimum # of features required to run; otherwise skip/return early + test_one = FALSE,# If TRUE, run a single test case (e.g., first drug) for debugging + geneset_name = NULL, # Optional geneset label (used for naming outputs/plot titles) + geneset_object = NULL # Optional geneset definition (e.g., character vector) to filter features + ) { # cache check! If the cached value exists, stop there. if (!always_rerun && is.character(cache_path) && nzchar(cache_path) && file.exists(cache_path)) { @@ -431,37 +468,87 @@ run_leapr_directional_one_cached <- function(drugs, # ----------------------------- # Plot and save using leapR builtin plotter # ----------------------------- -save_leapr_plots <- function(res_list, omic_label, top_n = 15) { +save_leapr_plots <- function( + res_list, + omic_label, + top_n = 15, + drugs = NULL, # NULL = plot all drugs in res_list; otherwise character vector of drug IDs/names (case-insensitive) + outdir = "figs" # output directory for PDFs +) { if (!length(res_list)) return(invisible(NULL)) - dir.create("figs", showWarnings = FALSE) + dir.create(outdir, showWarnings = FALSE, recursive = TRUE) + safelabel <- function(x) gsub("[^A-Za-z0-9_.-]", "_", x) - for (drug in names(res_list)) { + all_drugs <- names(res_list) + if (!length(all_drugs)) { + message("[save_leapr_plots] res_list has no named drug entries.") + return(invisible(NULL)) + } + + # Case-insensitive lookup table: UPPER(drug) -> original name in res_list + key_upper <- toupper(all_drugs) + drug_map <- stats::setNames(all_drugs, key_upper) + + # Decide which drugs to plot + if (is.null(drugs)) { + plot_drugs <- all_drugs + } else { + req <- as.character(drugs) + req_upper <- toupper(req) + + found_upper <- intersect(req_upper, names(drug_map)) + plot_drugs <- unname(drug_map[found_upper]) + + missing_upper <- setdiff(req_upper, names(drug_map)) + if (length(missing_upper)) { + missing_original <- req[req_upper %in% missing_upper] + message("[save_leapr_plots] Skipping ", length(missing_original), + " requested drug(s) not present in res_list (case-insensitive match): ", + paste(utils::head(missing_original, 10), collapse = ", "), + if (length(missing_original) > 10) paste0(" ... +", length(missing_original) - 10, " more") else "") + } + } + + if (!length(plot_drugs)) { + message("[save_leapr_plots] No matching drugs to plot.") + return(invisible(NULL)) + } + + for (drug in plot_drugs) { two <- res_list[[drug]] + if (is.null(two)) next # TOP (resistant) if (!is.null(two$top)) { - p_top <- leapR::plot_leapr_bar(two$top, - title = paste0(drug, " — ", omic_label, " (Resistant)"), - top_n = top_n) + p_top <- leapR::plot_leapr_bar( + two$top, + title = paste0(drug, " — ", omic_label, " (Resistant)"), + top_n = top_n + ) if (!is.null(p_top)) { - fn <- file.path("figs", paste0("pathways_", safelabel(drug), "_", omic_label, - "_resistant_top", top_n, ".pdf")) + fn <- file.path(outdir, paste0( + "pathways_", safelabel(drug), "_", omic_label, "_resistant_top", top_n, ".pdf" + )) ggplot2::ggsave(fn, p_top, width = 7, height = 5, device = grDevices::cairo_pdf) } } # BOTTOM (Sensitive) if (!is.null(two$bottom)) { - p_bot <- leapR::plot_leapr_bar(two$bottom, - title = paste0(drug, " — ", omic_label, " (Sensitive)"), - top_n = top_n) + p_bot <- leapR::plot_leapr_bar( + two$bottom, + title = paste0(drug, " — ", omic_label, " (Sensitive)"), + top_n = top_n + ) if (!is.null(p_bot)) { - fn <- file.path("figs", paste0("pathways_", safelabel(drug), "_", omic_label, - "_sensitive_top", top_n, ".pdf")) + fn <- file.path(outdir, paste0( + "pathways_", safelabel(drug), "_", omic_label, "_sensitive_top", top_n, ".pdf" + )) ggplot2::ggsave(fn, p_bot, width = 7, height = 5, device = grDevices::cairo_pdf) } } } + invisible(NULL) } diff --git a/README.md b/README.md index 69f7152..099bb0f 100644 --- a/README.md +++ b/README.md @@ -7,15 +7,3 @@ The data for this code is hosted on Synapse at http://synapse.org/cnfDrugRespons ## Current analysis We are collecting omics measurements from cNF organoid samples together with drug response data to identify potential biomarkers of drug response. -## Quick run order (and what each file sources) -1) **01_harmonize_drug_data.Rmd** - - *Sources:* `cNF_helper_code.R` - - Download & quick QC of drug fits. - -2) **02_run_normalize_omics.Rmd** - - *Sources:* `cNF_helper_code.R`, `02_normalize_batchcorrect_omics.R` - - Runs per-modality normalization and ComBat; writes long tables/plots if enabled. - -3) **04_analyze_modality_and_pathway_enrich.Rmd** - - *Sources:* `cNF_helper_code.R`, `03_analyze_modality_correlations.R`, `04_leapr_biomarker.R` - - Builds drug/feature matrices, modality correlations, leapR pathway enrichment and plots for all. diff --git a/02_run_normalize_omics.Rmd b/analysis/01_run_normalize_omics.Rmd similarity index 53% rename from 02_run_normalize_omics.Rmd rename to analysis/01_run_normalize_omics.Rmd index 1dbe95e..febb395 100644 --- a/02_run_normalize_omics.Rmd +++ b/analysis/01_run_normalize_omics.Rmd @@ -9,16 +9,14 @@ output: html_document ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) library(synapser) -synLogin() syn <- list(get = synapser::synGet, store = synapser::synStore) - -# Load helper metadata (your cNF_helper_code.R defines 'meta' and 'pcols') -source("cNF_helper_code.R") +# Load helper metadata (00_cNF_helper_code.R defines 'meta' and 'pcols') +source("00_cNF_helper_code.R") # Source the pipeline -source("02_normalize_batchcorrect_omics.R") +source("01_normalize_batchcorrect_omics.R") ``` @@ -27,51 +25,43 @@ source("02_normalize_batchcorrect_omics.R") ```{r} # --------------------------------------------------------------------------- -# run_modality() — quick reference for args & expected batch structure +# run_modality() — quick reference # --------------------------------------------------------------------------- -# Args: -# -# - modality : "global" | "phospho" | "rna" (case-insensitive). -# - batches : list of cohort specs; each has: -# syn_id, cohort, fname_aliquot_index, (optional) value_start_col -# (auto-detects; fallback = 5 for global/phospho, 3 for rna). -# - meta : data.frame joined by (aliquot, cohort). Uses Specimen/Patient/Tumor -# if present. For RNA, falls back to normalized Specimen matching. -# - syn : synapser client (e.g., syn <- synLogin()). -# - drop_name_substrings : character vector; OR-regex to drop sample cols (NULL = keep all). -# - out_dir : output dir (auto-created). -# - out_prefix: filename stem (used unless save_basename set). -# - upload_parent_id : Synapse folder ID for uploads (NULL = no upload). -# - pcols : named colors for Patient in PCA (optional). -# - write_outputs : TRUE = write CSV/PDF (+upload if parent set); FALSE = in-memory only. -# - save_basename : override file stem (else out_prefix). -# - do_batch_correct : TRUE = ComBat by cohort; FALSE = skip (adds *_noBatchCorrect). +# Inputs +# - modality: "global", "phospho", or "rna" +# - batches : list of batch configs, e.g. +# list( +# list(syn_id="syn...", cohort=1, value_start_col=5, fname_aliquot_index=8), +# list(syn_id="syn...", cohort=2, value_start_col=5, fname_aliquot_index=9) +# ) +# Required per batch: +# * syn_id : Synapse file ID for the wide feature×sample table +# * cohort : cohort/batch label (used for joining + ComBat) +# * fname_aliquot_index: token index (split on "_") used to parse aliquot from sample headers +# Optional per batch: +# * value_start_col: first sample column index (auto-detected if omitted) +# - meta: sample metadata joined by (aliquot, cohort); used to populate Specimen/Patient/Tumor +# - syn: synapser client # -# Per-modality normalization -# - phospho : 0 toNA to drop >50% missing to log2(x+0.01) to per-sample modified z. -# - global : log2(x) to per-sample modified z. -# - rna : drop >50% missing to log2(TPM+1) to per-sample modified z. +# Options +# - drop_name_substrings: regex pattern(s) to drop unwanted sample columns +# - out_dir / out_prefix / save_basename: control output file locations/names +# - write_outputs: write CSV/PDF (+upload) vs return in-memory only +# - upload_parent_id: Synapse folder/project ID for uploads +# - pcols: optional named colors for PCA (Patient -> color) +# - do_batch_correct: run ComBat (TRUE) or skip (FALSE) # -# Returns (list) -# - se_batches : per-batch normalized SEs. -# - se_combined: intersection-features combined SE. -# - se_corrected: post-ComBat SE (NULL if do_batch_correct=FALSE). -# - se_post : SE used “post” (se_corrected or se_combined). -# - did_combat: TRUE/FALSE. -# - long_pre : long table from se_combined (finite only). -# - long_post : long table from se_post. -# - pca_df_pre, pca_df_post : PCA inputs (complete-case features). -# - plots : pre_pca, pre_hist, pca, hist (ggplot). -# - files : if write_outputs=TRUE, $queued = written file paths. +# What it does +# - Normalizes each batch (per modality), combines batches on shared features, +# optionally runs ComBat by cohort, then exports long tables + PCA/hist plots. # -# Notes -# - ComBat drops samples with NA cohort; requires colData(se)$cohort. -# - Global: splits multi-symbol Genes by ';'. RNA aliquot may be NA (Specimen fallback). -# - Sample headers auto-detected if path-like (*.raw, *.mzML, paths). +# Returns +# - SummarizedExperiments: se_batches, se_combined, se_post (and se_corrected if ComBat ran) +# - Long tables: long_pre, long_post +# - PCA data + plots: pca_df_pre/post, plots # --------------------------------------------------------------------------- - # Substrings to drop (These were the protocol optimization samples) drop_subs <- c( "cNF_organoid_DIA_G_02_11Feb25", @@ -82,29 +72,33 @@ drop_subs <- c( "cNF_organoid_DIA_P_06_11Feb25" ) -# PHOSPHO BATCHES -phospho_batches <- list( - list(syn_id = "syn69963552", cohort = 1, value_start_col = 5, fname_aliquot_index = 8), - list(syn_id = "syn69947351", cohort = 2, value_start_col = 5, fname_aliquot_index = 9) +``` + +# RNA +```{r} +rna_batches <- list( + list(syn_id = "syn66352931", cohort = 1, value_start_col = 5, fname_aliquot_index = 6), + list(syn_id = "syn70765053", cohort = 2, value_start_col = 5, fname_aliquot_index = 7) ) -# Run -phospho <- run_modality( - modality = "Phospho", - batches = phospho_batches, +rna <- run_modality( + modality = "rna", + batches = rna_batches, meta = meta, syn = syn, drop_name_substrings = drop_subs, - out_dir = "phospho_test", - out_prefix = "phospho", - upload_parent_id = "syn70078365", + out_dir = "rna_test", + out_prefix = "rna", + upload_parent_id = "syn71099587", pcols = pcols, write_outputs = FALSE, - save_basename = "phospho_batch12_corrected", - do_batch_correct = TRUE + save_basename = "RNA_12_no_batch_correct", + do_batch_correct = FALSE #Note this is set to false right now. Doesn't appear to be needed for RNA ) +``` -# GLOBAL BATCHES +# Global proteomics +```{r} global_batches <- list( list(syn_id = "syn69947355", cohort = 1, value_start_col = 5, fname_aliquot_index = 6), list(syn_id = "syn69947352", cohort = 2, value_start_col = 5, fname_aliquot_index = 7) @@ -126,25 +120,29 @@ global <- run_modality( ) -# RNA BATCHES -rna_batches <- list( - list(syn_id = "syn66352931", cohort = 1, value_start_col = 5, fname_aliquot_index = 6), - list(syn_id = "syn70765053", cohort = 2, value_start_col = 5, fname_aliquot_index = 7) +``` + +#Phospho Proteomics +```{r} +phospho_batches <- list( + list(syn_id = "syn69963552", cohort = 1, value_start_col = 5, fname_aliquot_index = 8), + list(syn_id = "syn69947351", cohort = 2, value_start_col = 5, fname_aliquot_index = 9) ) -rna <- run_modality( - modality = "rna", - batches = rna_batches, +phospho <- run_modality( + modality = "Phospho", + batches = phospho_batches, meta = meta, syn = syn, drop_name_substrings = drop_subs, - out_dir = "rna_test", - out_prefix = "rna", - upload_parent_id = "syn71099587", + out_dir = "phospho_test", + out_prefix = "phospho", + upload_parent_id = "syn70078365", pcols = pcols, write_outputs = FALSE, - save_basename = "RNA_12_no_batch_correct", - do_batch_correct = FALSE #Note this is set to false right now. Not needed for RNA + save_basename = "phospho_batch12_corrected", + do_batch_correct = TRUE ) ``` + diff --git a/analysis/02_analyze_modality.Rmd b/analysis/02_analyze_modality.Rmd new file mode 100644 index 0000000..6b4141a --- /dev/null +++ b/analysis/02_analyze_modality.Rmd @@ -0,0 +1,78 @@ +--- +title: "run_analyze_modality_and_enrichment" +author: "JJ" +date: "2025-11-12" +output: html_document +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) + + +source("00_cNF_helper_code.R") +source("02_analyze_modality_correlations.R") +``` + +# Load in Data +```{r} + +#Drugs +drugs <- readr::read_tsv(synGet("syn69947322")$path) + +#RNA +rlong <- readr::read_csv(synGet('syn71333780')$path) + +#Global +glong <- readr::read_csv(synGet('syn70078416')$path) + +#Phospho +plong <- readr::read_csv(synGet('syn70078415')$path) + +``` + + +# RNA +```{r} +corr_rna <- analyze_modality( + fits = drugs, + df_long = rlong, + sample_col = "Specimen", + feature_col = "feature_id", + value_col = "correctedAbundance", + metric = "uM_viability", + heatmap_filename = "rna_drug_heatmap_large_viability.pdf", + outdir = "new_figs" +) + +``` + + +# Global Proteomics +```{r} +corr_global <- analyze_modality( + fits = drugs, + df_long = glong, + sample_col = "Specimen", + feature_col = "Gene", + value_col = "correctedAbundance", + metric = "uM_viability", + heatmap_filename = "global_drug_heatmap_large_viability.pdf", + outdir = "new_figs" +) +``` + + +# Phospho Proteomics +```{r} +corr_phospho <- analyze_modality( + fits = drugs, + df_long = plong, + sample_col = "Specimen", + feature_col = "site", + value_col = "correctedAbundance", + metric = "uM_viability", + heatmap_filename = "phospho_drug_heatmap_large_viability.pdf", + outdir = "new_figs" +) + +``` diff --git a/04_analyze_modality_and_pathway_enrich.Rmd b/analysis/03_pathway_enrichment.Rmd similarity index 56% rename from 04_analyze_modality_and_pathway_enrich.Rmd rename to analysis/03_pathway_enrichment.Rmd index c55b038..ddf33d4 100644 --- a/04_analyze_modality_and_pathway_enrich.Rmd +++ b/analysis/03_pathway_enrichment.Rmd @@ -8,62 +8,28 @@ output: html_document ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) - -source("cNF_helper_code.R") -source("03_analyze_modality_correlations.R") -source("04_leapr_biomarker.R") +source("00_cNF_helper_code.R") +source("03_leapr_biomarker.R") ``` - +# Load in Data ```{r} - +#Drugs drugs <- readr::read_tsv(synGet("syn69947322")$path) #RNA rlong <- readr::read_csv(synGet('syn71333780')$path) + #Global glong <- readr::read_csv(synGet('syn70078416')$path) + #Phospho plong <- readr::read_csv(synGet('syn70078415')$path) -corr_rna <- analyze_modality( - fits = drugs, - df_long = rlong, - sample_col = "Specimen", - feature_col = "feature_id", - value_col = "correctedAbundance", - metric = "uM_viability", - heatmap_filename = "rna_drug_heatmap_large_viability.pdf", - outdir = "new_figs" -) - -corr_global <- analyze_modality( - fits = drugs, - df_long = glong, - sample_col = "Specimen", - feature_col = "Gene", - value_col = "correctedAbundance", - metric = "uM_viability", - heatmap_filename = "global_drug_heatmap_large_viability.pdf", - outdir = "new_figs" -) - -corr_phospho <- analyze_modality( - fits = drugs, # syn69947322 TSV already read - df_long = plong, # your RNA or proteo/phospho long table - sample_col = "Specimen", - feature_col = "site", - value_col = "correctedAbundance", - metric = "uM_viability", - heatmap_filename = "phospho_drug_heatmap_large_viability.pdf", - outdir = "new_figs" -) - ``` - +# RNA ```{r warning=FALSE} - res_bio_rna <- run_leapr_directional_one_cached( drugs = drugs, df_long = rlong, @@ -71,13 +37,20 @@ res_bio_rna <- run_leapr_directional_one_cached( feature_col = "feature_id", value_col = "correctedAbundance", omic_label = "rna", - cache_path = "leapR_RNA_krbpaths_enrichment_direction_split.Rdata", + cache_path = "leapR_RNA_krbpaths_enrichment_direction_split_test.Rdata", write_csvs = TRUE, always_rerun = FALSE, test_one = FALSE, geneset_name = "krbpaths" ) +# Save plots to figs/pathways___{resistant|sensitive}_{top_n}.pdf +# save_leapr_plots(res_bio_rna, omic_label = "rna", drugs = c("Mirdametinib", "olaparib"), top_n = 15) + +``` + +# Global Proteomics +```{r warning=FALSE} res_bio_global <- run_leapr_directional_one_cached( drugs = drugs, @@ -86,14 +59,19 @@ res_bio_global <- run_leapr_directional_one_cached( feature_col = "Gene", value_col = "correctedAbundance", omic_label = "global", - cache_path = "leapR_Global_krbpaths_enrichment_direction_split.Rdata", + cache_path = "leapR_Global_krbpaths_enrichment_direction_split_test.Rdata", write_csvs = TRUE, always_rerun = FALSE, test_one = FALSE, geneset_name = "krbpaths" ) +# Save plots to figs/pathways___{resistant|sensitive}_{top_n}.pdf +# save_leapr_plots(res_bio_global, omic_label = "global", drugs = c("Mirdametinib", "olaparib"), top_n = 15) +``` +# Phospho Proteomics +```{r warning=FALSE} res_bio_phospho <- run_leapr_directional_one_cached( drugs = drugs, df_long = plong, @@ -101,14 +79,13 @@ res_bio_phospho <- run_leapr_directional_one_cached( feature_col = "site", value_col = "correctedAbundance", omic_label = "phospho", - cache_path = "leapR_Phospho_kinasesubstrates_enrichment_direction_split.Rdata", + cache_path = "leapR_Phospho_kinasesubstrates_enrichment_direction_split_test.Rdata", write_csvs = TRUE, always_rerun = FALSE, test_one = FALSE, geneset_name = "kinasesubstrates" ) - - - +# Save plots to figs/pathways___{resistant|sensitive}_{top_n}.pdf +# save_leapr_plots(res_bio_phospho, omic_label = "phospho", drugs = c("Mirdametinib", "olaparib"), top_n = 15) ``` diff --git a/analysis/info.md b/analysis/info.md new file mode 100644 index 0000000..530852e --- /dev/null +++ b/analysis/info.md @@ -0,0 +1,489 @@ +## Quick run order (and what each file sources) + +1) **01_run_normalize_omics.Rmd** + - *Sources:* `cNF_helper_code.R`, `01_normalize_batchcorrect_omics.R` + - Runs per-modality normalization and ComBat; writes long tables/plots if enabled. + +2) **02_analyze_modality.Rmd** + - *Sources:* `cNF_helper_code.R`, `02_analyze_modality_correlations.R` + - Builds drug/feature matrices, modality correlations. + +3) **03_pathway_enrichment.Rmd** + - *Sources:* `cNF_helper_code.R`, `03_leapr_biomarker.R` + - LeapR pathway enrichment and plots for all. + + + +# Normalized Global / Phosphoproteomics Data for Clustering via PCA + +The datasets analyzed in this study included both global proteomics and phosphoproteomics +measurements collected from two experimental cohorts of cNF (cutaneous neurofibroma) organoid +samples. Each cohort comprised multiple tumor specimens derived from different patients. + +For each dataset (global proteins and phosphosites), raw intensity values were preprocessed to +ensure comparability across cohorts: + +- **Missing values and zeros:** Zero measurements were replaced with missing values (NA) to prevent skew during normalization. +- **Feature filtering:** Proteins or phosphosites absent in more than 50% of samples were removed to reduce noise and improve robustness. +- **Normalization:** Remaining values were log2-transformed with a small offset (to prevent log(0)/undefined) and scaled using a modified z-score (median centering and median absolute deviation scaling). This approach places samples on a common scale while down-weighting outliers. +- **Cohort separation:** Each cohort was processed independently to account for technical differences, then merged into a single combined dataset. +- **Batch correction:** To address systematic differences between cohorts, ComBat (sva R package) was used, producing batch-adjusted abundance matrices for both global proteomics and phosphoproteomics. + +The resulting normalized and corrected data provided a harmonized view of protein and phosphosite +abundance across all patients and cohorts, enabling joint exploratory analysis such as PCA (Figures: +phosphoCorrectedPCA.pdf and globalCorrectedPCA.pdf). + +#### Figure: `phosphoCorrectedPCA.pdf` + +This figure shows the results of a principal component analysis (PCA) performed on the +phosphoproteomics data after normalization and batch correction using ComBat. Prior to correction, +samples clustered primarily by cohort, reflecting a strong batch effect. After adjustment, the major +sources of variation correspond more closely to biological factors, such as patient identity and +tumor replicate. Each point represents an individual sample, colored by patient and shaped by tumor +designation, enabling visualization of patient-specific clustering patterns. The correction +substantially reduced separation by cohort, suggesting that technical batch effects were effectively +mitigated and that the remaining signal more likely reflects underlying biological differences in +phosphoproteome profiles. + +#### Figure: `globalCorrectedPCA.pdf` + +This figure presents a PCA of the global proteomics dataset following the same normalization and +ComBat batch correction procedure. Similar to the phosphoproteomics analysis, initial clustering was +dominated by cohort effects; however, correction aligned the data so that patient identity, rather +than cohort, explained the primary axes of variation. Each sample is again colored by patient and +shaped by tumor replicate, allowing assessment of within-patient reproducibility and between-patient +divergence. The improved alignment indicates that batch effects were successfully minimized, and the +corrected data provide a more reliable basis for downstream analyses of patient- and tumor-specific +proteomic signatures. + +## Biomarker Evaluation using Batch-Corrected Global / Phosphoproteomics Data + +For this biomarker evaluation analysis, we brought together four different data types collected on +the same patient-derived samples: + +- **Drug sensitivity measurements:** single-dose viability values (and some full dose–response curves) across a large panel of compounds. For drugs with multiple doses enabling curve fitting, dose-response fit_auc (area under the fitted viability curve; lower AUC = higher sensitivity) was also analyzed. +- **RNA sequencing:** transcript abundance data (to be harmonized into long format in subsequent steps). +- **Global proteomics:** normalized protein abundance values, batch-corrected across cohorts. +- **Phosphoproteomics:** normalized and batch-corrected site-specific phosphorylation abundances. + +To enable cross-modality analyses, all datasets were filtered to retain only those specimens present +in both the drug response data and at least one molecular modality. This produced a shared set of +samples suitable for correlation and biomarker discovery analyses. + +- **Proteomics:** Both global and phosphoproteomic data were restructured into sample-by-feature matrices, where rows represent specimens and columns represent either proteins (global) or phosphosites (phospho). Abundance values correspond to batch-corrected, log-scaled measurements. +- **Drug sensitivity:** Viability data (expressed in relative units of surviving fraction at given doses) were reformatted into a specimen-by-drug matrix, with average values across replicates used where multiple measurements existed. Separately, a fit_auc specimen×drug matrix was constructed by averaging per-drug fit_auc across replicates, retaining only drugs present in syn69947322. +- **Filtering:** Only drugs with complete measurements across all available specimens were retained for certain analyses, yielding a consistent comparison set ("full drugs"). For fit_auc, additional coverage summaries (mean, SD, n specimens per drug) were calculated to support ranking and visualization. +- **Drug Count:** 238 + +
+Drug list + +- Abemaciclib +- ABT-737 +- Adagrasib +- Adavosertib +- Afatinib +- Alectinib +- Alisertib +- Alpelisib +- Alvespimycin +- Alvocidib +- Anlotinib +- Apitolisib +- ARS-1620 +- Avapritinib +- AVL-292 +- Avutometinib +- Axitinib +- Barasertib +- Batoprotafib +- Belinostat +- Belzutifan +- BI-3406 +- BI-847325 +- BI-D1870 +- Bimiralisib +- Binimetinib +- Birinapant +- BLU9931 +- BMS-536924 +- Bosutinib +- Brigatinib +- Brivanib +- Brukinsa +- Brustaol +- Cabozantinib +- Calquence +- Capivasertib +- Capmatinib +- CBL0137 +- CCT-018159 +- Cedazuridine +- Ceralasertib +- Cerdulatinib +- Ceritinib +- CFT8634 +- Cobimetinib +- Copanlisib +- CPI-613 +- Crenolanib +- Crizotinib +- Dabrafenib +- Dacomitinib +- Danusertib +- Daporinad +- Daprodustat +- Dasatinib +- Defactinib +- Derazantinib +- Digoxin +- Dinaciclib +- Dovitinib +- Doxorubicin +- Duvelisib +- Eganelisib +- Elimusertib +- Enasidenib +- Encorafenib +- Enitociclib +- Ensartinib +- Entinostat +- Entrectinib +- Enzastaurin +- Everolimus +- Fadraciclib +- Famotidine +- Fedratinib +- Fimepinostat +- Fisogatinib +- FRAX597 +- Futibatinib +- Galunisertib +- Ganetespib +- Gefitinib +- Geldanamycin +- Gilteritinib +- Glasdegib +- GNE-617 +- GSK2256098 +- H 89 2HCl +- Ibrutinib +- Idasanutlin +- IKK-16 +- Imatinib +- INCB188053 +- INCB191856 +- Infigratinib +- Ipatasertib +- Ivosidenib +- JNJ-7706621 +- Ketotifen +- KO-947 +- KW-2478 +- Lapatinib +- Larotrectinib +- Lenvatinib +- Lestaurtinib +- Letrozole +- Lifirafenib +- Linsitinib +- Lorlatinib +- Losartan +- Losmapimod +- Lovastatin +- Luminespib +- LY231514 +- LY3023414 +- Manumycin A +- Merestinib +- Metformin +- Methotrexate +- Midostaurin +- Mirdametinib +- MK-2206 +- ML 210 +- MLN2480 +- Molibresib +- Napabucasin +- Navitoclax +- Nedisertib +- Nedometinib +- Neratinib +- Nilotinib +- Nintedanib +- Nirogacestat +- NT219 +- NVP-BEZ235 +- Octreotide +- Olaparib +- Olutasidenib +- Omipalisib +- Onalespib +- Onametostat +- Onvansertib +- Osimertinib +- Pacritinib +- Palbociclib +- Pan-RAS-IN-1 +- Panobinostat +- Pazopanib +- PCNA-I1 +- Pelabresib +- Pemigatinib +- Pemrametostat +- PF-562271 +- Pimozide +- Pirtobrutinib +- PLX-3397 +- Ponatinib +- Pralsetinib +- Prexasertib +- PT2385 +- Quizartinib +- Ranitidine +- Rapamycin +- Ravoxertinib +- Regorafenib +- Repotrectinib +- Retaspimycin +- Ribociclib +- Ripretinib +- RMC-6236 +- RO4929097 +- Rogaratinib +- Romidepsin +- Roxadustat +- Rucaparib +- Ruxolitinib +- Sapanisertib +- SAR405838 +- Seclidemstat +- Selinexor +- Selpercatinib +- Selumetinib +- Sertraline +- Siremadlin +- Sitravatinib +- SNX-2112 +- SNX-5422 +- Sonidegib +- Sorafenib +- Sotatercept +- Sotorasib +- Subasumstat +- Sulfasalazine +- Sunitinib +- Surufatinib +- SW106065 +- Tadalafil +- TAE226 +- TAK-243 +- Tanespimycin +- Tazemetostat +- TED-347 +- Tegavivint +- Telaglenastat +- Temozolomide +- Temsirolimus +- Tepotinib +- THZ1 +- Ticlopidine +- Tipifarnib +- Tivantinib +- Tivozanib +- TK216 +- Tofacitinib +- Tomivosertib +- Tovorafenib +- Tozasertib +- Trametinib +- Tretinoin +- Trilaciclib +- Triptolide +- Ulixertinib +- Umbralisib +- UNC2025 +- Unesbulin +- Vactosertib +- Vandetanib +- Vemurafenib +- Venetoclax +- Vismodegib +- Vistusertib +- Vociprotafib +- Volasertib +- Vorinostat +- VS-6766 +- Y-27632 +- Zanzalintinib +
+ +## Exploratory Analysis of Drug Responses + +We performed initial exploration of the drug dataset to understand variability and efficacy across +the compound panel. + +### Correlation of molecular features with drug response + +Spearman correlations were computed between every drug (response profile) and every molecular +feature (protein or phosphosite). + +Significance values were estimated via permutation-based correlation tests, with multiple testing +correction applied (FDR). + +Features were separated into positive correlations (higher abundance/phosphorylation associated with +higher viability, i.e. resistance) and negative correlations (higher abundance/phosphorylation +associated with lower viability, i.e. sensitivity). + +#### Figure: `most_efficacious_.png/pdf` + +This scatterplot highlights the subset of drugs that were most efficacious across the patient +samples, defined as compounds with an average cell viability below 0.5 (i.e., less than 50% +survival). Each point represents a drug, positioned by its mean viability (y-axis) and labeled along +the x-axis. Point size reflects the variability in response across specimens, while point color +indicates the number of samples tested. Several compounds demonstrate both strong overall activity +and consistent performance across patients, suggesting broad-spectrum effectiveness. + +#### Figure: `most_variable_.png/pdf` + +This scatterplot highlights drugs with the highest variability in response across specimens, defined +as those with a standard deviation greater than 0.15. Here, each point again represents a drug, with +mean viability on the y-axis and drug identity on the x-axis. Point size encodes variability, and +color denotes the number of samples measured. Unlike the most efficacious plot, these compounds are +not necessarily the most potent but instead show strong heterogeneity between patients. Such drugs +may provide the greatest opportunity for biomarker discovery, as differences in response are more +likely to be explained by underlying molecular features. + +#### Figure: `_drug_heatmap_large_viability.pdf` + +This heatmap displays drug response values (viability) for the subset of drugs measured consistently +across all specimens ("full drugs"). Rows correspond to patient samples and columns to drugs. +Hierarchical clustering of both rows and columns highlights patterns of similarity, revealing groups +of patients with shared sensitivity profiles as well as clusters of compounds with correlated +activity. The visualization provides a global overview of drug response heterogeneity across the +cohort, serving as a baseline reference for linking molecular features to therapeutic sensitivity. + +#### Figure: `_cor_features_by_drug.pdf` + +Bar charts summarizing, for each drug, the number of molecular features (proteins or phosphosites) +that correlate with response at FDR < 0.25, separated by direction (features associated with +resistance vs sensitivity) and faceted by data modality. These counts provide a quick sense of which +drugs show the richest correlational signal. + +**Additional fit_auc figures:** +- Figure: most_sensitive_auc.pdf — scatter plot of drugs ranked by mean fit_auc (y-axis); point size = SD of fit_auc across specimens; color = number of specimens. +- Figure: most_variable_auc.pdf — scatter plot of drugs ranked by SD of fit_auc (y-axis); point size = number of specimens. + +### Functional enrichment of correlated features + +Using the leapR package, correlated features were ranked and tested for enrichment against gene set +collections.. + +Enrichment was conducted in a direction-aware manner: + +"Top" (positively correlated features) highlight pathways enriched among resistance-associated +features. + +"Bottom" (negatively correlated features, flipped in rank) highlight pathways enriched among +sensitivity-associated features. + +Results were summarized at multiple levels: + +- **Pathways per drug:** Number of significant pathways identified for each drug, separated by resistant vs sensitive associations. +- **Pathways across drugs:** Top 15 most recurrent pathways across the full drug panel, faceted by proteome vs phosphoproteome. +- **Drug-specific profiles:** Bar charts of the top 15 pathways per drug and modality, annotated with significance stars (* <0.05, ** <0.01, *** <0.001) + +#### Figure: `pathways_across_drugs_top15.pdf` + +Pathway-level summary of enrichment results across the full drug panel. Bars show the top 15 +pathways (per modality) most frequently enriched among significant feature–response correlations +(FDR < 0.05), with direction indicating whether pathways are associated with relative resistance +(features positively correlated with viability) or sensitivity (features negatively correlated with +viability). This highlights recurrent biological programs linked to drug response. + +Generalized figures: + +Generalized pathway enrichment figures were generated for the two most efficacious drugs, the two +most variable drugs, and Onalespib. + +#### Figure: `pathways_____top15.pdf` + +Each pathway figure (pathways_____top15.pdf) displays the top +15 enriched pathways whose molecular features are most strongly associated with the drug response +profile for . The field indicates whether enrichment was performed on global +proteomics (global) or phosphoproteomics (phospho) data. The field specifies whether the +pathways are associated with relative resistance (resistant, features positively correlated with +drug response) or relative sensitivity (sensitive, features negatively correlated with drug +response). The field denotes whether correlations were computed using single-dose +viability (viability) or dose-response fit_auc (fit_auc). In all cases, pathways are ranked by +−log10(FDR), and significance is annotated using standard thresholds (* <0.05, ** <0.01, *** +<0.001). These figures collectively summarize the biological programs whose protein abundance or +phosphorylation levels track with either enhanced or diminished drug effect across patient samples. + +Note on Onalespib (and other uniformly potent drugs): +Onalespib was one of the most efficacious compounds in the panel, but because nearly all samples +were highly sensitive, the variability across specimens was minimal. Correlation-based enrichment +relies on inter-patient heterogeneity; without it, even strong biological effects cannot be linked +robustly to specific pathways. This explains why Onalespib yields few significant pathways despite +its overall potency. + +## RNA-Seq / Differential Expression Analysis / Gene Set Enrichment + +Samples and preprocessing. +RNA-seq quantifications (Salmon gene-level quant.genes.sf) were retrieved for two conditions from +the same organoid line (NF0021-T1): + +1. untreated organoids +2. Onalespib (1 µM) treated organoids + +Raw read counts were assembled into a gene × sample matrix and filtered to retain genes with total +counts ≥ 10 across the two samples. Size-factor normalization was performed with DESeq2 to obtain +normalized counts. + +### Effect-size estimation (no replication) + +Because we have one sample per condition, formal differential testing is not performed. Instead, we +report exploratory effect sizes: per-gene log2 fold-change (log2FC) computed from normalized counts +as log2FC = log2((Onalespib + 1) / (Untreated + 1)). These values support ranking and visualization, +and hypothesis generation but should be interpreted as descriptive (no p-values). + +### Ranked list for enrichment + +Symbols were converted to ENTREZ ID; when multiple rows mapped to the same ENTREZ, we used the +median log2FC per ENTREZ for enrichment. + +Gene set enrichment analysis (GSEA). +We ran GSEA using clusterProfiler using: + +MSigDB Hallmark (H) via msigdbr (concise, non-redundant "meta-pathways"), + +GO Biological Process (BP) via gseGO + +Both dotplots display GeneRatio on the x-axis (fraction of the pathway's genes that are enriched) +and count as point size. + +#### Figure: `heatmap_top30_all_log2fc.pdf` + +Top 30 transcripts by |log2FC| (including unmapped IDs). +A clustered heatmap of the strongest responders (by absolute log2FC) using normalized counts +centered per gene. Labels use symbols when available, otherwise RefSeq base IDs. This heatmap +preserves completeness and has all of the true top changes. + +#### Figure: `heatmap_top30_mapped_log2fc.pdf` + +Same selection logic but restricted to symbol-mapped genes for the heatmap. This is much more +interpretable and cleaner to share but is technically missing the top differences. + +#### Figure: `gsea_dot_GSEA-MSigDB_Hallmark.pdf` + +Dotplot of MSigDB Hallmark GSEA results +This figure displays enriched Hallmark gene sets ranked by normalized enrichment score, with points +colored by enrichment direction, sized by the number of leading-edge genes, and positioned by +GeneRatio. + +#### Figure: `gsea_dot_GSEA-GO_BP_.pdf` + +Dotplot of GO Biological Processes results +The figure shows enriched GO BP terms from the ranked gene list, with points colored by enrichment +direction, sized by leading-edge gene count, and positioned by GeneRatio. + From f1c3819dd1b86b89c2c4d3dea6fb898a49487b1d Mon Sep 17 00:00:00 2001 From: Jeremy Date: Mon, 12 Jan 2026 15:26:59 -0800 Subject: [PATCH 09/13] Added documentation. Added analysis folder. Reordered scripts. Moved some to legacy. Added comments and cleaned things up. Moved the docx file to analysis readme --- 00_cNF_helper_code.R | 2 +- analysis/{info.md => README.md} | 25 ++++++++++++++---- cNF_Analysis_and_fig_descriptions.docx | Bin 27471 -> 0 bytes ..._and_Upload.py => cNF_Cohort_Annotation.py | 0 .../00_createCurveStats.py | 0 .../01_harmonize_drug_data.Rmd | 2 +- 6 files changed, 22 insertions(+), 7 deletions(-) rename analysis/{info.md => README.md} (94%) delete mode 100644 cNF_Analysis_and_fig_descriptions.docx rename cNF_Cohort_1_Process_and_Upload.py => cNF_Cohort_Annotation.py (100%) rename 00_createCurveStats.py => legacy_code/00_createCurveStats.py (100%) rename 01_harmonize_drug_data.Rmd => legacy_code/01_harmonize_drug_data.Rmd (97%) diff --git a/00_cNF_helper_code.R b/00_cNF_helper_code.R index 4f4eaf9..9ae26c5 100644 --- a/00_cNF_helper_code.R +++ b/00_cNF_helper_code.R @@ -3,7 +3,7 @@ library(synapser) -synLogin(authToken = "eyJ0eXAiOiJKV1QiLCJraWQiOiJXN05OOldMSlQ6SjVSSzpMN1RMOlQ3TDc6M1ZYNjpKRU9VOjY0NFI6VTNJWDo1S1oyOjdaQ0s6RlBUSCIsImFsZyI6IlJTMjU2In0.eyJhY2Nlc3MiOnsic2NvcGUiOlsidmlldyIsImRvd25sb2FkIl0sIm9pZGNfY2xhaW1zIjp7fX0sInRva2VuX3R5cGUiOiJQRVJTT05BTF9BQ0NFU1NfVE9LRU4iLCJpc3MiOiJodHRwczovL3JlcG8tcHJvZC5wcm9kLnNhZ2ViYXNlLm9yZy9hdXRoL3YxIiwiYXVkIjoiMCIsIm5iZiI6MTc2MTgzOTMwNCwiaWF0IjoxNzYxODM5MzA0LCJqdGkiOiIyNzg5OSIsInN1YiI6IjM0NTM5NTUifQ.OkzJaWKY7po87mP_iKDNP8ah--BsZpb-mH8U1wP2kj2yqnxi65yi1irAgNNITXB-mQ84ndpdV-9SfNCpkc-pUIenHADpLTMPbh2dMLAiCS77picR-bLlb79c7e005L-7EWuMBPh7RXAXFOcEwY3823Z-RsjKatGYZXCiGaC_I0OPuawh1c3y_c2VVsZUFMdfXJaQH4iaMB5wK9OA_OlI_mnwaYrL1C3ID6ZCuGdO8cR9-fwgu5NzZPq5OOyn11BSblb7NytwaEhhuteHediXBnhGSmfSCUCpQvdpppb03_-Q8DK21oyAgUSygn_h7QFIqtdfzyb8mg300D49al-yAg") +synLogin() syn <- list(get = synapser::synGet, store = synapser::synStore) library(readxl) library(tidyr) diff --git a/analysis/info.md b/analysis/README.md similarity index 94% rename from analysis/info.md rename to analysis/README.md index 530852e..6d6edf3 100644 --- a/analysis/info.md +++ b/analysis/README.md @@ -1,3 +1,19 @@ + + + +# cNF multi-omics analysis pipeline overview +Purpose: Reference for running the cNF analysis end-to-end and understanding what each stage produces. + +- Run order: Lists the recommended notebook/script sequence and what each file sources (dependencies). + +- Data processing: Summarizes cohort-wise preprocessing, normalization, and ComBat batch correction for RNA, global proteomics and phosphoproteomics. + +- Exploratory outputs: Describes the PCA figures generated from batch-corrected data (global + phospho). + +- Biomarker discovery: Outlines how drug response matrices and omics feature matrices are constructed, filtered to shared samples, and correlated. + +- Pathway interpretation: Summarizes direction-aware enrichment (leapR / GSEA) and the key plots produced across drugs, modalities, and metrics. + ## Quick run order (and what each file sources) 1) **01_run_normalize_omics.Rmd** @@ -13,8 +29,7 @@ - LeapR pathway enrichment and plots for all. - -# Normalized Global / Phosphoproteomics Data for Clustering via PCA +## Normalized Global / Phosphoproteomics Data for Clustering via PCA The datasets analyzed in this study included both global proteomics and phosphoproteomics measurements collected from two experimental cohorts of cNF (cutaneous neurofibroma) organoid @@ -56,7 +71,7 @@ divergence. The improved alignment indicates that batch effects were successfull corrected data provide a more reliable basis for downstream analyses of patient- and tumor-specific proteomic signatures. -## Biomarker Evaluation using Batch-Corrected Global / Phosphoproteomics Data +### Biomarker Evaluation using Batch-Corrected Global / Phosphoproteomics Data For this biomarker evaluation analysis, we brought together four different data types collected on the same patient-derived samples: @@ -318,7 +333,7 @@ samples suitable for correlation and biomarker discovery analyses. - Zanzalintinib -## Exploratory Analysis of Drug Responses +### Exploratory Analysis of Drug Responses We performed initial exploration of the drug dataset to understand variability and efficacy across the compound panel. @@ -427,7 +442,7 @@ relies on inter-patient heterogeneity; without it, even strong biological effect robustly to specific pathways. This explains why Onalespib yields few significant pathways despite its overall potency. -## RNA-Seq / Differential Expression Analysis / Gene Set Enrichment +### RNA-Seq / Differential Expression Analysis / Gene Set Enrichment Samples and preprocessing. RNA-seq quantifications (Salmon gene-level quant.genes.sf) were retrieved for two conditions from diff --git a/cNF_Analysis_and_fig_descriptions.docx b/cNF_Analysis_and_fig_descriptions.docx deleted file mode 100644 index 016a474dbadaf63777738bfd1972473b9fe079b6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 27471 zcmeFZ)0bw!)298D@sw@bwr$%syKJM&wyVqNvTd_VUAApc|K4wXYi8EWfACHoXl@E6IX^qX8fR&;S5{7{HR~sA&NL0F*!g0H^?HP+d_+2Ul|kS3@-~Cvz8l z22XoCq9SlmssaG$Kl}gJ{u|FgQ_7^xAQO`KbJ$nNe9Nj#2UW??cu~ALuGJez6q|zB zqeyYrSI<5da|szU+xQ64?1s-QG;_%O-z+;AoUpkxDUPTIAV1Ns0nY>M3! zrY+%djzBxMu5N5O4iF~N=E0+Rn$cvqIe}**$XllFkv{W$4>=B&aDi=_6{4Ozy(gUSD9OUOU zs^=VL15yU*FVIM+84(wBJVcECV)ad;vh{4Z`Yx3a#!!2nACn>+~;aayncjVSz6AMl?%4S$&1xiB*Pr~iMl{BNwq|K-)I zlluS35>DtkD)Tj8Wgmyv>fXCs?EyRC*SHSsQ#(3iIF@f(WDv{=ca4^%)7X^Z1`sw9ZVt2PsVp& z3q2H_uj8mAY(b7|V(liwT}Rv;ec}6mLF)h1N5B%{+$<9SpzIFjx=9*#fE7|k3_ z-R%Fxu>SmLMPA0@-=O;JIpnlLzM&JeI1>(hrwBNXd7{%`-yElf;KV{0LR4l33*vQz=FYf_u zSzZglVTrge*X}7`UXms~_*}JuUaj}%uXXcViAAe-_>(~2Cx_!Kao+EnFTwA!-d?^R zpuCyPf@wz3f;=5L>(}n3&U}JAd@&c)+1m5{Q_X`!up_rY_)`cWx2w%-yXHtd5tY?r zuIz|l+2Y6_dDgE$+P}zyYnqP|E+XeB1|uMkqCPfZ%$*aXN>_YR_jN9-1_V;wqe68e z>e>V{o%c#3#4|S5M4tv={ncp_Hd~Ir(4$$eIFmZ~RL=cLttUyFuQ?Y`c&5{k`7&4u zd+?xkr=oUExwvz*cUR1|0)vhiNV}BdVtw}}H>wTZ|TYx*7lTNW6&yY4DSOyL9 zO-oC{Y+aHS29RwznqBYziGO+DQ+$HFKl`Nn&^u1hA2vWMyAF3V8a2EIj)*An!ev4^ z!F^br&9GiEbzWQD9J1k?IimtFSQ-7Ue$@UUYj&%ynQbK|tebfl-kRnUxX+eUJHYAu zDQf2UCsX8=b1`!Y5OPo0gJ}iZr*6ZN?+S3U1`JxTBSyQ%v|#Pe%~aryI@>0x^B`=a z7@7iNyX;H zvUDV8_P=Z`fAgZTKGpN~Zl5wtmn}Lu;VWl&g@EqZ$`~Y%fV9Uez^FJ^^n>MKHgvEr zp44+B$`13Xoi{!NdNkMllj>d0G3KS$LM))z@w6|izc;=olS^rLtMDar&`xG|0z%-~ zmmz-d84vG@^GJ%Y%gD@Yq0p5DzUY+=VD?iRpoK2&N8|%TrOwh#7NrdhO-8RL%fsBs zr86~pK&!X)t*he-`wMOt2zizo@|6#uT&;OBt2FwDHN8J64G%5 zzW6m}qfIV23l(2%sS6tfU(nug_?8fBqyd+Ht%mrBh1%SqwAo;?B;T~q{#E>b+vPlP zmC{2frxUSdyQ}&WAsnqIwJwH26!Z?{j|sSw?J%fcT$v!6QAblF`B7w0{w|3MgU+L- z7)A1+Zj3o6!tf3y6%A|ix-}ElPK6ze9ZK9rPrzM%P6M2G!6~v_ciF-jOPH{NRCd5xYePnDTCKA}O7%weOAAl&UreG$ zvx7$~Or+q>$emwO3d8FAG+>y!jfyz-=I%(@h)UpAlSnd}pse^PsDJhO?uO}mT{GR{ zbGZ8*T_7V!wjB0P!qnRXBSFk0Fdckt#W+w#7~n2J?N^z=HGOXBolzsHZj!KESs*8v z27Zo&lszH*a#0(v=1zc~5OlvSXFnN2^Y7?3Ij<6o5?LX7#q6HevcU>t5#4G zEG8b(6={=poy_rosxHXRb+Dlkh<5o5VGcH!z`{4~JQcSCnK-z!i0s^DD5;F?JV; z-awOJSLzK8?XDa{`z7$GJXj@S3}l_&N~DfoEIceaOD`;w_W319!!WSCgT^yK$*@ul9y0W2#SD45JVgj)tz}&fTHJ2M$ ziGt5Rcg3ACU4{YY4@|BwUSm^@?0-Nwnt6fH9v__Uj|X7PNEm?9TtA=Pl$Kh!XK(J- zO=WY4o@|J1HQ_|rlx-&R9u(CIZIP_}`tf3Zhp*%bm0me(W_W zwgY}GgQ?Kib**!ISUOn|fPwMBTcq`Z3%}S&PR7sw{^uSCE?-l_{CVJeGy&4Hxe4?X zBy9^G{yUAM0&E19{LlIzYdMkef_Xd5(!5Q1qAT328mM|z6{j+jLbj?wJ2A#SbEV5Z&_+zk0!=rnkRLVj?_xmtuV zlwTLy5|dpGs*Ide93It1x{BVqz@?DkN?b4eYD){rLs&Q`SAi*^k`npj0zd*{WzA+- zAcK(MImoqi6PavSG5r8z%t$CPvRfni8Jvzas@KGvAc_dN^QHGt*(b}4`nAnH z-gts$X4N85j|q>)5V;5gH=W)SZ*T&@M3|I^qQoT(=b6_R=olpN*jwBHvfK~ECi!A2 z5gm)NjkWVZva1MT5QbmxOvKMkMJ&hK6&IO>-rm*&RoZ z(Aiom(Ky9cd0(MCf&z2fnN1q!SYA3qODYOemXB{YZ#I6v#I1TL+4x0-ZLyy=zFwiJ zn<VZ$*GvATi49~4cyaxb*2>*7(}Q^ zC$yznsXCp;dgwS(s%_hY_cC(ljjAtcrSIN^B8)DwWH?DEMEHfwx04<_VcoXGb<+i4 z6*}Qgh?r9}Or;4c@sVyLD~ksc^dZV^XI*KOYSE)_alndh`=;=`c`rNJ2F1P{%BnLM ztNzVq@gb0U^el|_Lpg;u%Krq*34kR4W0c92E@LYwUBF%k0aV7D7J|cN7JkMAO(pw= z2C*{_`k9FoqwIX6G)BszB8^Pnj`2Ecr72Yl<@H)g@)KyC89j*7kxzM}pxovJ4{_%HY1@_$x#yfuKWTA&~EnUYifO52Tsv15~iyxVSko>z-5OTp!ig z{%lcYYN4g{6GSXXchWw%xlJG+(JAM{l+!6^axbmB`;TK)=;&h;Z?Kpc9GniTxoLzB z#6}Adm;FFF@-IEq@v!e0$TlbW7j;RTaMeOf9EuZ z8Qv=i(NKf-Y?^~EcfP;Qp(u@!Vl>`cn^RS|}Jw@^EpuXDf4CdZDHRK07W==H9xN*O!|2G?J$ z&b>a{1^GGq9?ZDxA#cP6$RQkTNXq<4lVcqx;Vb*0m*W1Ovc1c|i|M!5=cbupy-vRa zrz<$65?5YBoZ>3%=<{@*pyx$(voA|!7Bdosw3UnD#Jp$t@05M{Er(O^jf*L2h|z-<_;8HXYm@a0A7e^wve0Q$<}WA1 zOABj1n!pWs$)cOK1_yw;%TAcfHy32e0{a9aM4wEVQpkt6?Z7`;QLlK85y;id9)91S z+hSZ6gD@oFubm>!qCL>2S4kR?NB)1B$b6xRP9xk`o4oZ&KfA5D43PDJPIuroRs05R2Gahu3cKPTJFo*@Z=xp_3@Y zcLp8@C}hOHKZr`fr&yRrT=6l8ktWjw!Dx_bz*&}dAtDybcp4MNQ1lf*2W!!ZGiOUS zi^#$^{)X3~x1WS*$uVN(4yyaPR5Nlbks(YxN-V2AmI1m#_9R{^ca=`PVpA^qC9^(< z)*WxwLHPYm;s67zCb~j6+4h)T>mW3#zhEn#Nwsk*>DKaPls4nr8E_X5lJ)4-a z>4_bQhy|%8aAjeY?it|}% z=>;hDyi_kLG*rXB1&7RywL5K`tkEmi0mN5;z21m>Fd=4Z49_VArVc8IR;r}FVks&{ z7Et-%y%OpzGI;BF6wcDkfmQfDc4|Rkgp^kgJkEv?h{*>0b7yRE^%(zXy-CJ?a~mMe zUyV6Wb%JS0G&U9((?t?j!hG8Y*zx-+mZSDH;S`FfJ*N8Gve31l`{BCS8V9TfMt{=CK#^LtRNGE=7^JwV^Vd4&2YsfskF*7) z=0}I&llS1Nj#1COgA#=tEzC8vE1}AwdX5|;OmwHLIlIuDX<1w2_je_1 zC;CcHUD+6_4S))dWtquIVHq`G85(SeISo&d>r^_9)#}l?s91=F(bGzO9_rJTgCgE! z)!^TAOtm#WpT$@nGhgEjeN8=P20sgabp!N%*sOs-B2xakj;n?Es|RJ^&?$)`9B;w=Tz44Mj+s-f>5-_Pbx^$|bR2rIhL0?>=$)%m-VC2{Y3#0`-2-zCb5 zTiMiW>c^%H6?0MlDzUZms?oM5L@h7B+hD@uzIL$dpCjDqK64cgL$gI+!gTW&5@tZK z`hI|xW_w!&jB7LF=V478)VKX0!Z>5p22MVXktJ3k?d3>3AwYp;(r8Ip6m3+xUa}x& zS13P9YBAu|3H-Ko2y+O1-PdZqd%q6YMb_dfzV7{Q-ec6CW}3{tQEFJpWPCQ-Vdkx z>2W@ltp4EFW81}Dwio+Vtn)fHHDTy+Et55obR26%17=<`1Y&m49#5k*zjo6dp1L_L zZ+;pwXt8-%uGHN>P}k{4^xSk4`MF$;2)})tYKtek{(QQkAgQ{ucduPsfsL;oaZV(&U?~b7;ND#~qz-B# zZ8ncCM|bl?zK8>{T{nTqZ>&QQ2FB8rGw5@=u!D+GaESOQFb?aGY@NqxR#L;p>okpb zu8RU1Oz#()W8lCVOG`eCO ze%0!V_`XclG$N-L@{$?1+4@12E3^$-4;OemNB(y5((+J4m5Y9tf>!*-^o~d2=YgU} zjPht4il+YFPx4;4Lj#?hC>yz~h(QgBK#`WO?{h_$J>3WO$NeP*st4cfp713w{*)2U zs959{6)Y!Oh`g;-&G&YglgJqO?$igBU<~$F=x1%kZOpM&$PVUon~Ti?HOV=@E*kjC zhwBNg;uFp`xOgSV&RaJsDpaBD{Ty;g?k{0aXynI?f^E>~3>%MFfEk!YM0E|O*}ZTL zpvL1UoZJVwp!*nzXf#x7wjRrU>Un7Vb0u{)OF?urs+?hRFA4QDUY138vzu-hsI==C z9#0rQowwjHDKD?*k>ZY;e=~YTrh75U-QM^5M-Ws|x^;&z{=0tOW<0zQl>c+U1b}`4 zrRQ($>YB057+_WI4WxJ;A4p;GK2C-(%X1v0JYsD4jb$(OM2!&Q2$J^jb`ZB|S0&hP zCUvwToPj)A2JpaN1J|1l1h)(dSpjYwVXb^qXZB0{C=YnER9DT4iqp)hw&5}Ghg3qo zDPiz6I?>%T4Y2rAhyoQfNzJemWkcrcjz0tvC!W7KSI?sJ@gXa!o~Z5jMHZwG91ySG z9g7)gLyCo$&jUFYgMvHvcN7sQW;)vuYLwvzOD=Gga4tU)Il9a01iT zPd+8K^NG32E0}wb4sAuU*@dc(O_{dMb(zO5K){Li27WI)52<;Ld73a@OrL0^4!Y-&N}j0wrXQV{r56yd2{?S6yAQyB3ZLWS@a?R0yG^HJ>UA zlrs@kOs4H)yZyk<3&57hbyIX}Ut133n8o_&8npO$Waab5t~2i#ih?{v92g}N9bmNG8Vua(sapWj6+SvC)3mw`hv#x##6kPzm`3(eQ`; zBTg%5HNxc_cQWzRcMeJzjL3h60>jJVhQerX$W)*n^@B_FN~}{TpM;ZFWX5^&)odo< zi?rZ?+s>H@)5_&)MmPuNb%o04S9}LBh4SPM6CL}}3qWbxG2-%4p_j*th{63Zv)g?E zEP}m--BlF=!s#>eF3m7U3HIr4r~5x{;5i56#)3^5UDF{e$`k)@jX=(Z7w2|uUe{KJ3{sg5_esLMK z5J(Qr5C|4T%$a=_*z(?-g|kS&;;{2u_ayrkG@2$G?O%0hIR?z&6Ad@hPT%Z_nARZVMBL=#_9T1 zwsq0}>MGdF9el?n&xM$zh?HPQX+pB1{^#^=!3r)?9j61TEfjaj<8@v$<~Dy3+IIeQ zCWI_nF8Ygu1c@tXo2*DsL{U6`TGnAb?Sex`p>@0e)?SKWeJIR=LspM2g2nIDoNbkG z{G|&UU0+WOUnfd|BUOw>SWneRDtAjX$wnvo6nP66oG!3l<_4y&)(l$o!NCkD<;tK? zPURN*G@86w{)@m)-i!R@0SNi?XDFE}{Sk=vMeLKPAgsHLwmQ1JZ>_ev(pBS;4P;ue z)8*CmegL{D+SmPyos^wpmZcWWlHUV>@?OkIt;TDayH1Wzx?PD`( zwsem4HoDP&dE8{1tU>K6!F9u&9@d_&qk?(Np%@%5z1sY$YNKBl{g zC~jv484x#mB!@4``5+Xr3QNh!JVlwN2b<+?AWw()sLM6?09p?zW@!R3_JT*Vz+GyBM7ldHeDS^<^`D zMp@&A5&rFV(9&dBeuHRVYTWUtFa>Hn4i4+hw?Zwzd?@qSZ5!f7cX#)7Z_W#6xXZLz1iXh`hJwRDC1Eyb`cjNW-Ce7{dvr@cF%T$ zE7VPMkxXmE*_-ZHQjW1G;ce%cdvMqmTJA+ZY%ThjjHD)4&bybr9`WAS<^F<7-Vd-c zyPI>M!;`g7j>f@WgNc^Q*%sV8DD^I@bPPJChESw3()(KP9@g| zi|=Vf+)=D35`H_aCzpO>r0N(xJLw2>G&aIX-g&`z?mj|!mGu7wS zucVK%tG5H&S4nov)+&-B^^HezZj%TS58mJo-Nfra=qVuiq#S}{xD0@1B;?~LPM z#>wibNuD3jID%_9#YumO~_Au!#Sh8$$D4b_{xgfuDbqm(jo%N{E z<5?+xTD@3VU+3^iW~o1za(HrYXMQtD^IJ!G7S6&$yCz1@IWbj+WpL$z9`P)=hLJ{Zw=_q-a;Iw6}|az?8I5JR{7qkzd^ zR}Sz~ur;49NcYh)ulB+BaA2b%DYEX4ua>h?rGj0=CU(pbY0-LMtzs=NyrYmDHKs9)BE)B@&;8)!hPtw)WlU3 z`VYCR6EyX8_lw(|8acN8Bprvx_!8r9H4zTtaVRZB^4V%E{Se5M(y0pzL3McP0CT<) zfmZ0ogYeek^J-}^t!hn6!%q$jv$xj9_-n6?JuD6ua&^05{>UvU1Ao%n@C~wND(b0w4_F`!9eL0N{WA|H2u;q(o zDN@(jY91-=U$a*{7X)IHl~%GkicZmYCb|pfSIj>tcD5+i`S6EUWPq)Y%TyGZu@iM- z?y4QCWNi)xXae?rEEMpj?WWl9=v%HA0pFhijLaK2tPC?{X!_}QZJlotHXjJJ4?(K> zDD}gF=RS2`vuu@N!Sp|(@s?+gFXBKY7GkAL+XGPQDbQf<6nM_}vr{G5GpDa8kx}ky zv7nRu+lP+x0u;~1YA`&7W<3z`^t|6ad7Ig?0R9BU<| z_h_zNRX5nG^wjtIwQR6y;YzRcq~rYH)mReoQ?Y|E$8$=u5J`8)sWHIfSh0BkhphbU zuhOY1*8~N2Qnoy*gU@jL) z^)O0y=Rt`$c{Fs`4oGj$a%QBQakYLyK#xI0{rP1ns7j^w2#wnuOL(R76;b_grTkFk z66he2B=UUR-0n1V{nDS(^vtnp3;6Q|Si)IvS4%1xr;&HM>cyYp9Iow_8t)elOdc9% zbMsXLum5)9hkP0TX7~4kgAUV3r@^r9^N$F@x0|wbtm1)88kTkJ@DxX`p9d{V68(8y zDp{UI{N?Jazs*~+Kc9?b(HUjXyNTjLu#qKn`YqXwFJw(;b7x5ABHzSz~NXRvBDl=Ke(GooFS5spo#hunSb zLv3sT(Xi=K=;?kK@gcHpT`zIrqUoop>(_$k31pkygLyN=F4wJ})21frWFy?#7Y%k} z4WGjrDT0YDB)0I)ZaJASSF6|x`OF{#?YlJz-jh#9q zZtHd?;lt!@x@TMi8_bCpnrJkyii|QRl6~~3aUan)X=t)@w2cr8+%^e3zZ%AI*| zJJzh-sGq!jcm$*87_)0M)-Q<7V39Q6R{z?UrxG!6>gxZ1Sqbrcy|wq>*4;}Z%!m(b z9e-?L!8OXg;a7T8z??4$wA0yUOCc7Y+QMD?`&qg${YEdq*wCH8=*sTGcomay5YxI`9s|MC5gn;mvB zLja1Q?<81%KmHO5?o^M-*pk92?Ql4(p_l6M(xMb}RK<7{Pc_$R)@IP(3$zU8e%rZ= zEcMLlAB|MWsXboNCivYq zhXAWu(Itd_u3Gpny&V+668+Lf+_7s$mDHbCTf-d{@Y(pAY*Q>XXEL(*O$P><8k(ny z+0s=W7x_3{mRqc;>TtXQEhkfko7U^J37}yJDi!mlBjuz+~?u zQAFF0tkxj|FlJSw5k{!VPrZE4(?zoT<6a^Qb^ZzROKa4bjTj_b-tX2Ftq>GP`iVzU zWfKD9`udcYql08})FohW9+2Aly<1I;diW%f5knW2tW6jOJ=@E;dYId@dbgmt+G0PK z-nKd@7Z)%o&*h2y8z&QAU9D%mXeDI+9Gm!+ATBmuz%8V{23l4rMXk20WA{1WlhA-k z*(^qEYkwEK1f2WKCF^L``6%5@C$_9kas_Hx1pN{8o z^yUz%0L|kn>=wTEt8g#1g$t@AmRAq|Y!26*_fzK(;y-2BA7zBwpO3_9hohWDkFZna z#R%)Gb{Ch78XOV&@~0YEXXTqSPUR{;qkQXU(eR$BqV@L#hh`do%1DCd34)Dv&UGG;Z?i|1cQpqIan*RLYc4Rxeo0%nnAjGAK z&#))&tM;`BJNZ4-qgT|8A)SKu+b?$|ZAaLCSYUN&qK^&#K$CTnH(^Ro!rmx^x-92@ zuW&q;MGBycLrqEhSo8LC)g*8_f8P&gwIrXzpP|lR2HCj671rq599fV6vY2e(A)QMq z*4N#AEHcA&z7+fQ$<4U;xAIAx5LB0*e(_-cC_q~8MgO8dtd7*R&bGOCLHE*Maq;|S z2UdAYqT8|ptteg|~pVV#(oedmWj&BUc61SPIOdnl1h z(J4h2f5>~%c$=`YPb@F6(uSVdlGyquinxL1uTXHPe(NFCNpqQ#LZ1Vh6*K*K(Y#@M z8q{(NHT2pKg<7YdK8}=^`!H#-5yz*_tv<+KuJwCD~?9BuLXCknQe<#_n7qy&G5s^vkkw{XjIFXy3qFM<}$4E9ObJyMz{ z-W;a;ev5ByjDd}FfHr)(m?%T=P65mC?pPp%M3l*mmN zu5Ojm)JvReFVu^kH1DnC9TEN&-c@V{Po8w~m8e_z-d;T)XeD$G<=ylYKe)`yP8Fo9 z6rd81`s|)MYGl|QiY=c~N@Jb|=!xxNDgw=I+I#+1QDm|cbOzgNCVp*ye|AxBM%4Z3 zDLv~p9!{`qim-{-%=1asdC_S0B(COfW)OaB$Z6uI{%NX#R=+k@{P)wDNV|)CF5sh+ z=}49pIMhtL_4wz}WU4S>hdW`9E`9PKQA!EcM5$qTZu|TFI}we$`9@U%4cfaRUzx|H zw|-=`!88HjSLt)^zi%-8qi1;C!F7W$r(UqiF;5e;1)Ovs&9zE9D%gC<9X|QrX_IOU zPqz?2>r!MsgI;hWOGg7`E6~>;-o}2sXf@Y)tMQ!FFxzL~u1<4gW<@S5^SqTZrzKr3 zO9$wxnp_zhLtqQA$a)c=ITor)Qf(?QVdN%5;!K*;%(}IP8~$m+rTye=1TaP#IlzS% z0cw4?;kX9_;^S_*A#+*&dA!{SzA|G)`T$TGfi3JTA1^KvZ1A)f$Q&I2L`ThO&Hg6QS0XwIm%&PM=7`RrfV*AY95#xisqTU~@EwkGWc6=M-(V#-UHUNaocGf}GdC4! z!&`#C1}Gr5{JZ9>1Dw6N=NMFJ5oynw^27CXlcL(px)!-x6N=6^D-QV^7_?o5 z5CS z;K*(Vd{4#N;8tLm=VcT{pJsk}$MJKhr>-i3Qr};q9yowq750qyJ1U zzA1RVG&QaB?mpc97`-nJhf^eZ+LeWGSc30fV{;?>4Z2pjsWEHiHchiu%h$1-U@yDU zskX~asq0)N)S|oBnfYP>k=uO@p;b*8dE6T5Z%dx9Ii=7Au%NBv*`|1N=zEj=*p_S% z$tfD$f!c7Dqmta;RUhmS$)4nDvV(<3hPe~*eEWL;-pfmUyqW<;^uuo=P^`(2Df^{9 z8PIpR-8oPHa@OK%Pn^eDA9l@*09snJlz< z+AbFqh2xk7`i2GPR(9K4S|*v4XqGA8%k)78Zm$zOk-KQKA??4$wNNLHudbkFjXr@M z!JpAh=Wi|PTy2U!89%HD-_;3U7Uixid;&C4men#6SARZC9g^#6iN(epgGgldkHbwi zNGR`+Gb54?6$m>x8X#aCu#?syLRlGm+7U|SS0iK04k(RLJ&G1p&Fg5@M#RoOd3EQ> zV&-v~o61p+lCVKDD8HA$Hbz|KYCW;q4fE_$b|1I}@L?2(HlkU*vh~J7U87}irwt(E z5=Ou{mJEc5+CoHg!n>@hK&E#2n$FoF@{Fid{-rhdY##N;*&4$h1zX(*YlE^FFOE;K zIWtD)V|!LQKrB#!3$B7D-VxhG;)@e|Jnr9bNnvGcD`w6(4Syw3Og*awk7Bx-qMK_#?kqtFhbC@o!Zdjf7`HV_NRHITR6sH;FzoHu zXTz+BK61x+4MbPdI#@ugUL7mhj@f<=@w|dy&g)YDMyZ9)q_z8`=f-60f!z}?QZ!hO zr?(wQtBF)mB!NVotQx`y9TB;aHv5))4 zqHIKF5&+QF38i%6PG#PVj3UPiALXJ5b(VOrtn1=ymo25d`(>WoaHM&lUTDlc!=irQ z&p)3HPlP)>9g_FY{8_?qYRb-7+rOz<8T1qIZ9mD?>tZNvT=0PM9<$x$weGfz5$vVA z!P8rT8ROoqR{AU9gr5g;EOG0HJ%u8>`*s!b8dY7HtI(+GjNij#*q+HtMRZOD3&Z}L z^hGl=C*SxWUiVB%{;SIxovnTxnqT1%r;>O(kde9g~8e9$k?Di!U8OC`Z zU;J-7uKl%Ux&_v*p3$2q53 z(m=xXTYqDMNmpQ;m6r2W6+-jFPyD&i^bsC(J5V&}=k0pm>jZei)5g z7j8cp?0kv2bX|}pGmy|MZb6+=dM!9cZ}5@}>%(ESD`7_0vO+wgU~a<>-}94mGANP! zAJ^{)|9$)a6QZ#U{Aiki1OTRq008uV7XP)3T&>LQ%^ClzXZ|mcMteFMmlLfQ^Gz_a zotGfKH{CNtuuwSG;;RVhz>FINxzxVa&skY1G-;jhr)FG}?hzM{bXZ z5RpL;y*?{Ym{_GrZK5l^_0NaoDkn*BDzSfAf-h7{j!>j$nbv{n8vH9dPO(&+JTp_JvSc4Bg7*jquhX3!(91agRm=n(UXDue+8n zi<*6PxGqTY_|!{wgse}4r9NAwJFU%uR{h_Lbhw`IG6c4|FEYLQ&u@?hA)vv&9(X0x!4%Qg<(t)XS*CKMeguTlrYI?(adxKe@c3}{@SCdkFqc(QuD7%3qH zN6bM$s3E2zJ@-z73No-W7dzR!*s4L`p|dn+&@ zYaH{ty}vFFJq$lz7gd2>(>qi^!aXEsCtz+cFIx(*NiI<0;*u;JWJvp``+*CVFuAGl zv$#=#BRx?Nh8|R=d)KI0M;*xT*?9N?Nsw%y9mp1(2$-i3^{JAD{-Qj)<}&R4SgYd9 zk@f~Yw4$Ah2WQtt{wT{;y}UW65ahHdi9UiK8ZpQptXyI<^Hnor@EqD^-2Pbs;J$^)n4*6Pfmb z09sJ&!F+Hme25D%|0Su2a6m9u>-dHRFyu;}>OO7h63PrYON!!VDtsd%#UhcP=NytL zrY6T3&8$BEHWN-68lj>tso)4f+5Qv?>#*aUKq9&Jx%D(Mm&%u zY^=0vT7-1C1-nH;((($ousa?psLUxCsqH-zUeR>$i24*aXo|*X%px0iE=GMR4k^7k ziN=90Zg>NFjr3p8CpB?2VLyvzXps+|%Nh%G96I=H#|%QB%Km z9Yipz&{P{2pDu=s-CoW$QBhT7Nd6?U)p*M_Y2Y72%c?9MT$a*G8}2}7_N2A`%hrspHeC>U-OVDM8r zgJ0XmdveglA9(tv9;+y@*S4IQzE`+2%f9j#yyzXO!1|ohUzRtoD$TRPa_w~auC3~= zI7Ih1feLBQo|Q8T9@#2@sHW|j+IRB`ili+a{%&afsug-e!vG9eeJp zP1_HtarVvl*&lDT=VolVMfGE+(%C=gluhvc6zgo}+R6OTtfNc8+z@X7Bu z237>gElhC>{i(2?B_zB`n5~n!@9C$J-pp>VZJ%)SDjRwygvPcxHG^ayUKOFgWUtUS z5>Ci;z|Zos91_om%vP>Kh*a0F(A8yZJw4Hq@K@BRav^Cer3q+eTGj7PlVb_G(l<|s z0-eHk;6~iE>0EfM+zehXe*&id7^Y{dy!=t9Uy~;LX)}K5)+M<>MFK6mWOfzwQLl(G z>G9kVaQ^AnM$`N|##-xF2LgkQ@#k01!C`MNl$Oqs@iZ>mtFI|Wo_Gj zxAZ*Gm9=v9f2XXQ%y-eS{`~-f3JLH(DN+}6S66EXOPBuyzb4J?_-rmDf5FqQ2&&n1 zct|*KH{C=+z1A}W#c7S_i@;#+z zX72(K76bkG`BYYdd76FhD)*IrMvqf6ocLcP>I$$1^XIpT6StkMz28qqNV1h^<>Sao zYAg(m=eGtGJb4P{EoJ11nq8&nfw~MbMLWtDrR2!Z^8DkT+Ni@E)kbGrZupc~9&8`p z8<8|uR21Zs;OV|qP0~}Xx@q%Qzgv-hueGvk-&IpvmrTW}3IlI~G=I$6tTm7P1Q(VdWZaMBiKEuayRZ9T$=ewk4DT!28O`A)9m|fU();}^`fYvB%8*&* z8-T5!UtG~b9(D1PPu*;?jh?~uVHRs0l-AQdU`Cn|h$DbZFi9M7SE8F%=};JNM;dA6 zBcZuqjn6!zJ8Y}uMYQ8`!b6{P`I9KK$V5LGE^Z@OQ|6{H^kJ$=68P~j^uZIKthyO{@r z4dQs^f>yrrLzEO9Ok$BDU?Ak0TZp#%o^tGYv%Zv?yfTRCvuIk5Vj*~`RSufLp%U-z zW0K?FBcs=WkVb8ZyfXD8Qp{XakMGWWiobIteSu9lZFKQcO}=w$Mx|fJKN6UHx@59F z3h*-;X5jr;V-#?yRj7<3gmB@s=3hDqtWfXg zjC)JBTIg?})Z=>c!W*X!klpMKm6U>$g7lTSU~3Labk+ zFVO4tt0E8@9T_*e@J_L3teYGKnY~R5b?U;gw5I2tdUa3Xn)x%{Lt$T;dnQ11)6F&T z%K?gvmiYDLKUF4MyQp4*j;^6T!ph-;H?k*w0HMm32 z!7V@r2oN+0PO#t!gS%^hJIS}t*?Z^gv+i2o?>n`oS9iVrJT+7EcFkK|PgjAl{09%X zpxcBNTB~D$6Izx7Z>VNJy7OU0KCgiyr?@7~@+h!z`S0PrqL&=?^G0Mreac-N@1Fu7ZdCFVPqg-9+M%}s*R#gaCW+NqhHU%%q~s{@IfRT5+E(d?#iinI(rB)5m6eNm7?=X;;>5 zYQ=tY_liw6Mg^q%L$7S5Us`9BX5>6xW5#WKZ=WN0Tx;8zuoU#PD7U?=gz1gMfJVzH zVS`wh(X{RNS^|+zSi)h_I)|CI*ctN*1`BOu5hE`{8?eP6D61~ zb$aJaC45(&-&JRFv#JCZ1KHrU33-gZ99e7FeJzbx&3`5|c}&Swr3fY?E`t`tO;a3% z9*TK9`nGQXqU+c}6|3K{(a*{VX!mxXCVO06lOiU@LKs`7z9_he#wHh7rmI{}=iMJX zzz-6e#G=2V$d5NWnQ|$-&mXv5E7`czYL|4HQ%DzY9%m!{as4Rue0O*@FJ(Lr;JJCJ zrjaX}2)~bS)JfQez!He`^}&MaR(vVGFaWiyg@pz}jyIa-9(* z?00&Q&&8arqIvA6Xp526m?GB$`MSYmRMzKbt&^#k+AuVVi<9?k$G=&=%vqWE8NtD; z#fKECr$16bL^=8suSrYN^iuk#=&g&NyhH6KVXijaSWFG>$}5&ud+KB2o8; zGMKg0#as2PZ_;(C+CF2)&JPk>46vxRsa3PtnVmi2#qpg)7C0M0mn`a`nG>CY6oKcu zbjE--klG%L0GGU(l{Q~ED1V&Q=hP#?P(W>6>5VOCvr|QBwQp|vYJ_@u_ z-e*b;TpfhRt8V)C3DcUgY0E{v$HJV=@(vZu9pb;T6Q?x^m0Z-``*NhSxj7G&6pf6vZ{g}WLTn+v791TCER}@Tz^1LD@6mjL_wV$&0AEH|ahyWtjJa)6 zh&}j@3-4upjF*y}xosbjEe1V}+uiu%(v|pYm-%xzi{)4uuCc?~Ogd~5YY44-B zFRnH8J^NX_I61?h-ey$vcP4^Djz!GaK7v1aY7#(r3q$%W(h+S_%0};Ejzj^`Mvizb zBuC_)z66+R=IwJBYUbZT)%8UAeLX3!r8rW9+^g&9`8-DuqNvF@j> zZ5{&;0gr(L84dp5iTrUGkV>NHtqJ_#eQNjO!l|U^hc0ZfNuA+)x84;~MK4}MWA?T; zpF@hr0#vZ=_;udZI|Xr6DgpIsH3|FqTN|;b8zpC&n6iIhayKVrf70fa(PbC}FQFFw zKy*Bm!=4VuqG z2N}46ta-p{7Hw{~*Rw7|Ojm<%0VLb~6W_oo#V^2xQLASct6QfdyEdB&(#G}t=Fv~D z`zH{`5`*v73m&en@8ygEOaNRasq8M@2vUXEBHa#QiP2tUlg1w`4XYK4!lZM_#O}Q9 z;>L}8wT9xD{qFrSz9cMe$F2Z?NEVe|W;JF^<{0#K7m-=(nqA?3B47SfVA4jcZ`yO) zOQZ1vIg*+u_$Po2)DC5=<>dZt3Tqw|sUhKEAlF8}+KH)SB7Z7BZSI!IkR_sP3?doA z3(%*o?>wWN0}V~p&^~+|o5q3YVu}P48+$m7=$B*(mBHnbgYP5K6y%ydbn8%2vl~3C zM;;SkSg~On=I>ofhg@ErnS1z%03w zc>lR;``li4FVU|k_>;!1oT3oHcO?lME6p`~-rLMMx%QHW_at^nsF&QjGy})+dv%VU zNPR$A6F$uNZysDb)&t+T*x)+ZA71*dyQc&Pf%###1WllRJ!h^;weHzJ! zXQ><-VzdjPyde&~&GSZIIt&|lX7bEWUk;TEX6BaDFg0Kl^}ao||E^U5v9zLhWTg1; z!>E@Aalo=ZrO?Hmp?-02^RT4u*jE1&HmO1zXY0x7LZTkW6LKr=gnY?&?BGE1uIS<@ za5MF!9Mv&5W`6LfUz6U$t;7B&Z#mXM?R%O$l_pYVzq1{KPtVy725TN`_J zVf-0X3erj4I}b$cNlAfmo;$vuJG|v?@&Z7Tc-s;ViisC`{6%hj_aFghSjHkO1ESY- z5A`M_jm9}X*ZcS;VE;fAe}_WxFjyJ7p#9t`fN?I{?kLG>Ptc-D?oH+VNd39RK#K=+ zQRKIX)!Y{OqU>1*QjO+4Va{9JEfR_}$c!)WL?4Qe?ifF7Vt%;JuctD;Jif*^>HdsA z<{6_G!G0=pY1a6~b4vD`fUoj!bx;nL6lp$=P}C6hNj*6hxwVC}#*mMKx_2-YX4=zs z8+vsA3NFZ(uw4lFLz-&_bHZU*_+zJ)d4l4n90j(knWGd z$AU|?zm+P#oJ1`u?qFiWwX5IF{KjPQ8VC$y0%y5Vq~yv9EO%Y^D%W!qN#AC=c}OaD zN8^DVTc4!hAUJ)7_;N#WyB(iG?;@=lDls*pcOsCgp9u~R&Cvq(Xm=!{UX(mJGtk?* zDbC-qcMyf#PljNdv*<&oU;WUhDHC^G*#SGy{emBQ zfTSo#<42x|y70;IrpQkP!x$cC*s1#UQ6wI{5TH3fCTNQrU}XJl_ddkR4JaRhQfw~s zN@L5NGZ&g+;v2*H%mlf)rH1Z!~qJUtE_QS6`ps$U2YDtz5%^nHoD5Jf6Waf zI39LWb~ec3RvWP(Z(o{S+S0xVBwgW(F>TX*e!I()lG2f`)6;3y($!tYTkuuEl44}} zl$9$(N8x)fQHq9-#r}lBz_f)B=Ri(YU;4V8g3{Lw%dU0n^!HYNGZO?Ye#;Y@yPiXh zD!ANw-m_b1Hct0%M~|sSeP%Y~9H0Z%3siPab89bcoQ#{8)4n~ynx01Zq$)H5*>8{j z=#;3DM2XDEBBW?z)Ji{(bP*jAKb=KQ$nO=f+s-1jIGeJinsj=pnb~$XgA6{v_3B6? z1oLl7Gme(uhF8_FLTNsZejmV|5IKO*i=xsA0l`n#_!YvLgcLWwu0?E7l`P_I*&>gj zL2Qxt-6k;RZ-MMltAsWF!w8Fbd0<3mv?g_WB>i%6YYbI(VK7#`f-_XJln!-1?;=<|E0Kp!tKO{|9_O{bZ^J4(dPV?_FGl{sMtlj zNn2yyjdn7wxjC> zuUkS1ud{q^qsB3rEr&9=z{P2TDeVC#E|Ygipk>I& zFhQSc`rKCh`9%VEop1}*xZ09d{I**LPpl0EsF%RcF)CslM{Gx45pCr)50$NQ62$)!ZD2puOtBc zpt*D!wiosnQ;OKXP1yMF%TajlwPjgdV5GFubV~AC*!cCoNb$O3?ctzm0%0d*6f(^k zHC3zz+QnqGSzOHFNWCR>AD*HdcSvNH+rX|TQ9ryA)DJ0BB(KUxy4W@IZa%2*Fm%wLiU`Qof3@Jy zj|kUw4M}r08(7o0E0ag$&#xq96s&%Zqft|r-D+Gv?6?H6Z%!nJZs;DESo%@v@Oo{ccp~VeHj_lctj@HC z`JKGpHxV6$kpn^lY9m6UyR{m~-dhy3z5+To0jz>LZ~}KO8CF?e;t< z#elxUjOoM__XQ7f*L4wByu@l_zO1M@;TO}p35r>FVP9^jrEU-v+dIEqjV8ZRFu0%p zSlqr6UOC%xrZ?+hpcGiPw(Hq6n*VH=K(_hEv7k1&X3`*9Pb$KdYmE zu*0EeQ<2Mk_ZzUw&BNw)I1tcdSFl}$rTPN8KbqO^Ey-Ju5;p?n9*wDFD;2_e7fe!h z_|)eNHksG@xbiek``toi`oYB)XDpQNNB4V3P4d)`p3HMs8#E8p(g#xciPUV0ve+;5 z_@ylbv**=kmo^YJCxXxQj~Uy@Xz+O**$a@0?u^Mp^719trJ2Obk~fQNl!1wy62S`o zI7UUXUra0QxF}SkVbcXl-3VWG0A)vCRoJ%rX5t;2#=5FSM!uSiqT$DruQ zycewqZBH?^7vmxrjec4W?&u2Rq7f-xQHy^34#MiE3Iuua9@_1!s)zHRCdV!{2s-3M zkQqo%KBerT7~~RUmeu(vg+6RZsI|^5;#o#jOCpgr_*qvk%05aq(z!r=3!mzDcBg>6vvJ)v#fK4^ zpkhleUR+{kn=&0S{Z=ft&jJP$anzZ>XFj_Aa+-uEok!$kiUDGuaNQRY1B%OU@53Vu zWCI_Cu!x#w_764Ud6QQ0P(Ix=_D4Tx0zY{RMk%gDQp!iAy$U)4abyMe)eSOG2<=#Q zx+!J$PTySZ>a}ZZm~yoH|LkW5R55pO5?5R}noz66Dwn)_;oja>XBiR13=Pm7Z6#dF zMfZwuiOTDYyx7>dYj?o!jsgoJ`75nh>;}8cUf`SLB zRZ?){0fi5yfof|dAXa+J)MU*q6r+O}K!O}od{V0D!K}|3hu0_OgZ}I!0B%jn^|Pui z%IXx`GPk;%K76v!3jQj)AJ+|AivvlgizSAIp>~F$v72vqt3V=LX4g2&lLcH1{g4Kd zN#V;*GQJHUPIfyNiK#7*Y*j0Gsy1`OyW_Daf>(Ekp(1lo{|)F#%w=2$u$vJbH&y^7_X1mU;A1%nqf#I4kQ#?I7X_(D8Ntr9Hj}eQ*w^G`_XQQp9y{xqJ zv`W(SE6!E{JR|&q_rU(LsrH7}Kztv8V-K zVE&UZt1D8zs2n!)w;9a(?*SES*aZFBX6BAozXnv&3|toIh+eF)>`Uqfy97mh=%iGt z7}()1^v>9gAwkK~Vv*M_4t+kT?o5G*EA7?ZrjgNp;|rAdmb1WV)1A$#Qkog@*^3GV zXqu(fkQ>oeP*5vx_6D0k#9Z^W9fVQzeO}O=``aucO)_e!kU|zJwcan8_5^Yi8DWe; zoa_v58ehjSBEGXHcPlY|Hcg!VUM7Nzwb+`p)D3_}8P^uZ5c|^ngzh+o(9H1RDLxAD zu_uVV1MI|A=u|Su&dfVOu`#SS0*#76eDqnAPPHLfpJapL(PZ4G!Z!f~?F2S;FFI-I z_Tf-69q-7BBk-li1S2(c0?|(GoZV8Aj^+?GGhTMt8IGR|h zM|=7yb7a!;I^gdjSrBwJW%a4)Kfo=z-QtMk2O3~hjK73OZ+!Bi4}2oFn z(s%mVGTn~w)g#$yl#Y)O+~Dr!kHamwjOpnGKyU2hO)uPNN10m4#p6(2_doXXG-A6& zr#?#85K`WC4HdR(58`JbLvh&3o@yo*fGj?*O)Bzc@JcIFD%ahX!>Zpv+kJ}A{>6=H zziFFVp&lBSzzC|s2JlsKdO8BMR6UPgIle4NW^~hxRVk2OM5HGFp~-3mZicXQOc%k% z^FA@gi3A1u`wKjY{Ny^-bx20uWz)CxNMO^Q(W6x}m34P5E}ou@OpJX-e#7e3P;R-F zC`KFeH94t|5vksx3FHc~0PWUp$|hZDQ`n9n;OfVEV?l5LgG;z!qalGLy-!*a*sA1M z{A|Qfe+sOozlA+LKO}1(nVk?pjl0z6J1gsCwwFaVDtjd5)jxjFmFSvqg&n8>g^T+yw}#Da|9@gtSk>hwuc-C#;C)8vwHM0I!XvlOG=7zN&w}N=F4>e4w zlj4XJd&qQ5Cz!fA2g_9}o*bE{qCkbeeqaaXXo|~IZhFF|zof+L!vW9-xs}Gzje@P? z1quxsQF4J3)Fj`~7!$tf^GtOxh$=prSK7&YOaU*?OGGCn8Vi+#W~-0P1hJBcM) z)S=)iGHL+hreB)srN46j`X=~+EM&A`CA%uZaK>0e%5T7S2f|J_cJ%lxX^FEdwCw_- z7O0Gk2UeC~)aI;77V5^riemG18m~2^>cuW#mN6B|j@=Df4j z5);oR2|+Ny(=w3)nub_A*RGxpiD{Zc^HSXqcw=(fNe`IXk~W~Ho52#bp6C0J;PUDe zPhFPdY0rR{pVS%qK#`~vH#F```jb4&4j@KverxR{#zCI{em6tT^J#%sLP!fT0qZQ? z?%Iab$02eC;N-5W#azHP`m)qvJP^-hvcP&qbWbElcUd^KY(hLLX6x}cgVsn^XJJbC z5Vyj3^G-t!5lY?_D$Cax72!q39PE`NWcL#R#c4qMx~n1`*9k9O08V~v$p<~-o^2X2 z^*U+jW4+I^_j(2=>SNKg^mCZ*v?Y?jW3wb>Rr;03cBZl_czOKz!nZ0%sITol7*wS6 zGu9@&84qXJq2*uo(+f<&*W8_1*FfRqOt+XL!}@v^CCyKPZKrk&ciKp`8(toXfRGU` z%i~KIxa}-VFX8Pe^doXkR7lL1tj5~g-DZ>TUVRC&tfa(gO<05zlBm|{c4;Rae14QX zEdNk+Tz{%rdcCN9?$n8-mv9&ro?a%ZV#RQUxI~}niE!s|IiBxuuUgXVPeWT%US~rb z)Ju~mq94935M2QKb^l)z4v)YI3mE@-<)B|q@R!X$tRtkZ^ml^4uWj-dEL;KXV*ATV zC%*%KU-aQmXctU<@*hh;{0{&7QxkuJ;ov%i{(%3NXDEJW`dv8xCl`|Ff05DuPVu|+ z@J|Y4=|3p`Dn$Gp{=4MnPq?!DAMn2kaDK=CJ|g-jeqZ@t#s7PJ^mp*@9f?1|r$&E( zf9qEK&hUGH`zJ%5$sY`V3wM9V|2>fS6AcIFWex}TA7RDs@V__ie}&Uo{RRF Date: Tue, 27 Jan 2026 12:13:43 -0800 Subject: [PATCH 10/13] Okay this should be in the requested format - we can discuss in person shortly --- 00_cNF_helper_code.R | 40 +- 01_normalize_batchcorrect_omics.R | 240 +++++++++- 02_analyze_modality_correlations.R | 239 ++++++++-- 03_leapr_biomarker.R | 110 ++++- README.md | 7 +- analysis/01_run_normalize_omics.Rmd | 200 +++++--- analysis/01_run_normalize_omics.html | 659 +++++++++++++++++++++++++++ analysis/02_analyze_modality.Rmd | 181 ++++++-- analysis/02_analyze_modality.html | 591 ++++++++++++++++++++++++ analysis/03_pathway_enrichment.Rmd | 177 +++++-- analysis/03_pathway_enrichment.html | 590 ++++++++++++++++++++++++ analysis/README.md | 528 ++------------------- 12 files changed, 2887 insertions(+), 675 deletions(-) create mode 100644 analysis/01_run_normalize_omics.html create mode 100644 analysis/02_analyze_modality.html create mode 100644 analysis/03_pathway_enrichment.html diff --git a/00_cNF_helper_code.R b/00_cNF_helper_code.R index 9ae26c5..388c35a 100644 --- a/00_cNF_helper_code.R +++ b/00_cNF_helper_code.R @@ -3,7 +3,7 @@ library(synapser) -synLogin() +synLogin(authToken = "eyJ0eXAiOiJKV1QiLCJraWQiOiJXN05OOldMSlQ6SjVSSzpMN1RMOlQ3TDc6M1ZYNjpKRU9VOjY0NFI6VTNJWDo1S1oyOjdaQ0s6RlBUSCIsImFsZyI6IlJTMjU2In0.eyJhY2Nlc3MiOnsic2NvcGUiOlsidmlldyIsImRvd25sb2FkIl0sIm9pZGNfY2xhaW1zIjp7fX0sInRva2VuX3R5cGUiOiJQRVJTT05BTF9BQ0NFU1NfVE9LRU4iLCJpc3MiOiJodHRwczovL3JlcG8tcHJvZC5wcm9kLnNhZ2ViYXNlLm9yZy9hdXRoL3YxIiwiYXVkIjoiMCIsIm5iZiI6MTc2MTgzOTMwNCwiaWF0IjoxNzYxODM5MzA0LCJqdGkiOiIyNzg5OSIsInN1YiI6IjM0NTM5NTUifQ.OkzJaWKY7po87mP_iKDNP8ah--BsZpb-mH8U1wP2kj2yqnxi65yi1irAgNNITXB-mQ84ndpdV-9SfNCpkc-pUIenHADpLTMPbh2dMLAiCS77picR-bLlb79c7e005L-7EWuMBPh7RXAXFOcEwY3823Z-RsjKatGYZXCiGaC_I0OPuawh1c3y_c2VVsZUFMdfXJaQH4iaMB5wK9OA_OlI_mnwaYrL1C3ID6ZCuGdO8cR9-fwgu5NzZPq5OOyn11BSblb7NytwaEhhuteHediXBnhGSmfSCUCpQvdpppb03_-Q8DK21oyAgUSygn_h7QFIqtdfzyb8mg300D49al-yAg") syn <- list(get = synapser::synGet, store = synapser::synStore) library(readxl) library(tidyr) @@ -28,3 +28,41 @@ pcols <- c(NF0017='steelblue',NF0021='orange2',NF0019='orchid4', NF0023='darkgrey',NF0025='lightblue',NF0026='yellow3',NF0027='magenta3', NF0028='lightgreen',NF0031='pink2') + + +# This function is used in the notebooks. +# All it does is convert a file from pdf to png so it can be easily displayed after Knitting. +pdf_to_png_if_possible <- function(pdf, dpi = 200) { + if (!file.exists(pdf)) return(NULL) + + png <- sub("\\.pdf$", ".png", pdf, ignore.case = TRUE) + + # Rebuild png if it doesn't exist, or if the pdf is newer + rebuild <- !file.exists(png) || (file.info(pdf)$mtime > file.info(png)$mtime) + + if (rebuild) { + if (requireNamespace("pdftools", quietly = TRUE)) { + out <- pdftools::pdf_convert(pdf, format = "png", dpi = dpi, pages = 1) + if (length(out) && file.exists(out[1])) { + if (file.exists(png)) file.remove(png) + file.rename(out[1], png) + } + } else if (requireNamespace("magick", quietly = TRUE)) { + img <- magick::image_read_pdf(pdf, density = dpi) + magick::image_write(img[1], path = png, format = "png") + } else { + # If no converter is available, just fall back to the PDF. + return(pdf) + } + } + + if (file.exists(png)) png else pdf +} + +# This is used to display plots in the Markdown Notebooks +show_plots <- function(paths, dpi = 200) { + ok <- paths[file.exists(paths)] + if (!length(ok)) return(NULL) + show_paths <- vapply(ok, pdf_to_png_if_possible, FUN.VALUE = character(1), dpi = dpi) + knitr::include_graphics(show_paths) +} diff --git a/01_normalize_batchcorrect_omics.R b/01_normalize_batchcorrect_omics.R index aa00a06..d263a21 100644 --- a/01_normalize_batchcorrect_omics.R +++ b/01_normalize_batchcorrect_omics.R @@ -47,6 +47,13 @@ suppressPackageStartupMessages({ # Small helpers modified_zscore <- function(x, na.rm = TRUE) { + # Robust z-score for a numeric vector using median and MAD (less sensitive to + # outliers than mean/SD). If MAD is 0 or NA, returns all zeros. + # Inputs: + # x: numeric vector + # na.rm: TRUE/FALSE; whether to ignore NA when computing median/MAD + # Output: + # numeric vector (same length as x) m <- suppressWarnings(stats::median(x, na.rm = na.rm)) md <- suppressWarnings(stats::mad(x, constant = 1, na.rm = na.rm)) if (is.na(md) || md == 0) return(rep(0, length(x))) @@ -54,11 +61,22 @@ modified_zscore <- function(x, na.rm = TRUE) { } filter_by_missingness <- function(mat) { + # Filter features (rows) by missingness: keep rows where <= 50% of values are NA. + # Inputs: + # mat: numeric matrix (features x samples) + # Output: + # numeric matrix with a subset of rows retained keep <- apply(mat, 1, function(r) mean(is.na(r)) <= 0.5) mat[keep, , drop = FALSE] } union_rows_fill_NA <- function(mats) { + # Align a list of matrices to the union of all rownames (features), filling + # missing feature rows in each matrix with NA. + # Inputs: + # mats: list of numeric matrices with rownames + # Output: + # list of numeric matrices, each reindexed to the same union row set all_feats <- Reduce(union, lapply(mats, rownames)) lapply(mats, function(m) { mm <- matrix(NA_real_, nrow = length(all_feats), ncol = ncol(m), @@ -69,6 +87,12 @@ union_rows_fill_NA <- function(mats) { } collapse_duplicate_features <- function(mat) { + # Collapse duplicate feature IDs (duplicate rownames) by summing values across + # duplicates for each sample (NA treated as 0 for summation). + # Inputs: + # mat: numeric matrix (features x samples) with rownames as feature IDs + # Output: + # numeric matrix with unique rownames (duplicates collapsed) if (!any(duplicated(rownames(mat)))) return(mat) grp <- split(seq_len(nrow(mat)), rownames(mat)) collapsed <- do.call(rbind, lapply(grp, function(ix) colSums(mat[ix, , drop = FALSE], na.rm = TRUE))) @@ -77,16 +101,33 @@ collapse_duplicate_features <- function(mat) { } make_dropper <- function(substrings) { + # Build a function that flags sample column names to drop based on one or more + # regex patterns. Useed for removing protocol optimization runs. + # Inputs: + # substrings: NULL or character vector of regex patterns + # Output: + # function(x): logical vector; TRUE means "drop this name" if (is.null(substrings) || length(substrings) == 0) return(function(x) rep(FALSE, length(x))) pattern <- paste0(substrings, collapse = "|") function(x) grepl(pattern, x, fixed = FALSE) } # Functions to clean up irregular names +# Extract the filename portion from a path or portion of path (drops directories). +# Inputs: +# x: character vector of file paths +# Output: +# character vector of basenames basename_only <- function(x) sub("^.*[\\\\/]", "", x) basename_no_ext <- function(x) sub("\\.[^.]+$", "", basename_only(x)) normalize_specimen_like <- function(x) { + # Normalize specimen strings to a consistent form for joining (lowercase, + # remove whitespace, unify separators, normalize organoid/tissue/skin labels). + # Inputs: + # x: character vector + # Output: + # character vector of normalized specimen-like strings y <- tolower(x) y <- gsub("\\s+", "", y) y <- gsub("\\.", "-", y) @@ -101,6 +142,13 @@ normalize_specimen_like <- function(x) { } parse_rna_header_triplet <- function(fnames) { + # Parse RNA sample headers expected to look like "sample.T1.condition" (dot-delimited). + # Extracts sample_id, optional tumor (T#), and condition; also builds a normalized + # specimen key to help join against metadata. + # Inputs: + # fnames: character vector of RNA sample column names + # Output: + # data.frame with columns: fname, sample_id, tumor, condition_raw, condition_norm, specimen_norm toks_list <- strsplit(fnames, "\\.") out <- lapply(seq_along(toks_list), function(i) { toks <- toks_list[[i]] @@ -149,6 +197,12 @@ parse_rna_header_triplet <- function(fnames) { # Functions to get data from Synapse read_wide_from_synapse <- function(syn, syn_id) { + # Download a Synapse file and read it as a wide tab-delimited table (features x samples). + # Inputs: + # syn: Synapse client object (synapser) + # syn_id: Synapse file ID (e.g., "syn69963552") + # Output: + # data.frame containing the wide table (annotation columns + sample columns) message(" Reading Synapse file: ", syn_id) df <- read.table( syn$get(syn_id)$path, @@ -161,6 +215,13 @@ read_wide_from_synapse <- function(syn, syn_id) { } detect_value_start_col <- function(wide_df, fallback = 5) { + # Auto-detect where sample measurement columns start in a wide table by looking for + # headers that resemble file paths or RAW/mzML names. Falls back if not found. + # Inputs: + # wide_df: data.frame (wide) + # fallback: integer column index to use if auto-detection fails + # Output: + # integer column index for the first sample column nms <- colnames(wide_df) is_pathy <- grepl("\\.(raw|mzml)$", nms, ignore.case = TRUE) | grepl("[/\\\\]", nms) | @@ -175,6 +236,14 @@ detect_value_start_col <- function(wide_df, fallback = 5) { } parse_fnames <- function(fnames, aliquot_field_index, cohort) { + # Parse sample column names into (fname, aliquot, cohort). Attempts to extract aliquot + # from a specific underscore token index; otherwise tries the last numeric token. + # Inputs: + # fnames: character vector of sample column names + # aliquot_field_index: integer token index (split on "_") or NULL + # cohort: cohort label to attach to all parsed samples + # Output: + # data.frame with columns: fname, aliquot (numeric or NA), cohort message("Parsing filenames to (fname, aliquot, cohort)") rows <- lapply(fnames, function(fname) { toks <- strsplit(fname, "_", fixed = TRUE)[[1]] @@ -209,11 +278,29 @@ parse_fnames <- function(fnames, aliquot_field_index, cohort) { #Feature ID builders ##### build_phospho_ids <- function(df) { + # Build unique phosphosite feature IDs from phospho annotation columns. + # Inputs: + # df: data.frame containing at least Gene.Names, Residue, Site + # Output: + # character vector of feature IDs (one per row) lsite <- tolower(df$Residue) paste0(df$`Gene.Names`, "-", df$Residue, df$Site, lsite) } + +# Build global proteomics feature IDs (gene symbols). +# Inputs: +# df: data.frame containing a Genes column +# Output: +# character vector of feature IDs (one per row) build_global_ids <- function(df) as.character(df$Genes) + build_rna_ids <- function(df) { + # Build RNA feature IDs by selecting a gene identifier column (tries common names + # like gene_id, gene_name, Symbol, Ensembl). Errors if none found. + # Inputs: + # df: data.frame containing a recognized gene ID column + # Output: + # character vector of feature IDs (one per row) cand <- c("gene_id","Gene","gene","gene_name","Symbol","symbol","ENSEMBL","Ensembl","ensembl_gene_id") hit <- cand[cand %in% names(df)] if (length(hit) == 0) stop("RNA feature-id column not found.") @@ -222,6 +309,11 @@ build_rna_ids <- function(df) { as.character(df[[hit[[1]]]]) } pick_builder <- function(modality) { + # Choose the correct feature-ID builder function based on modality. + # Inputs: + # modality: "phospho", "global", or "rna" (case-insensitive) + # Output: + # function(df) -> character vector of feature IDs m <- tolower(modality) if (m == "phospho") return(build_phospho_ids) if (m == "global") return(build_global_ids) @@ -232,6 +324,12 @@ pick_builder <- function(modality) { # Functions to Normalize Data. Uses SummarizedExperiment coldata_tbl <- function(se) { + # Convert SummarizedExperiment colData into a clean data.frame with consistent + # filename fields (fname, basename, stem). Avoids name collisions. + # Inputs: + # se: SummarizedExperiment + # Output: + # data.frame of sample metadata; includes fname, fname_base, fname_stem cd <- as.data.frame(SummarizedExperiment::colData(se), stringsAsFactors = FALSE) if ("fname" %in% names(cd)) names(cd)[names(cd) == "fname"] <- ".coldata_fname" names(cd) <- make.unique(names(cd), sep = "_") @@ -243,12 +341,36 @@ coldata_tbl <- function(se) { } looks_like_sample_header <- function(x) { + # Heuristic test for whether a column name looks like a raw file/sample path + # (e.g., contains slashes or ends in .raw/.mzml). + # Inputs: + # x: character vector of column names + # Output: + # logical vector; TRUE indicates "looks like a sample header" + # Example file: + # "I:\UserData\LeDay\Piehowski_orgonoids_Feb25\RawData\1338241_cNF_organoid_DIA_P_01_29Jan25_Ned_BEHCoA-25-01-02.raw" grepl("\\.(raw|mzml)$", x, ignore.case = TRUE) | grepl("[/\\\\]", x) | grepl("^[A-Za-z]:\\\\", x) } make_se <- function(wide_df, value_start_col, feature_ids, fnames_df, meta, drop_name, modality) { + # Build a SummarizedExperiment from a wide feature x sample table: + # - selects sample columns + # - converts values to numeric + # - drops unwanted sample columns by name pattern + # - attaches sample metadata by joining on (aliquot, cohort) + # - applies extra RNA-specific parsing and metadata reconciliation + # Inputs: + # wide_df: wide data.frame (features + sample columns) + # value_start_col: integer index of first sample column + # feature_ids: character vector of feature IDs (length = nrow(wide_df)) + # fnames_df: data.frame mapping fname->aliquot/cohort (from parse_fnames) + # meta: metadata table used to map aliquot/cohort to Patient/Tumor/Specimen + # drop_name: function(x)->logical; TRUE means drop that sample column + # modality: "phospho", "global", or "rna" + # Output: + # SummarizedExperiment with assay "values" and populated colData/rowData all_candidate <- colnames(wide_df)[value_start_col:ncol(wide_df)] has_pathy <- any(looks_like_sample_header(all_candidate)) if (has_pathy) { @@ -258,8 +380,8 @@ make_se <- function(wide_df, value_start_col, feature_ids, fnames_df, meta, drop sample_cols <- setdiff(all_candidate, c("Site", "Sequence")) } - message(" Candidate sample columns (first 6):") - print(utils::head(sample_cols, 6)) + # message(" Candidate sample columns (first 6):") + # print(utils::head(sample_cols, 6)) message(" Casting measurement block to numeric") raw_block <- wide_df[, sample_cols, drop = FALSE] @@ -350,10 +472,10 @@ make_se <- function(wide_df, value_start_col, feature_ids, fnames_df, meta, drop } message(" - Example cdata rows:") - print(utils::head(cdata[, intersect(c( - "fname","aliquot","cohort","Specimen","Patient","Tumor", - "sample_id","tumor","condition_raw","condition_norm","specimen_norm" - ), names(cdata)), drop = FALSE], 10)) + # print(utils::head(cdata[, intersect(c( + # "fname","aliquot","cohort","Specimen","Patient","Tumor", + # "sample_id","tumor","condition_raw","condition_norm","specimen_norm" + # ), names(cdata)), drop = FALSE], 10)) rn <- cdata$fname cdata_nofname <- dplyr::select(cdata, -fname) @@ -372,12 +494,27 @@ make_se <- function(wide_df, value_start_col, feature_ids, fnames_df, meta, drop } scale_columns_modified_z <- function(m) { + # Apply modified_zscore() to each column of a matrix (per-sample robust scaling). + # Inputs: + # m: numeric matrix (features x samples) + # Output: + # numeric matrix of same dimensions (column-wise robust z-scored) out <- m for (j in seq_len(ncol(m))) out[, j] <- modified_zscore(m[, j]) out } normalize_by_modality <- function(se, modality) { + # Normalize a SummarizedExperiment assay using modality-specific transforms: + # phospho: 0->NA, filter missingness, log2(x+0.01), robust zscore + # global: log2(x), robust zscore + # rna: filter missingness, log2(x+1), robust zscore + # Also collapses duplicated feature IDs. + # Inputs: + # se: SummarizedExperiment with assay "values" + # modality: "phospho", "global", or "rna" + # Output: + # SummarizedExperiment with normalized assay "values" mtype <- tolower(modality) mat0 <- as.matrix(SummarizedExperiment::assay(se, "values")) c0 <- colnames(mat0) @@ -415,6 +552,12 @@ normalize_by_modality <- function(se, modality) { # Function to Combine Data (batches) combine_batches_intersection <- function(se_list) { + # Combine multiple normalized batches by intersecting shared features (rownames), + # then concatenating samples (cbind). Also stacks colData. + # Inputs: + # se_list: list of SummarizedExperiment objects (normalized) + # Output: + # SummarizedExperiment containing combined matrix and combined colData message("Combining batches (intersection of features, then cbind samples)") mats <- lapply(se_list, function(se) as.matrix(SummarizedExperiment::assay(se, "values"))) feats <- Reduce(intersect, lapply(mats, rownames)) @@ -438,15 +581,21 @@ combine_batches_intersection <- function(se_list) { ) message(sprintf(" - Combined assay dims: %d feats × %d samples", nrow(se), ncol(se))) message(" - Head(sample names) in combined assay:") - print(utils::head(colnames(SummarizedExperiment::assay(se, "values")), 6)) + # print(utils::head(colnames(SummarizedExperiment::assay(se, "values")), 6)) message(" - Head(rownames) in combined colData (should match):") - print(utils::head(rownames(SummarizedExperiment::colData(se)), 6)) + # print(utils::head(rownames(SummarizedExperiment::colData(se)), 6)) invisible(se) } # Combat Function. (Lots of messages to help debug) combat_by_cohort <- function(se) { + # Batch-correct the combined matrix using ComBat (sva) with colData$cohort as the + # batch variable. Replaces non-finite values with 0 before correction. + # Inputs: + # se: SummarizedExperiment with assay "values" and colData column "cohort" + # Output: + # SummarizedExperiment with batch-corrected assay "values" message("Running ComBat by cohort (batch-only; mean.only = FALSE)") suppressPackageStartupMessages(library(sva)) @@ -462,7 +611,7 @@ combat_by_cohort <- function(se) { if (!"cohort" %in% names(cd)) stop("colData must contain 'cohort' for ComBat batching.") batch <- droplevels(as.factor(cd$cohort)) - message(" Batch table (pre-drop):"); print(table(batch, useNA = "ifany")) + # message(" Batch table (pre-drop):"); print(table(batch, useNA = "ifany")) keep <- !is.na(batch) if (any(!keep)) { @@ -472,25 +621,32 @@ combat_by_cohort <- function(se) { cd <- cd[keep, , drop = FALSE] } - message(" Final check — ncol(mat)=", ncol(mat), "; length(batch)=", length(batch)) - message(" Batch table (final):"); print(table(batch, useNA = "ifany")) + # message(" Final check — ncol(mat)=", ncol(mat), "; length(batch)=", length(batch)) + # message(" Batch table (final):"); print(table(batch, useNA = "ifany")) pre_by_cohort <- tapply(colMeans(mat), batch, sd) - message(" Pre-ComBat: SD of column means by cohort:"); print(pre_by_cohort) + # message(" Pre-ComBat: SD of column means by cohort:"); print(pre_by_cohort) cb <- sva::ComBat(dat = mat, batch = batch, mean.only = FALSE, par.prior = TRUE) post_by_cohort <- tapply(colMeans(cb), batch, sd) - message(" Post-ComBat: SD of column means by cohort:"); print(post_by_cohort) + # message(" Post-ComBat: SD of column means by cohort:"); print(post_by_cohort) SummarizedExperiment::assay(se, "values") <- cb - message(" Matrix dims after ComBat: ", nrow(cb), " × ", ncol(cb)) + # message(" Matrix dims after ComBat: ", nrow(cb), " × ", ncol(cb)) invisible(se) } # Plot Functions (PCA) + more debug messages se_to_long <- function(se, modality) { + # Convert a SummarizedExperiment matrix into long format (one row per feature-sample + # pair) and join sample metadata from colData. + # Inputs: + # se: SummarizedExperiment with assay "values" + # modality: "phospho", "global", or "rna" (controls feature column name) + # Output: + # data.frame in long format with correctedAbundance + sample metadata columns feature_col <- if (tolower(modality) == "global") "Gene" else "feature_id" avals <- as.data.frame(SummarizedExperiment::assay(se, "values"), check.names = FALSE) @@ -519,6 +675,12 @@ se_to_long <- function(se, modality) { } pca_df_present_in_all <- function(se) { + # Prepare a PCA data.frame using only features that are complete (finite) across + # all samples. Joins PCA scores with sample metadata for plotting. + # Inputs: + # se: SummarizedExperiment with assay "values" + # Output: + # data.frame with PC1/PC2 and metadata columns (Patient/Tumor/Specimen/cohort, etc.) message("Preparing PCA (features present in ALL samples)") mat <- as.matrix(SummarizedExperiment::assay(se, "values")) @@ -548,8 +710,8 @@ pca_df_present_in_all <- function(se) { message(" ! Warning: Patient is NA for all samples after join. Will color/shape by cohort.") df1$Patient_fallback <- as.character(df1$cohort) df1$Tumor_fallback <- as.character(df1$cohort) - message(" - DEBUG: head(df1$fname):"); print(utils::head(df1$fname, 6)) - message(" - DEBUG: head(cd$fname):"); print(utils::head(cd$fname, 6)) + # message(" - DEBUG: head(df1$fname):"); print(utils::head(df1$fname, 6)) + # message(" - DEBUG: head(cd$fname):"); print(utils::head(cd$fname, 6)) } if ("Specimen" %in% names(df1)) { @@ -560,6 +722,14 @@ pca_df_present_in_all <- function(se) { } plot_pca <- function(pc_df, title_text, pcols = NULL) { + # Create a PCA scatter plot (PC1 vs PC2), choosing a sensible color/shape mapping + # based on available metadata (prefers Patient/Tumor; falls back to cohort/condition). + # Inputs: + # pc_df: data.frame returned by pca_df_present_in_all() + # title_text: plot title string + # pcols: optional named vector of colors for Patient values + # Output: + # ggplot object (PCA scatter) color_col <- if ("Patient" %in% names(pc_df) && any(!is.na(pc_df$Patient))) { "Patient" } else if ("condition_norm" %in% names(pc_df) && any(!is.na(pc_df$condition_norm))) { @@ -588,6 +758,13 @@ plot_pca <- function(pc_df, title_text, pcols = NULL) { } plot_hist <- function(se, title_text) { + # Plot a histogram of all assay values, filled by cohort, to visualize distributions + # (e.g., pre- vs post-ComBat). + # Inputs: + # se: SummarizedExperiment with assay "values" + # title_text: plot title string + # Output: + # ggplot object (histogram) cd <- coldata_tbl(se) df <- as.data.frame(SummarizedExperiment::assay(se, "values")) |> tidyr::pivot_longer(everything(), names_to = "fname", values_to = "val") |> @@ -603,6 +780,13 @@ plot_hist <- function(se, title_text) { # Upload function perform_uploads <- function(paths, syn, parent_id) { + # Upload a set of local output files to a Synapse folder/project using syn$store(). + # Inputs: + # paths: character vector of local file paths + # syn: Synapse client object (synapser) + # parent_id: Synapse folder/project ID to store into + # Output: + # invisible(NULL); side-effect is file uploads to Synapse if (is.null(parent_id) || length(paths) == 0) return(invisible(NULL)) message("All steps succeeded — uploading ", length(paths), " file(s) to Synapse…") for (p in paths) { @@ -625,7 +809,31 @@ perform_uploads <- function(paths, syn, parent_id) { # list(syn_id = "syn69947351", cohort = 2, value_start_col = 5, fname_aliquot_index = 9) # ) + run_modality <- function( + # End-to-end normalization pipeline for one modality across one or more batches: + # - read wide tables from Synapse + # - build feature IDs + construct SummarizedExperiment per batch + # - modality-specific normalization per batch + # - combine batches on shared feature intersection + # - QC plots (PCA + hist) pre and post + # - optional ComBat batch correction by cohort + # - export long CSVs and optional Synapse uploads + # Inputs: + # modality: "phospho", "global", or "rna" + # batches: list of batch configs (syn_id, cohort, optional parsing hints) + # meta: sample metadata table for joining + # syn: Synapse client object + # drop_name_substrings: optional regex patterns to drop sample columns + # out_dir: directory for outputs + # out_prefix: base name for outputs (defaults from modality) + # upload_parent_id: Synapse folder/project ID for uploads (optional) + # pcols: optional named color vector for Patient PCA coloring + # write_outputs: TRUE/FALSE to write CSV/PDF and upload + # save_basename: override base output stem + # do_batch_correct: TRUE/FALSE to run ComBat + # Output: + # list containing SE objects, long tables, PCA data, plot objects, and written file paths modality, # Which data type to run: "phospho", "global", or "rna" batches, # List of batch configs (each element should include at least: syn_id, cohort; optionally: value_start_col, fname_aliquot_index) meta, # Sample metadata table used to join batch sample IDs to Patient/Tumor/Specimen - cnF_helper_code.R creates this. diff --git a/02_analyze_modality_correlations.R b/02_analyze_modality_correlations.R index ff3d602..893236c 100644 --- a/02_analyze_modality_correlations.R +++ b/02_analyze_modality_correlations.R @@ -1,5 +1,5 @@ # --------------------------------------------------------------------------- -# 02_analyze_modality.R +# 02_analyze_modality_correlations.R # --------------------------------------------------------------------------- # Purpose # - Given (1) drug response fits and (2) a long-format omics table for one modality, @@ -7,9 +7,19 @@ # plots (drug efficacy/variability + optional heatmap), and computes Spearman # correlations between drug response and molecular features. # -# Main entry +# Main entry (This does both of the individual calls) # - analyze_modality(fits, df_long, sample_col, feature_col, value_col, ...) # +# Individual Calls +# - analyze_drug_response(fits, metric, outdir, heatmap_filename, ...) +# * Drug-only: builds drug_mat and writes 3 plots by default +# - most_efficacious.pdf +# - most_variable.pdf +# - drug heatmap (heatmap_filename) +# - analyze_modality_correlations(df_long, sample_col, feature_col, value_col, drug_mat, ...) +# * Modality-only: builds feat_mat and computes correlations + summary plot +# - cor_features_by_drug.pdf +# # Inputs # - fits: long drug response table with improve_sample_id, improve_drug_id, # dose_response_metric, dose_response_value @@ -43,6 +53,17 @@ dir.create("figs", showWarnings = FALSE) # Helpers make_feature_matrix <- function(df_long, shared_ids, sample_col, feature_col, value_col) { + # Build a sample × feature matrix from a long-format omics table, restricted to a set + # of shared sample IDs. Cleans IDs, drops blank IDs, pivots to wide, fills missing with 0, + # and averages duplicates (mean) per sample-feature pair. + # Inputs: + # df_long: long omics data.frame + # shared_ids: character vector of sample IDs to keep + # sample_col: column name in df_long for sample ID + # feature_col: column name in df_long for feature ID + # value_col: column name in df_long for numeric value + # Output: + # data.frame (wide) with rownames = samples and columns = features df <- df_long %>% ungroup() %>% dplyr::filter(.data[[sample_col]] %in% shared_ids) %>% @@ -94,11 +115,22 @@ make_feature_matrix <- function(df_long, shared_ids, sample_col, feature_col, va } make_drug_matrix <- function( - fits, metric = "uM_viability", - sample_col = "improve_sample_id", - drug_col = "improve_drug_id", - value_col = "dose_response_value", - metric_col = "dose_response_metric" + # Build a sample × drug response matrix from the long drug fits table for a chosen metric + # (e.g., uM_viability). Pivots to wide and averages duplicates (mean). + # Inputs: + # fits: long drug response data.frame + # metric: metric value to select from metric_col (default "uM_viability") + # sample_col: sample ID column name in fits (default "improve_sample_id") + # drug_col: drug ID column name in fits (default "improve_drug_id") + # value_col: response value column name in fits (default "dose_response_value") + # metric_col: metric label column name in fits (default "dose_response_metric") + # Output: + # data.frame (wide) with rownames = samples and columns = drugs + fits, metric = "uM_viability", + sample_col = "improve_sample_id", + drug_col = "improve_drug_id", + value_col = "dose_response_value", + metric_col = "dose_response_metric" ) { fits %>% dplyr::filter(.data[[metric_col]] == metric) %>% @@ -112,8 +144,21 @@ make_drug_matrix <- function( } summarize_drugs <- function( - fits, metric = "uM_viability", metric_col = "dose_response_metric", - outdir = "figs", rotate_x = 45 + # Summarize per-drug response for a selected metric and write two PDF scatter plots: + # (1) "most_efficacious" (low mean viability) and (2) "most_variable" (high SD). + # Inputs: + # fits: long drug response data.frame + # metric: metric to analyze (default "uM_viability") + # metric_col: column holding metric labels (default "dose_response_metric") + # outdir: directory to write PDFs (default "figs") + # rotate_x: x-axis label rotation angle for readability + # Output: + # list with: + # summary: data.frame of per-drug meanResponse, nMeasured, variability + # p_eff: ggplot object for efficacious drugs + # p_var: ggplot object for variable drugs + fits, metric = "uM_viability", metric_col = "dose_response_metric", + outdir = "figs", rotate_x = 45 ) { ds <- fits %>% dplyr::filter(.data[[metric_col]] == metric) %>% @@ -155,6 +200,15 @@ summarize_drugs <- function( } compute_cors <- function(drug_mat, feat_mat, shared_samples = NULL) { + # Compute Spearman correlations between each drug response column and each feature column + # across shared samples. Also computes per-pair p-values (cor.test) when enough data exists + # and applies BH FDR correction. + # Inputs: + # drug_mat: numeric matrix/data.frame (samples × drugs), rownames = sample IDs + # feat_mat: numeric matrix/data.frame (samples × features), rownames = sample IDs + # shared_samples: optional character vector of sample IDs to use; if NULL, uses rowname intersection + # Output: + # tibble/data.frame with columns: drug, feature, cor, pval, fdr, direction if (is.null(shared_samples)) { shared_samples <- base::intersect(rownames(drug_mat), rownames(feat_mat)) } @@ -199,6 +253,17 @@ compute_cors <- function(drug_mat, feat_mat, shared_samples = NULL) { } summarize_correlated_features <- function(cor_tbl, fdr_thresh = 0.25, outdir = "figs") { + # Summarize significant drug-feature associations by counting how many features are + # significantly correlated with each drug (split by positive/negative direction), + # and write a bar plot PDF. + # Inputs: + # cor_tbl: correlation table from compute_cors() + # fdr_thresh: significance threshold on FDR (default 0.25) + # outdir: directory to write the PDF (default "figs") + # Output: + # list with: + # summary: tibble of per-drug counts and mean correlation by direction + # plot: ggplot object (or NULL if no significant results) if (nrow(cor_tbl) == 0L) return(list(summary = tibble(), plot = NULL)) corsummary <- cor_tbl %>% dplyr::filter(is.finite(.data$fdr), !is.na(.data$fdr), .data$fdr < fdr_thresh) %>% @@ -221,30 +286,30 @@ summarize_correlated_features <- function(cor_tbl, fdr_thresh = 0.25, outdir = " list(summary = corsummary, plot = p) } + # --------------------------- -# Main wrapper +# Drug-only analysis stage # --------------------------- -analyze_modality <- function( - fits, - df_long, - sample_col, # e.g., "Specimen" - feature_col, # e.g., "feature_id" | "Gene" | "site" - value_col, # e.g., "correctedAbundance" - metric = "uM_viability", # Or fit_auc - outdir = "figs", - heatmap_filename = "drug_heatmap_large.pdf", - fdr_thresh = 0.25 +analyze_drug_response <- function( + # Drug-only workflow: + # - builds sample x drug matrix for a metric + # - writes drug summary plots (most_efficacious, most_variable) + # - writes drug heatmap (by default) for drugs measured in all samples + # Inputs: + # fits: long drug response table + # metric: drug response metric to analyze (default "uM_viability") + # outdir: output directory for plots (default "figs") + # heatmap_filename: filename for drug heatmap PDF; set NULL to skip + # Output: + # list containing: + # drug_mat, drug_summary, p_eff, p_var + fits, + metric = "uM_viability", + outdir = "figs", + heatmap_filename = "drug_heatmap_large.pdf" ) { - shared_ids <- base::intersect(unique(fits$improve_sample_id), unique(df_long[[sample_col]])) - - feat_mat <- make_feature_matrix( - df_long = df_long, - shared_ids = shared_ids, - sample_col = sample_col, - feature_col = feature_col, - value_col = value_col - ) + dir.create(outdir, showWarnings = FALSE, recursive = TRUE) # Drug matrix for the metric drug_mat <- make_drug_matrix( @@ -256,11 +321,12 @@ analyze_modality <- function( metric_col = "dose_response_metric" ) - # Summaries & heatmap + # Summaries (writes most_efficacious.pdf + most_variable.pdf) dsum <- summarize_drugs( fits, metric = metric, metric_col = "dose_response_metric", outdir = outdir ) + # Heatmap (by default) if (!is.null(heatmap_filename) && nrow(drug_mat) > 0 && ncol(drug_mat) > 0) { fulldrugs <- dsum$summary %>% dplyr::filter(.data$nMeasured == nrow(drug_mat)) %>% @@ -279,6 +345,55 @@ analyze_modality <- function( } } + list( + drug_mat = drug_mat, + drug_summary = dsum$summary, + p_eff = dsum$p_eff, + p_var = dsum$p_var + ) +} + + +# --------------------------- +# Modality-only analysis stage +# --------------------------- +analyze_modality_correlations <- function( + # Modality-only workflow: + # - aligns samples shared between drug_mat and omics table + # - builds sample x feature matrix + # - computes drug-feature Spearman correlations + p-values + FDR + # - summarizes significant features per drug and writes a summary plot + # Inputs: + # df_long: long omics table for one modality + # sample_col: sample ID column name in df_long (e.g., "Specimen") + # feature_col: feature ID column name in df_long (e.g., "feature_id" or "Gene") + # value_col: numeric value column name in df_long (e.g., "correctedAbundance") + # drug_mat: sample x drug matrix (rownames = sample IDs) + # outdir: output directory for plots (default "figs") + # fdr_thresh: FDR cutoff used for correlation summary (default 0.25) + # Output: + # list containing feat_mat, cor_tbl, cor_summary, cor_plot, shared_ids + df_long, + sample_col, + feature_col, + value_col, + drug_mat, + outdir = "figs", + fdr_thresh = 0.25 +) { + + dir.create(outdir, showWarnings = FALSE, recursive = TRUE) + + shared_ids <- base::intersect(rownames(drug_mat), unique(df_long[[sample_col]])) + + feat_mat <- make_feature_matrix( + df_long = df_long, + shared_ids = shared_ids, + sample_col = sample_col, + feature_col = feature_col, + value_col = value_col + ) + # Correlations shared_after <- base::intersect(rownames(drug_mat), rownames(feat_mat)) cor_tbl <- if (length(shared_after) > 0L) { @@ -290,12 +405,70 @@ analyze_modality <- function( cor_res <- summarize_correlated_features(cor_tbl, fdr_thresh = fdr_thresh, outdir = outdir) list( - drug_mat = drug_mat, feat_mat = feat_mat, shared_ids = shared_ids, cor_tbl = cor_tbl, cor_summary = cor_res$summary, - cor_plot = cor_res$plot, - drug_summary = dsum$summary + cor_plot = cor_res$plot + ) +} + + +# --------------------------- +# Main wrapper (backwards compatible) +# --------------------------- +analyze_modality <- function( + # End-to-end wrapper for running one omics modality: + # - runs drug-only analysis (drug_mat + drug plots + heatmap) + # - runs modality-only correlations (feat_mat + cor_tbl + summary plot) + # Inputs: + # fits: long drug response table (must include improve_sample_id, improve_drug_id, dose_response_metric, dose_response_value) + # df_long: long omics table for one modality + # sample_col: sample ID column name in df_long (e.g., "Specimen") + # feature_col: feature ID column name in df_long (e.g., "feature_id" or "Gene") + # value_col: numeric value column name in df_long (e.g., "correctedAbundance") + # metric: drug response metric to analyze (default "uM_viability") + # outdir: output directory for plots (default "figs") + # heatmap_filename: filename for drug heatmap PDF; set NULL to skip + # fdr_thresh: FDR cutoff used for correlation summary (default 0.25) + # Output: + # list containing matrices, correlation results, summaries, and (optionally) plot objects: + # drug_mat, feat_mat, shared_ids, cor_tbl, cor_summary, cor_plot, drug_summary + fits, + df_long, + sample_col, # e.g., "Specimen" + feature_col, # e.g., "feature_id" | "Gene" | "site" + value_col, # e.g., "correctedAbundance" + metric = "uM_viability", # Or fit_auc + outdir = "figs", + heatmap_filename = "drug_heatmap_large.pdf", + fdr_thresh = 0.25 +) { + + drug_res <- analyze_drug_response( + fits = fits, + metric = metric, + outdir = outdir, + heatmap_filename = heatmap_filename + ) + + mod_res <- analyze_modality_correlations( + df_long = df_long, + sample_col = sample_col, + feature_col = feature_col, + value_col = value_col, + drug_mat = drug_res$drug_mat, + outdir = outdir, + fdr_thresh = fdr_thresh + ) + + list( + drug_mat = drug_res$drug_mat, + feat_mat = mod_res$feat_mat, + shared_ids = mod_res$shared_ids, + cor_tbl = mod_res$cor_tbl, + cor_summary = mod_res$cor_summary, + cor_plot = mod_res$cor_plot, + drug_summary = drug_res$drug_summary ) } diff --git a/03_leapr_biomarker.R b/03_leapr_biomarker.R index 9658046..32c81e4 100644 --- a/03_leapr_biomarker.R +++ b/03_leapr_biomarker.R @@ -51,12 +51,17 @@ suppressPackageStartupMessages({ # ----------------------------- # Helpers # ----------------------------- -# Long to wide matrix (rows = samples, cols = features) -# df_long columns: -# sample_col : sample IDs (e.g., "Specimen") -# feature_col : feature IDs (e.g., "feature_id", "Gene", "site") depending on input group -# value_col : numeric values (e.g., "correctedAbundance") long_to_matrix <- function(df_long, sample_col, feature_col, value_col) { + # Convert a long-format omics table into a numeric matrix (rows = samples, cols = features). + # Cleans sample/feature IDs, drops NA/blank IDs, pivots to wide, fills missing with 0, + # and averages duplicates (mean) per sample-feature pair. + # Inputs: + # df_long: long omics data.frame + # sample_col: column name in df_long for sample IDs + # feature_col: column name in df_long for feature IDs + # value_col: column name in df_long for numeric values + # Output: + # numeric matrix with rownames = sample IDs and colnames = feature IDs, or NULL if empty if (is.null(df_long) || !nrow(df_long)) return(NULL) df <- df_long |> @@ -115,6 +120,13 @@ long_to_matrix <- function(df_long, sample_col, feature_col, value_col) { # ---- PHOSPHO .extract_gene_from_site <- function(site_id) { + # Extract a gene symbol from a phosphosite/site identifier string. Primarily uses the + # substring before the first '-' (e.g., "AAAS-S495s" -> "AAAS"); falls back to splitting + # on common delimiters if needed. + # Inputs: + # site_id: character scalar (site/feature ID) + # Output: + # character scalar gene symbol (uppercase) or NA if not parseable if (is.na(site_id) || site_id == "") return(NA_character_) x <- as.character(site_id) @@ -134,6 +146,14 @@ long_to_matrix <- function(df_long, sample_col, feature_col, value_col) { # Build phospho site gene map from long table .build_phospho_gene_map_from_long <- function(df_long, feature_col) { + # Build a mapping from phosphosite IDs to gene symbols using columns present in the + # long table (preferred). If no suitable gene column exists, falls back to parsing gene + # symbols from the site IDs themselves. + # Inputs: + # df_long: long omics data.frame + # feature_col: column name in df_long containing phosphosite IDs + # Output: + # named character vector mapping site -> gene, or NULL if no sites found gene_cols <- c("Gene","gene","hgnc_id","hgnc_symbol","protein","Protein","Symbol","symbol") has <- gene_cols[gene_cols %in% colnames(df_long)] if (length(has)) { @@ -157,6 +177,14 @@ long_to_matrix <- function(df_long, sample_col, feature_col, value_col) { .collapse_sites_to_genes <- function(cor_named_vec, map_site2gene, agg = c("mean","maxabs")) { + # Collapse a named vector of site-level correlation values to gene-level values using a + # site->gene mapping. Supports aggregation by mean or by the max absolute correlation per gene. + # Inputs: + # cor_named_vec: named numeric vector (names = site IDs, values = correlations) + # map_site2gene: named character vector mapping site -> gene + # agg: "mean" or "maxabs" (how to aggregate multiple sites per gene) + # Output: + # named numeric vector (names = genes, values = aggregated correlations) agg <- match.arg(agg) if (is.null(map_site2gene) || !length(cor_named_vec)) return(cor_named_vec) @@ -186,6 +214,12 @@ long_to_matrix <- function(df_long, sample_col, feature_col, value_col) { # ---- Normalize phosphosite IDs to match kinasesubstrates (e.g. "AAAS-S495s" -> "AAAS-S495") .normalize_kinase_site_id <- function(x) { + # Normalize phosphosite IDs to better match leapR kinasesubstrates formatting by trimming + # trailing lowercase letters (e.g., "AAAS-S495s" -> "AAAS-S495"). + # Inputs: + # x: character vector of phosphosite IDs + # Output: + # character vector of normalized site IDs x <- as.character(x) x <- trimws(x) # Drop trailing lowercase letters @@ -194,6 +228,13 @@ long_to_matrix <- function(df_long, sample_col, feature_col, value_col) { # Spearman correlations .col_spearman <- function(vec, mat) { + # Compute Spearman correlation between a drug response vector and each feature column in a + # sample × feature matrix. Uses sample ID intersection and pairwise complete observations. + # Inputs: + # vec: named numeric vector of responses (names = sample IDs) + # mat: numeric matrix (rownames = sample IDs, cols = features) + # Output: + # named numeric vector of correlations (one per feature column; NA where not computable) shared <- intersect(names(vec), rownames(mat)) if (length(shared) < 3) return(setNames(rep(NA_real_, ncol(mat)), colnames(mat))) v <- vec[shared] @@ -208,6 +249,16 @@ long_to_matrix <- function(df_long, sample_col, feature_col, value_col) { # Build SummarizedExperiment to feed into leapR .build_se_from_corvec <- function(cor_named_vec, features_all, col_label, map_to_gene = NULL, assay_label = "proteomics") { + # Build a single-column SummarizedExperiment containing correlation scores for a set of features, + # suitable as input to leapR enrichment functions. Optionally stores a mapped gene ID in rowData. + # Inputs: + # cor_named_vec: named numeric vector of scores (names = feature IDs) + # features_all: character vector of features to include (sets row order and rownames) + # col_label: column/sample label to assign in the SE (e.g., "_TOP") + # map_to_gene: optional named vector mapping feature -> gene ID/symbol (stored as hgnc_id) + # assay_label: assay name label to assign (e.g., "proteomics", "phospho", "rna") + # Output: + # SummarizedExperiment with 1 assay column holding the scores v <- rep(NA_real_, length(features_all)); names(v) <- features_all common <- intersect(names(cor_named_vec), features_all) v[common] <- cor_named_vec[common] @@ -224,12 +275,24 @@ long_to_matrix <- function(df_long, sample_col, feature_col, value_col) { } .safe_leapr <- function(...) { + # Run leapR::leapR() safely: catches errors, prints a readable message, and returns NULL + # instead of stopping the whole pipeline. This is used for Debugging. + # Inputs: + # ...: arguments passed directly to leapR::leapR() + # Output: + # leapR result object/table, or NULL on error tryCatch(leapR::leapR(...), error = function(e) { message("[leapR] ", conditionMessage(e)); NULL }) } # Load a leapR built-in geneset by name .load_leapr_geneset_by_name <- function(name) { + # Load a built-in leapR geneset dataset by name (e.g., "kinasesubstrates", "krbpaths"). + # Validates the name and errors if the dataset cannot be loaded. + # Inputs: + # name: character scalar geneset name + # Output: + # geneset object loaded from the leapR package valid <- c("kinasesubstrates", "ncipid", "krbpaths", "longlist", "shortlist") if (!(name %in% valid)) { stop("Unknown geneset name: '", name, "'. Valid: ", paste(valid, collapse = ", ")) @@ -243,6 +306,13 @@ long_to_matrix <- function(df_long, sample_col, feature_col, value_col) { # Decide default geneset from omic label when no override is provided .default_geneset_for_omic <- function(omic_label) { + # Choose a default geneset database based on the omics label: + # - phospho-like labels -> kinasesubstrates + # - otherwise -> krbpaths + # Inputs: + # omic_label: character scalar describing the modality (e.g., "phospho", "rna", "global") + # Output: + # geneset object to use with leapR ol <- tolower(omic_label) if (ol %in% c("phospho","phosphoproteomics","phosphoprotein","phosphoproteome")) { .load_leapr_geneset_by_name("kinasesubstrates") @@ -255,6 +325,26 @@ long_to_matrix <- function(df_long, sample_col, feature_col, value_col) { # Main # ----------------------------- run_leapr_directional_one_cached <- function( + # For each drug, correlate uM_viability with each omics feature across samples, split features + # into TOP (positive; more resistant) and BOTTOM (negative; more sensitive), then run leapR + # enrichment separately on each direction. Supports phospho-specific site->gene handling, + # optional site-normalization for kinasesubstrates, CSV writing, and caching to .RData. + # Inputs: + # drugs: long drug-response data.frame (must include improve_drug_id, improve_sample_id, dose_response_metric, dose_response_value) + # df_long: long omics data.frame (sample/feature/value columns) + # sample_col: column name in df_long for sample IDs + # feature_col: column name in df_long for feature IDs (gene/site) + # value_col: column name in df_long for numeric measurement + # omic_label: modality label used in assay naming and output filenames (e.g., "rna", "global", "phospho") + # cache_path: file path to save/load cached results (.RData); skipped if always_rerun=TRUE + # write_csvs: TRUE/FALSE; write per-drug TOP/BOTTOM leapR tables to CSV + # always_rerun: TRUE/FALSE; ignore cache and recompute + # min_features: minimum features required to run leapR for TOP/BOTTOM + # test_one: TRUE/FALSE; only run the first drug (debug) + # geneset_name: optional built-in leapR geneset name + # geneset_object: optional geneset object to use directly (overrides geneset_name/default) + # Output: + # named list by drug: res_list[[drug]]$top and res_list[[drug]]$bottom (leapR results or NULL) drugs, # Character vector of drugs to test (IDs/names used by your fits/model) df_long, # Long-format omics table (one row per sample x feature) sample_col, # Column name in df_long containing sample IDs @@ -469,6 +559,16 @@ run_leapr_directional_one_cached <- function( # Plot and save using leapR builtin plotter # ----------------------------- save_leapr_plots <- function( + # Save leapR pathway barplots for TOP (resistant) and BOTTOM (sensitive) results for each drug. + # Supports plotting all drugs or a requested subset (case-insensitive matching). + # Inputs: + # res_list: named list returned by run_leapr_directional_one_cached() + # omic_label: modality label used in plot titles and filenames + # top_n: number of top pathways to plot per direction + # drugs: NULL to plot all; otherwise character vector of drug IDs/names to plot (case-insensitive) + # outdir: output directory for PDF files + # Output: + # invisible(NULL); side-effect is writing PDF plots to outdir res_list, omic_label, top_n = 15, diff --git a/README.md b/README.md index 099bb0f..19f9fdf 100644 --- a/README.md +++ b/README.md @@ -5,5 +5,10 @@ This repository manages the code to analyze the results of drug screens in cNF o The data for this code is hosted on Synapse at http://synapse.org/cnfDrugResponse ## Current analysis -We are collecting omics measurements from cNF organoid samples together with drug response data to identify potential biomarkers of drug response. +We are collecting omics measurements from cNF organoid samples together with drug response data to identify potential biomarkers of drug response. +The current analysis workflow (see `analysis/`) is organized as a small set of notebooks that: +- Join batches and (when needed) batch-correct RNA, global proteomics, and phosphoproteomics (via ComBat), and generate basic QC plots (e.g., PCA). +- Summarize drug response behavior across the cohort (most efficacious / most variable drugs + a cohort-wide drug heatmap). +- Correlate drug response with molecular features per modality (Spearman correlations + FDR), producing per-drug summaries of correlated features. +- Perform pathway enrichment on correlated features (direction-aware enrichment with leapR) to interpret putative mechanisms and biomarkers. diff --git a/analysis/01_run_normalize_omics.Rmd b/analysis/01_run_normalize_omics.Rmd index febb395..45c17a6 100644 --- a/analysis/01_run_normalize_omics.Rmd +++ b/analysis/01_run_normalize_omics.Rmd @@ -1,67 +1,50 @@ --- -title: "run_normalize_omics" -author: "JJ" +title: "Normalize Omics" +author: "Jeremy Jacobson" date: "2025-11-04" -output: html_document +output: + html_document: default + pdf_document: default --- -# Get Helper Scripts -```{r setup, include=FALSE} +### Purpose + +This notebook is designed to process multiple batches from transcriptomics, proteomics, and phosphoproteomics data. + +In its simplest form, it will load the data in from synapse and join it together in a single matrix. Additional options include batch correction (needed for global proteomics and phosphoproteomics), batch correction plots (before and after), and uploading the data back to synapse. + +### Get Helper Scripts + +These scripts contain the functions that we will call below. +- 00_cNF_helper_code.R is a basic script that pulls data and sets colors for samples. +- 01_normalize_batchcorrect_omics.R contains all of the code for retrieving batches from synapse, + performing batch correction, plotting, and uploading. + + +```{r initiate, include=FALSE} knitr::opts_chunk$set(echo = TRUE) -library(synapser) +``` +```{r setup, echo=TRUE, results='hide', message=FALSE, warning=FALSE} +library(synapser) syn <- list(get = synapser::synGet, store = synapser::synStore) # Load helper metadata (00_cNF_helper_code.R defines 'meta' and 'pcols') -source("00_cNF_helper_code.R") +source("../00_cNF_helper_code.R") # Source the pipeline -source("01_normalize_batchcorrect_omics.R") +source("../01_normalize_batchcorrect_omics.R") ``` +## Run Batch Correction/Normalization across RNA, Global and Phospho Data. -# Run batch correction / normalization across phospho, global, and rna samples. +#### Clean Data +Here are the IDs of several samples that were used for protocol optimization but not designed as experimental samples. +We want to remove these from the subsequent batch correction steps as they could skew the "real" results. +- Add to this list if more protocol optimization, contaminated samples or anything else we want to remove is present. ```{r} -# --------------------------------------------------------------------------- -# run_modality() — quick reference -# --------------------------------------------------------------------------- -# Inputs -# - modality: "global", "phospho", or "rna" -# - batches : list of batch configs, e.g. -# list( -# list(syn_id="syn...", cohort=1, value_start_col=5, fname_aliquot_index=8), -# list(syn_id="syn...", cohort=2, value_start_col=5, fname_aliquot_index=9) -# ) -# Required per batch: -# * syn_id : Synapse file ID for the wide feature×sample table -# * cohort : cohort/batch label (used for joining + ComBat) -# * fname_aliquot_index: token index (split on "_") used to parse aliquot from sample headers -# Optional per batch: -# * value_start_col: first sample column index (auto-detected if omitted) -# - meta: sample metadata joined by (aliquot, cohort); used to populate Specimen/Patient/Tumor -# - syn: synapser client -# -# Options -# - drop_name_substrings: regex pattern(s) to drop unwanted sample columns -# - out_dir / out_prefix / save_basename: control output file locations/names -# - write_outputs: write CSV/PDF (+upload) vs return in-memory only -# - upload_parent_id: Synapse folder/project ID for uploads -# - pcols: optional named colors for PCA (Patient -> color) -# - do_batch_correct: run ComBat (TRUE) or skip (FALSE) -# -# What it does -# - Normalizes each batch (per modality), combines batches on shared features, -# optionally runs ComBat by cohort, then exports long tables + PCA/hist plots. -# -# Returns -# - SummarizedExperiments: se_batches, se_combined, se_post (and se_corrected if ComBat ran) -# - Long tables: long_pre, long_post -# - PCA data + plots: pca_df_pre/post, plots -# --------------------------------------------------------------------------- - - # Substrings to drop (These were the protocol optimization samples) drop_subs <- c( "cNF_organoid_DIA_G_02_11Feb25", @@ -74,11 +57,79 @@ drop_subs <- c( ``` -# RNA -```{r} +#### The Main Function + +We use `run_modality()` to perform all of the described steps. + +**Brief Summary**: + +- Input: We assign batches to the synapse files using the `batches` vector along with several other descriptive details / params. +- Output: We get out a combined matrix of data that is (optionally) batch corrected. + +**Reference guide**: + +##### Inputs + +- modality: "global", "phospho", or "rna" +- batches : list of batch configs, e.g. + + - list( + list(syn_id="syn...", cohort=1, value_start_col=5, fname_aliquot_index=8), + list(syn_id="syn...", cohort=2, value_start_col=5, fname_aliquot_index=9) + ) + + - Required per batch: + - syn_id : Synapse file ID for the wide feature×sample table + - cohort : cohort/batch label (used for joining + ComBat) + - fname_aliquot_index: token index (split on "_") used to parse aliquot from sample headers + + - Optional per batch: + - value_start_col: first sample column index (auto-detected if omitted) + +- meta: sample metadata joined by (aliquot, cohort); used to populate Specimen/Patient/Tumor +- syn: synapser client + +##### Options + +- drop_name_substrings: regex pattern(s) to drop unwanted sample columns +- out_dir / out_prefix / save_basename: control output file locations/names +- write_outputs: write CSV/PDF (+upload) vs return in-memory only +- upload_parent_id: Synapse folder/project ID for uploads +- pcols: optional named colors for PCA (Patient -> color) +- do_batch_correct: run ComBat (TRUE) or skip (FALSE) + +##### What it does + +- Normalizes each batch (per modality), combines batches on shared features, optionally runs ComBat by cohort, then exports long tables + PCA/hist plots. + +##### Returns + +- SummarizedExperiments: se_batches, se_combined, se_post (and se_corrected if ComBat ran) +- Long tables: long_pre, long_post +- PCA data + plots: pca_df_pre/post, plots + +### RNA Batches + +I am starting with the RNA batches as these are the simplest ones, not actually requiring batch correction (seen in the plot titled "rna samples" after the code runs). + +I'm going to describe the `batches` vector/argument that is required by the `run_modality` script here. + +- A different version of this is needed when more batches are present, omics changes, or the files are modified. +- Each file (even within omics type) may be formatted slightly differently so `value_start_col` and `fname_aliquot_index` may need to be set differently between batches. + +For the rna_batches vector, files are formatted identically so the `value_start_col` are both set to 3 and `fname_aliquot_index` is NULL. + +- **value_start_col**: First column where data begins. +- **fname_aliquot_index**: aliquot index (not used in RNA) +- **cohort**: this variable is used to set the batch number. + +This code is also written to handle different name formats in the headers such as NF0018.T1.organoid, NF0018.T1.tissue, NF0018.skin, NF0018.T2.organoids, NF0018.T2, NF0018.T3.organoids, NF0018.T3. ie: organoid vs organoids, or mixed up capitalization. + + +```{r rna, message=FALSE, warning=FALSE, fig.height=3.5, fig.width=6, out.width="50%"} rna_batches <- list( - list(syn_id = "syn66352931", cohort = 1, value_start_col = 5, fname_aliquot_index = 6), - list(syn_id = "syn70765053", cohort = 2, value_start_col = 5, fname_aliquot_index = 7) + list(syn_id = "syn66352931", cohort = 1, value_start_col = 3, fname_aliquot_index = NULL), + list(syn_id = "syn70765053", cohort = 2, value_start_col = 3, fname_aliquot_index = NULL) ) rna <- run_modality( @@ -91,17 +142,29 @@ rna <- run_modality( out_prefix = "rna", upload_parent_id = "syn71099587", pcols = pcols, - write_outputs = FALSE, + write_outputs = FALSE, # This has already been run so we don't need to write it again unless something changes. save_basename = "RNA_12_no_batch_correct", do_batch_correct = FALSE #Note this is set to false right now. Doesn't appear to be needed for RNA ) ``` +In the plots above, we see that no batch correction was required or performed. -# Global proteomics -```{r} +### Global Proteomic Batches + +Okay, now we are moving on to global proteomics. This is run in the same functional method as the RNA data. We set our batch info, assign modality and other basic info in `run_modality` and then we run it. Again, we don't upload the data (by default atleast). However, we do batch correct here - the reason why can be seen in the before and after plots. + +Here we do use the `fname_aliquot_index` variable in batches. This essentially splits by underscore and takes the Xth value + +Example for exp2 where fname_aliquot_index is 6: + +- `D:\DMS_WorkDir1\cNF_organoid_exp2_DIA_G_**01**_01May25_Romulus_BEHCoA-25-02-16.mzML`: 01 +- `D:\DMS_WorkDir1\cNF_organoid_exp2_DIA_G_**02**_01May25_Romulus_BEHCoA-25-02-16.mzML`: 02 + + +```{r global, message=FALSE, warning=FALSE, fig.height=3.5, fig.width=6, out.width="50%"} global_batches <- list( - list(syn_id = "syn69947355", cohort = 1, value_start_col = 5, fname_aliquot_index = 6), - list(syn_id = "syn69947352", cohort = 2, value_start_col = 5, fname_aliquot_index = 7) + list(syn_id = "syn69947355", cohort = 1, value_start_col = 5, fname_aliquot_index = 5), + list(syn_id = "syn69947352", cohort = 2, value_start_col = 5, fname_aliquot_index = 6) ) global <- run_modality( @@ -114,19 +177,32 @@ global <- run_modality( out_prefix = "global", upload_parent_id = "syn70078365", pcols = pcols, - write_outputs = FALSE, + write_outputs = FALSE, # This has already been run so we don't need to write it again unless something changes. save_basename = "global_batch12_corrected", do_batch_correct = TRUE ) ``` +Above we can see that the batch correction was successfully performed for the global proteomic data. If anything changes or if new batches are added, we can set write_outputs to TRUE and rerun. -#Phospho Proteomics -```{r} +### Phospho Proteomic Batches + +Finally, we run this same process for Phosphoproteomics. Again the run_modality function is used, although the batch info and params are updated. After this code is run, we will see that batch correction was required and that it ran successfully. + +Again we use the `fname_aliquot_index` variable in batches. This essentially splits by underscore and takes the Xth value + +Example for exp1 where fname_aliquot_index is 8: + +- `I:\UserData\LeDay\Piehowski_orgonoids_Feb25\RawData\1338217_cNF_organoid_DIA_P_**04**_29Jan25_Ned_BEHCoA-25-01-02.raw`: 04 +- `I:\UserData\LeDay\Piehowski_orgonoids_Feb25\RawData\1338241_cNF_organoid_DIA_P_**01**_29Jan25_Ned_BEHCoA-25-01-02.raw`: 01 + + + +```{r phospho, message=FALSE, warning=FALSE, fig.height=3.5, fig.width=6, out.width="50%"} phospho_batches <- list( - list(syn_id = "syn69963552", cohort = 1, value_start_col = 5, fname_aliquot_index = 8), - list(syn_id = "syn69947351", cohort = 2, value_start_col = 5, fname_aliquot_index = 9) + list(syn_id = "syn69963552", cohort = 1, value_start_col = 7, fname_aliquot_index = 8), + list(syn_id = "syn69947351", cohort = 2, value_start_col = 7, fname_aliquot_index = 9) ) phospho <- run_modality( @@ -139,7 +215,7 @@ phospho <- run_modality( out_prefix = "phospho", upload_parent_id = "syn70078365", pcols = pcols, - write_outputs = FALSE, + write_outputs = FALSE, # This has already been run so we don't need to write it again unless something changes. save_basename = "phospho_batch12_corrected", do_batch_correct = TRUE ) diff --git a/analysis/01_run_normalize_omics.html b/analysis/01_run_normalize_omics.html new file mode 100644 index 0000000..e49d4f0 --- /dev/null +++ b/analysis/01_run_normalize_omics.html @@ -0,0 +1,659 @@ + + + + + + + + + + + + + + + +Normalize Omics + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
+

Purpose

+

This notebook is designed to process multiple batches from +transcriptomics, proteomics, and phosphoproteomics data.

+

In its simplest form, it will load the data in from synapse and join +it together in a single matrix. Additional options include batch +correction (needed for global proteomics and phosphoproteomics), batch +correction plots (before and after), and uploading the data back to +synapse.

+
+
+

Get Helper Scripts

+

These scripts contain the functions that we will call below. - +00_cNF_helper_code.R is a basic script that pulls data and sets colors +for samples. - 01_normalize_batchcorrect_omics.R contains all of the +code for retrieving batches from synapse, performing batch correction, +plotting, and uploading.

+
library(synapser)
+syn <- list(get = synapser::synGet, store = synapser::synStore)
+
+# Load helper metadata (00_cNF_helper_code.R defines 'meta' and 'pcols')
+source("00_cNF_helper_code.R")
+
+# Source the pipeline
+source("01_normalize_batchcorrect_omics.R")
+
+
+

Run Batch Correction/Normalization across RNA, Global and Phospho +Data.

+
+

Clean Data

+

Here are the IDs of several samples that were used for protocol +optimization but not designed as experimental samples. We want to remove +these from the subsequent batch correction steps as they could skew the +“real” results. - Add to this list if more protocol optimization, +contaminated samples or anything else we want to remove is present.

+
# Substrings to drop (These were the protocol optimization samples)
+drop_subs <- c(
+  "cNF_organoid_DIA_G_02_11Feb25",
+  "cNF_organoid_DIA_G_05_11Feb25",
+  "cNF_organoid_DIA_G_06_11Feb25",
+  "cNF_organoid_DIA_P_02_29Jan25",
+  "cNF_organoid_DIA_P_05_11Feb25",
+  "cNF_organoid_DIA_P_06_11Feb25"
+)
+
+
+

The Main Function

+

We use run_modality() to perform all of the described +steps.

+

Brief Summary:

+
    +
  • Input: We assign batches to the synapse files using the +batches vector along with several other descriptive details +/ params.
  • +
  • Output: We get out a combined matrix of data that is (optionally) +batch corrected.
  • +
+

Reference guide:

+
+
Inputs
+
    +
  • modality: “global”, “phospho”, or “rna”

  • +
  • batches : list of batch configs, e.g.

    +
      +
    • list( list(syn_id=“syn…”, cohort=1, value_start_col=5, +fname_aliquot_index=8), list(syn_id=“syn…”, cohort=2, value_start_col=5, +fname_aliquot_index=9) )

    • +
    • Required per batch:

      +
        +
      • syn_id : Synapse file ID for the wide feature×sample table
      • +
      • cohort : cohort/batch label (used for joining + ComBat)
      • +
      • fname_aliquot_index: token index (split on “_“) used to parse +aliquot from sample headers
      • +
    • +
    • Optional per batch:

      +
        +
      • value_start_col: first sample column index (auto-detected if +omitted)
      • +
    • +
  • +
  • meta: sample metadata joined by (aliquot, cohort); used to +populate Specimen/Patient/Tumor

  • +
  • syn: synapser client

  • +
+
+
+
Options
+
    +
  • drop_name_substrings: regex pattern(s) to drop unwanted sample +columns
  • +
  • out_dir / out_prefix / save_basename: control output file +locations/names
  • +
  • write_outputs: write CSV/PDF (+upload) vs return in-memory only
  • +
  • upload_parent_id: Synapse folder/project ID for uploads
  • +
  • pcols: optional named colors for PCA (Patient -> color)
  • +
  • do_batch_correct: run ComBat (TRUE) or skip (FALSE)
  • +
+
+
+
What it does
+
    +
  • Normalizes each batch (per modality), combines batches on shared +features, optionally runs ComBat by cohort, then exports long tables + +PCA/hist plots.
  • +
+
+
+
Returns
+
    +
  • SummarizedExperiments: se_batches, se_combined, se_post (and +se_corrected if ComBat ran)
  • +
  • Long tables: long_pre, long_post
  • +
  • PCA data + plots: pca_df_pre/post, plots
  • +
+
+
+
+

RNA Batches

+

I am starting with the RNA batches as these are the simplest ones, +not actually requiring batch correction (seen in the plot titled “rna +samples” after the code runs).

+

I’m going to describe the batches vector/argument that +is required by the run_modality script here.

+
    +
  • A different version of this is needed when more batches are present, +omics changes, or the files are modified.
  • +
  • Each file (even within omics type) may be formatted slightly +differently so value_start_col and +fname_aliquot_index may need to be set differently between +batches.
  • +
+

For the rna_batches vector, files are formatted identically so the +value_start_col are both set to 3 and +fname_aliquot_index is NULL.

+
    +
  • value_start_col: First column where data +begins.
  • +
  • fname_aliquot_index: aliquot index (not used in +RNA)
  • +
  • cohort: this variable is used to set the batch +number.
  • +
+

This code is also written to handle different name formats in the +headers such as NF0018.T1.organoid, NF0018.T1.tissue, NF0018.skin, +NF0018.T2.organoids, NF0018.T2, NF0018.T3.organoids, NF0018.T3. ie: +organoid vs organoids, or mixed up capitalization.

+
rna_batches <- list(
+  list(syn_id = "syn66352931", cohort = 1, value_start_col = 3, fname_aliquot_index = NULL),
+  list(syn_id = "syn70765053", cohort = 2, value_start_col = 3, fname_aliquot_index = NULL)
+)
+
+rna <- run_modality(
+  modality = "rna",
+  batches  = rna_batches,
+  meta     = meta,
+  syn      = syn,
+  drop_name_substrings = drop_subs,
+  out_dir          = "rna_test",
+  out_prefix       = "rna",
+  upload_parent_id = "syn71099587",
+  pcols            = pcols,
+  write_outputs    = FALSE,     # This has already been run so we don't need to write it again unless something changes.
+  save_basename    = "RNA_12_no_batch_correct",
+  do_batch_correct = FALSE      #Note this is set to false right now. Doesn't appear to be needed for RNA 
+)
+

+In the plots above, we see that no batch correction was required or +performed.

+
+
+

Global Proteomic Batches

+

Okay, now we are moving on to global proteomics. This is run in the +same functional method as the RNA data. We set our batch info, assign +modality and other basic info in run_modality and then we +run it. Again, we don’t upload the data (by default atleast). However, +we do batch correct here - the reason why can be seen in the before and +after plots.

+

Here we do use the fname_aliquot_index variable in +batches. This essentially splits by underscore and takes the Xth +value

+

Example for exp2 where fname_aliquot_index is 6:

+
    +
  • D:\DMS_WorkDir1\cNF_organoid_exp2_DIA_G_**01**_01May25_Romulus_BEHCoA-25-02-16.mzML: +01
  • +
  • D:\DMS_WorkDir1\cNF_organoid_exp2_DIA_G_**02**_01May25_Romulus_BEHCoA-25-02-16.mzML: +02
  • +
+
global_batches <- list(
+  list(syn_id = "syn69947355", cohort = 1, value_start_col = 5, fname_aliquot_index = 5),
+  list(syn_id = "syn69947352", cohort = 2, value_start_col = 5, fname_aliquot_index = 6)
+)
+
+global <- run_modality(
+  modality = "Global",
+  batches  = global_batches,
+  meta     = meta,
+  syn      = syn,
+  drop_name_substrings = drop_subs,
+  out_dir          = "global_test",
+  out_prefix       = "global",
+  upload_parent_id = "syn70078365",
+  pcols            = pcols,
+  write_outputs    = FALSE,     # This has already been run so we don't need to write it again unless something changes.
+  save_basename    = "global_batch12_corrected",
+  do_batch_correct = TRUE
+)
+

+
## Found 2 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
+

+Above we can see that the batch correction was successfully performed +for the global proteomic data. If anything changes or if new batches are +added, we can set write_outputs to TRUE and rerun.

+
+
+

Phospho Proteomic Batches

+

Finally, we run this same process for Phosphoproteomics. Again the +run_modality function is used, although the batch info and params are +updated. After this code is run, we will see that batch correction was +required and that it ran successfully.

+

Again we use the fname_aliquot_index variable in +batches. This essentially splits by underscore and takes the Xth +value

+

Example for exp1 where fname_aliquot_index is 8:

+
    +
  • I:\UserData\LeDay\Piehowski_orgonoids_Feb25\RawData\1338217_cNF_organoid_DIA_P_**04**_29Jan25_Ned_BEHCoA-25-01-02.raw: +04
  • +
  • I:\UserData\LeDay\Piehowski_orgonoids_Feb25\RawData\1338241_cNF_organoid_DIA_P_**01**_29Jan25_Ned_BEHCoA-25-01-02.raw: +01
  • +
+
phospho_batches <- list(
+  list(syn_id = "syn69963552", cohort = 1, value_start_col = 7, fname_aliquot_index = 8),
+  list(syn_id = "syn69947351", cohort = 2, value_start_col = 7, fname_aliquot_index = 9)
+)
+
+phospho <- run_modality(
+  modality = "Phospho",
+  batches  = phospho_batches,
+  meta     = meta,
+  syn      = syn,
+  drop_name_substrings = drop_subs,
+  out_dir          = "phospho_test",
+  out_prefix       = "phospho",
+  upload_parent_id = "syn70078365",
+  pcols            = pcols,
+  write_outputs    = FALSE, # This has already been run so we don't need to write it again unless something changes.
+  save_basename    = "phospho_batch12_corrected",
+  do_batch_correct = TRUE
+)
+

+
+
+ + + + +
+ + + + + + + + + + + + + + + diff --git a/analysis/02_analyze_modality.Rmd b/analysis/02_analyze_modality.Rmd index 6b4141a..21f777c 100644 --- a/analysis/02_analyze_modality.Rmd +++ b/analysis/02_analyze_modality.Rmd @@ -1,38 +1,91 @@ --- -title: "run_analyze_modality_and_enrichment" -author: "JJ" +title: "Analyze Modality Correlations" +author: "Jeremy Jacobson" date: "2025-11-12" output: html_document --- -```{r setup, include=FALSE} +### Purpose + +This notebook is designed to take the drug viability data and the long-format omics tables (RNA, global, and phospho), +then compute and plot correlations between drug response and molecular features. These long-format omics tables are batch corrected +when necessary and have been returned from the Normalize Omics Notebook. + +The `analyze_modality` will load the inputs from Synapse, build the matrices, save the plots, and return the +correlation results so we can quickly inspect them. This markdown file also uses a `pdf_to_png_if_possible` function to help display the +plots directly in the knitted markdown file. + +### Get Helper Scripts + +These scripts contain the functions that we will call below. + +- 00_cNF_helper_code.R is a basic helper script (Synapse helpers, metadata, and shared utilities). +- 02_analyze_modality_correlations.R contains the code for building matrices, plotting summaries, and computing correlations. + +```{r initiate, include=FALSE} knitr::opts_chunk$set(echo = TRUE) +#Set your directory to the cNFDrugScreening base directory if source is not working +# knitr::opts_knit$set(root.dir = "path_to_cNFDrugScreening") +``` + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) -source("00_cNF_helper_code.R") -source("02_analyze_modality_correlations.R") +source("../00_cNF_helper_code.R") +source("../02_analyze_modality_correlations.R") ``` -# Load in Data -```{r} +## Load in Data -#Drugs -drugs <- readr::read_tsv(synGet("syn69947322")$path) +These are the four main inputs we need. Drug viability plus the three long-format omics tables. -#RNA -rlong <- readr::read_csv(synGet('syn71333780')$path) +```{r load, echo=TRUE, results='hide', message=FALSE, warning=FALSE, cache=TRUE} +# Drugs +drugs <- readr::read_tsv(synGet("syn69947322")$path) -#Global -glong <- readr::read_csv(synGet('syn70078416')$path) +# RNA +rlong <- readr::read_csv(synGet("syn71333780")$path) -#Phospho -plong <- readr::read_csv(synGet('syn70078415')$path) +# Global +glong <- readr::read_csv(synGet("syn70078416")$path) +# Phospho +plong <- readr::read_csv(synGet("syn70078415")$path) ``` - -# RNA -```{r} +## Run correlations per modality + +Below, we run the same workflow three times, once per modality. The only thing that changes is which omics table +we use and which column identifies the feature. + +The RNA step calls `analyze_modality()` (end-to-end) which: +- Builds a sample x drug matrix and a sample x feature matrix. +- Saves drug summary plots to the output directory (most_efficacious, most_variable, and the heatmap). +- Computes Spearman correlations between drug response and molecular features. +- Returns the results as a list so we can inspect them directly in R. + +For global and phospho, we call `analyze_modality_correlations()` only, which: +- Reuses the drug matrix from the RNA run. +- Builds the sample x feature matrix for that modality. +- Computes Spearman correlations between drug response and molecular features. +- Saves the correlation feature summary plot. + +A few plot notes (these get written to `outdir`): +- **most_efficacious.pdf**: drugs with the lowest mean viability across samples. This is basically a "top hits" plot. +- **most_variable.pdf**: drugs with the highest variability across samples. These are the ones that may be more sample-dependent. +- **drug_heatmap_large_viability.pdf**: a large sample-by-drug heatmap, but only for drugs measured in all samples. This is useful for. clustering and spotting outliers. Samples/patients are separated by lines, within each are 1-3 tumors. +- **cor_features_by_drug.pdf**: counts of significant correlated features per drug (split by direction). This is a quick way to see which drugs have real signal in this modality. + +**Note**: `most_efficacious`, `most_variable`, and `drug_heatmap_large_viability` are based on the drug response table, not the omics table. +They get recreated on every end-to-end run, but they are the same plots. Because of that, we run the end-to-end workflow once (RNA), +and then for global/phospho we only run the modality-only correlation step. + +**Note 2:** The end-to-end `analyze_modality` function may take a long time to run (~30-45 min). + +### RNA + +```{r rna, echo=TRUE, results='hide', message=FALSE, warning=FALSE, cache=TRUE} corr_rna <- analyze_modality( fits = drugs, df_long = rlong, @@ -44,35 +97,99 @@ corr_rna <- analyze_modality( outdir = "new_figs" ) +# Copy the shared filenames to RNA-specific names so later modalities don't overwrite what we want to show here +dir.create("new_figs", showWarnings = FALSE, recursive = TRUE) +file.copy("new_figs/most_efficacious.pdf", "new_figs/rna_most_efficacious.pdf", overwrite = TRUE) +file.copy("new_figs/most_variable.pdf", "new_figs/rna_most_variable.pdf", overwrite = TRUE) +file.copy("new_figs/cor_features_by_drug.pdf", "new_figs/rna_cor_features_by_drug.pdf", overwrite = TRUE) ``` + +For RNA, the “features” are the rows we test in the correlation step, and they come directly from the `feature_id` column in the RNA long table (`rlong`). In most cases `feature_id` is a gene ID (or a transcript ID, depending on how the RNA file was generated). The workflow builds a sample × feature matrix from RNA, then correlates every RNA feature against drug viability for each drug. + +If a drug shows a large number of significant RNA correlations (after FDR filtering), that is a strong sign the RNA signal is tracking response for that drug. In other words, the transcriptome is separating the sensitive vs resistant samples for that drug in a consistent way. + +Below are the plots I show right after running RNA. This is the only place I show the drug-only plots, because those are based on the drug response table and are essentially the same no matter which omics table we are using. + +**Positive Correlation** indicates a **higher viability** which means that there is **lower effect** of the drug / **higher resistance** to the drug. + +**Negative Correlation** indicates a **lower viability** which means that there is **greater effect** of the drug / **higher sensitivity** to the drug. + +Metric used: `uM_viability`. + +Files shown for RNA: +- new_figs/rna_most_efficacious.pdf +- new_figs/rna_most_variable.pdf +- new_figs/rna_drug_heatmap_large_viability.pdf +- new_figs/rna_cor_features_by_drug.pdf + + +```{r rna_plots, echo=FALSE, message=FALSE, warning=FALSE, fig.height=6.5, fig.width=10, out.width="95%", cache=TRUE} + +show_plots(c( + "new_figs/rna_most_efficacious.pdf", + "new_figs/rna_most_variable.pdf", + "new_figs/rna_drug_heatmap_large_viability.pdf", + "new_figs/rna_cor_features_by_drug.pdf" +), dpi = 220) +``` -# Global Proteomics -```{r} -corr_global <- analyze_modality( - fits = drugs, +### Global Proteomics + +```{r global, echo=TRUE, results='hide', message=FALSE, warning=FALSE, cache=TRUE} +corr_global <- analyze_modality_correlations( df_long = glong, sample_col = "Specimen", feature_col = "Gene", value_col = "correctedAbundance", - metric = "uM_viability", - heatmap_filename = "global_drug_heatmap_large_viability.pdf", - outdir = "new_figs" + drug_mat = corr_rna$drug_mat, + outdir = "new_figs", + fdr_thresh = 0.25 ) + +# Copy the shared filenames to global-specific names +file.copy("new_figs/cor_features_by_drug.pdf", "new_figs/global_cor_features_by_drug.pdf", overwrite = TRUE) ``` +For global proteomics, the features are proteins. In theory, these may be closer to the phenotype than RNA. -# Phospho Proteomics -```{r} -corr_phospho <- analyze_modality( - fits = drugs, +Global correlation plot + +```{r global_plots, echo=FALSE, message=FALSE, warning=FALSE, fig.height=6.5, fig.width=10, out.width="95%", cache=TRUE} +show_plots(c( + "new_figs/global_cor_features_by_drug.pdf" +), dpi = 220) +``` + +### Phospho Proteomics + +```{r phospho, echo=TRUE, results='hide', message=FALSE, warning=FALSE, cache=TRUE} +corr_phospho <- analyze_modality_correlations( df_long = plong, sample_col = "Specimen", feature_col = "site", value_col = "correctedAbundance", - metric = "uM_viability", - heatmap_filename = "phospho_drug_heatmap_large_viability.pdf", - outdir = "new_figs" + drug_mat = corr_rna$drug_mat, + outdir = "new_figs", + fdr_thresh = 0.25 ) +# Copy the shared filenames to phospho-specific names +file.copy("new_figs/cor_features_by_drug.pdf", "new_figs/phospho_cor_features_by_drug.pdf", overwrite = TRUE) +``` + +For phospho, the features are phosphorylation sites. It is normal for this to be noisier and higher-dimensional, but when a drug shows a cluster of correlated sites, it can be really informative for the mechanism - however, there is just one correlation here with 2 features. + +Here is the phospho correlation plot - for some reason, this one doesn't always load, so I'm adding the table too. + +```{r phospho_plots, echo=FALSE, message=FALSE, warning=FALSE, fig.height=6.5, fig.width=10, out.width="95%", cache=TRUE} +show_plots(c( + "new_figs/phospho_cor_features_by_drug.pdf" +), dpi = 220) +``` + + + +```{r} +corr_phospho$cor_summary ``` diff --git a/analysis/02_analyze_modality.html b/analysis/02_analyze_modality.html new file mode 100644 index 0000000..316d8f5 --- /dev/null +++ b/analysis/02_analyze_modality.html @@ -0,0 +1,591 @@ + + + + + + + + + + + + + + + +Analyze Modality Correlations + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
+

Purpose

+

This notebook is designed to take the drug viability data and the +long-format omics tables (RNA, global, and phospho), then compute and +plot correlations between drug response and molecular features. These +long-format omics tables are batch corrected when necessary and have +been returned from the Normalize Omics Notebook.

+

The analyze_modality will load the inputs from Synapse, +build the matrices, save the plots, and return the correlation results +so we can quickly inspect them. This markdown file also uses a +pdf_to_png_if_possible function to help display the plots +directly in the knitted markdown file.

+
+
+

Get Helper Scripts

+

These scripts contain the functions that we will call below.

+
    +
  • 00_cNF_helper_code.R is a basic helper script (Synapse helpers, +metadata, and shared utilities).
  • +
  • 02_analyze_modality_correlations.R contains the code for building +matrices, plotting summaries, and computing correlations.
  • +
+
+
+

Load in Data

+

These are the four main inputs we need. Drug viability plus the three +long-format omics tables.

+
# Drugs
+drugs <- readr::read_tsv(synGet("syn69947322")$path)
+
+# RNA 
+rlong <- readr::read_csv(synGet("syn71333780")$path)
+
+# Global
+glong <- readr::read_csv(synGet("syn70078416")$path)
+
+# Phospho
+plong <- readr::read_csv(synGet("syn70078415")$path)
+
+
+

Run correlations per modality

+

Below, we run the same workflow three times, once per modality. The +only thing that changes is which omics table we use and which column +identifies the feature.

+

The RNA step calls analyze_modality() (end-to-end) +which:
+- Builds a sample x drug matrix and a sample x feature matrix.
+- Saves drug summary plots to the output directory (most_efficacious, +most_variable, and the heatmap).
+- Computes Spearman correlations between drug response and molecular +features.
+- Returns the results as a list so we can inspect them directly in +R.

+

For global and phospho, we call +analyze_modality_correlations() only, which:
+- Reuses the drug matrix from the RNA run.
+- Builds the sample x feature matrix for that modality.
+- Computes Spearman correlations between drug response and molecular +features.
+- Saves the correlation feature summary plot.

+

A few plot notes (these get written to outdir):
+- most_efficacious.pdf: drugs with the lowest mean +viability across samples. This is basically a “top hits” plot.
+- most_variable.pdf: drugs with the highest variability +across samples. These are the ones that may be more +sample-dependent.
+- drug_heatmap_large_viability.pdf: a large +sample-by-drug heatmap, but only for drugs measured in all samples. This +is useful for. clustering and spotting outliers. Samples/patients are +separated by lines, within each are 1-3 tumors.
+- cor_features_by_drug.pdf: counts of significant +correlated features per drug (split by direction). This is a quick way +to see which drugs have real signal in this modality.

+

Note: most_efficacious, +most_variable, and +drug_heatmap_large_viability are based on the drug response +table, not the omics table.
+They get recreated on every end-to-end run, but they are the same plots. +Because of that, we run the end-to-end workflow once (RNA), and then for +global/phospho we only run the modality-only correlation step.

+

Note 2: The end-to-end analyze_modality +function may take a long time to run (~30-45 min).

+
+

RNA

+
corr_rna <- analyze_modality(
+  fits        = drugs,            
+  df_long     = rlong,           
+  sample_col  = "Specimen",
+  feature_col = "feature_id",
+  value_col   = "correctedAbundance",
+  metric      = "uM_viability",
+  heatmap_filename = "rna_drug_heatmap_large_viability.pdf",
+  outdir = "new_figs"
+)
+
+# Copy the shared filenames to RNA-specific names so later modalities don't overwrite what we want to show here
+dir.create("new_figs", showWarnings = FALSE, recursive = TRUE)
+file.copy("new_figs/most_efficacious.pdf", "new_figs/rna_most_efficacious.pdf", overwrite = TRUE)
+file.copy("new_figs/most_variable.pdf",   "new_figs/rna_most_variable.pdf",   overwrite = TRUE)
+file.copy("new_figs/cor_features_by_drug.pdf", "new_figs/rna_cor_features_by_drug.pdf", overwrite = TRUE)
+

For RNA, the “features” are the rows we test in the correlation step, +and they come directly from the feature_id column in the +RNA long table (rlong). In most cases +feature_id is a gene ID (or a transcript ID, depending on +how the RNA file was generated). The workflow builds a sample × feature +matrix from RNA, then correlates every RNA feature against drug +viability for each drug.

+

If a drug shows a large number of significant RNA correlations (after +FDR filtering), that is a strong sign the RNA signal is tracking +response for that drug. In other words, the transcriptome is separating +the sensitive vs resistant samples for that drug in a consistent +way.

+

Below are the plots I show right after running RNA. This is the only +place I show the drug-only plots, because those are based on the drug +response table and are essentially the same no matter which omics table +we are using.

+

Positive Correlation indicates a higher +viability which means that there is lower +effect of the drug / higher resistance to the +drug.

+

Negative Correlation indicates a lower +viability which means that there is greater +effect of the drug / higher sensitivity to the +drug.

+

Metric used: uM_viability.

+

Files shown for RNA:
+- new_figs/rna_most_efficacious.pdf
+- new_figs/rna_most_variable.pdf
+- new_figs/rna_drug_heatmap_large_viability.pdf
+- new_figs/rna_cor_features_by_drug.pdf

+

+
+
+

Global Proteomics

+
corr_global <- analyze_modality_correlations(
+  df_long     = glong,           
+  sample_col  = "Specimen",
+  feature_col = "Gene",
+  value_col   = "correctedAbundance",
+  drug_mat    = corr_rna$drug_mat,
+  outdir      = "new_figs",
+  fdr_thresh  = 0.25
+)
+
+# Copy the shared filenames to global-specific names
+file.copy("new_figs/cor_features_by_drug.pdf", "new_figs/global_cor_features_by_drug.pdf", overwrite = TRUE)
+

For global proteomics, the features are proteins. In theory, these +may be closer to the phenotype than RNA.

+

Global correlation plot

+

+
+
+

Phospho Proteomics

+
corr_phospho <- analyze_modality_correlations(
+  df_long     = plong,           
+  sample_col  = "Specimen",
+  feature_col = "site",
+  value_col   = "correctedAbundance",
+  drug_mat    = corr_rna$drug_mat,
+  outdir      = "new_figs",
+  fdr_thresh  = 0.25
+)
+
+# Copy the shared filenames to phospho-specific names
+file.copy("new_figs/cor_features_by_drug.pdf", "new_figs/phospho_cor_features_by_drug.pdf", overwrite = TRUE)
+

For phospho, the features are phosphorylation sites. It is normal for +this to be noisier and higher-dimensional, but when a drug shows a +cluster of correlated sites, it can be really informative for the +mechanism - however, there is just one correlation here with 2 +features.

+

Here is the phospho correlation plot - for some reason, this one +doesn’t always load, so I’m adding the table too.

+
## NULL
+
corr_phospho$cor_summary
+
## # A tibble: 5 × 4
+##   drug        direction features meanCor
+##   <chr>       <chr>        <int>   <dbl>
+## 1 Daporinad   pos              1   0.847
+## 2 ML 210      neg              1  -0.853
+## 3 NVP-BEZ235  pos              2   0.864
+## 4 Tofacitinib pos              1   0.868
+## 5 Tovorafenib pos              1   0.891
+
+
+ + + + +
+ + + + + + + + + + + + + + + diff --git a/analysis/03_pathway_enrichment.Rmd b/analysis/03_pathway_enrichment.Rmd index ddf33d4..d866367 100644 --- a/analysis/03_pathway_enrichment.Rmd +++ b/analysis/03_pathway_enrichment.Rmd @@ -1,35 +1,88 @@ --- -title: "run_analyze_modality_and_enrichment" -author: "JJ" +title: "Pathway Enrichment" +author: "Jeremy Jacobson" date: "2025-11-12" output: html_document --- +### Purpose + +This script runs pathway enrichment using leapR, using the drug viability data plus the long-format omics tables (RNA, global, and phospho). +These long-format omics tables are batch corrected when necessary and have been returned from the Normalize Omics Notebook. + +The main idea is: +- Split samples into "sensitive" vs "resistant" for each drug (based on viability). +- Run enrichment on each side. +- Save the enrichment tables and plots so we can quickly scan which pathways show up for which drugs. + +This file is written to be simple and repeatable. It loads the inputs from Synapse, runs the same workflow per modality, and stores results +in a cache file so we do not re-run heavy work unless we need to. + +**Note**: This will take a couple of hours to run the first time. **Do not overwrite the cache** unless completely necessary. + +### Get Helper Scripts + +These scripts contain the functions that we will call below. + +- 00_cNF_helper_code.R is a basic helper script (Synapse helpers, metadata, and shared utilities). +- 03_leapr_biomarker.R contains the leapR wrapper functions, caching, and plotting helpers. + +```{r initiate, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) + +#Set your directory to the cNFDrugScreening base directory if source is not working +# knitr::opts_knit$set(root.dir = "path_to_cNFDrugScreening") +``` + ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) -source("00_cNF_helper_code.R") -source("03_leapr_biomarker.R") +source("../00_cNF_helper_code.R") +source("../03_leapr_biomarker.R") ``` -# Load in Data -```{r} -#Drugs -drugs <- readr::read_tsv(synGet("syn69947322")$path) +## Load in Data -#RNA -rlong <- readr::read_csv(synGet('syn71333780')$path) +These are the four main inputs we need - just like in analyze_modality. Drug viability plus the three long-format omics tables. + +```{r, echo=TRUE, results='hide', message=FALSE, warning=FALSE, cache=TRUE} +# Drugs +drugs <- readr::read_tsv(synGet("syn69947322")$path) -#Global -glong <- readr::read_csv(synGet('syn70078416')$path) +# RNA +rlong <- readr::read_csv(synGet("syn71333780")$path) -#Phospho -plong <- readr::read_csv(synGet('syn70078415')$path) +# Global +glong <- readr::read_csv(synGet("syn70078416")$path) +# Phospho +plong <- readr::read_csv(synGet("syn70078415")$path) ``` -# RNA -```{r warning=FALSE} +## Run Pathway Enrichment for Each Omics Modality + +Below, we run the same workflow three times, once per modality. The main things that change are: +- which omics table we use +- which column represents the feature +- which geneset library we test (krbpaths vs kinasesubstrates) + +Each call to `run_leapr_directional_one_cached()` does a few things: +- Splits samples into "sensitive" vs "resistant" for each drug (based on viability). +- Runs enrichment for each drug and direction. +- Writes CSV outputs (if `write_csvs = TRUE`). +- Saves an .Rdata cache so future runs are fast unless inputs or settings change. + +One note about plots: `run_leapr_directional_one_cached()` computes the enrichment results, but it does not automatically save the pathway barplots. +Those are generated by `save_leapr_plots()`. In this Rmd, I intentionally save plots for just **mirda** and **olaparib** so we do not generate hundreds of drug PDFs by accident. + +**So, If you want more plots, just add more drugs to the `drugs = c(...)` list for each modality.** + +### RNA (krbpaths). + +For RNA, the features come from the `feature_id` column in the RNA long table (`rlong`). In most cases, this is a gene or transcript ID. +Here we use the `krbpaths` gene set collection. + +```{r rna, echo=TRUE, results='hide', message=FALSE, warning=FALSE, cache=TRUE} res_bio_rna <- run_leapr_directional_one_cached( drugs = drugs, df_long = rlong, @@ -37,21 +90,41 @@ res_bio_rna <- run_leapr_directional_one_cached( feature_col = "feature_id", value_col = "correctedAbundance", omic_label = "rna", - cache_path = "leapR_RNA_krbpaths_enrichment_direction_split_test.Rdata", + cache_path = "../leapR_RNA_krbpaths_enrichment_direction_split.Rdata", write_csvs = TRUE, always_rerun = FALSE, test_one = FALSE, geneset_name = "krbpaths" ) -# Save plots to figs/pathways___{resistant|sensitive}_{top_n}.pdf -# save_leapr_plots(res_bio_rna, omic_label = "rna", drugs = c("Mirdametinib", "olaparib"), top_n = 15) +# This is where pathway plot PDFs get created. +# I keep the list short on purpose so the report stays readable. +save_leapr_plots( + res_bio_rna, + omic_label = "rna", + drugs = c("Mirdametinib", "olaparib"), + top_n = 15 +) +``` + +Below are the RNA pathway plots created by `save_leapr_plots()`. These files get written to `figs/`. + +```{r rna_plots, echo=FALSE, message=FALSE, warning=FALSE, fig.height=6.5, fig.width=10, out.width="50%", cache=TRUE} +rna_plot_paths <- c( + "figs/pathways_Mirdametinib_rna_resistant_top15.pdf", + "figs/pathways_Mirdametinib_rna_sensitive_top15.pdf", + "figs/pathways_olaparib_rna_resistant_top15.pdf", + "figs/pathways_olaparib_rna_sensitive_top15.pdf" +) +show_plots(rna_plot_paths, dpi = 240) ``` -# Global Proteomics -```{r warning=FALSE} +### Global Proteomics (krbpaths) +For global proteomics, the features are proteins in the `Gene` column. We keep the same `krbpaths` gene set collection so RNA and global are easier to compare. + +```{r global, echo=TRUE, results='hide', message=FALSE, warning=FALSE, cache=TRUE} res_bio_global <- run_leapr_directional_one_cached( drugs = drugs, df_long = glong, @@ -59,19 +132,40 @@ res_bio_global <- run_leapr_directional_one_cached( feature_col = "Gene", value_col = "correctedAbundance", omic_label = "global", - cache_path = "leapR_Global_krbpaths_enrichment_direction_split_test.Rdata", + cache_path = "../leapR_Global_krbpaths_enrichment_direction_split.Rdata", write_csvs = TRUE, always_rerun = FALSE, test_one = FALSE, geneset_name = "krbpaths" ) -# Save plots to figs/pathways___{resistant|sensitive}_{top_n}.pdf -# save_leapr_plots(res_bio_global, omic_label = "global", drugs = c("Mirdametinib", "olaparib"), top_n = 15) +save_leapr_plots( + res_bio_global, + omic_label = "global", + drugs = c("Mirdametinib", "olaparib"), + top_n = 15 +) +``` + +Here are the global pathway plots. Same idea as RNA, just using protein abundance instead of RNA abundance. + +```{r global_plots, echo=FALSE, message=FALSE, warning=FALSE, fig.height=6.5, fig.width=10, out.width="50%", cache=TRUE} +global_plot_paths <- c( + "figs/pathways_Mirdametinib_global_resistant_top15.pdf", + "figs/pathways_Mirdametinib_global_sensitive_top15.pdf", + "figs/pathways_olaparib_global_resistant_top15.pdf", + "figs/pathways_olaparib_global_sensitive_top15.pdf" +) + +show_plots(global_plot_paths, dpi = 240) ``` -# Phospho Proteomics -```{r warning=FALSE} +### Phospho Proteomics (kinasesubstrates) + +For phospho, the features are phosphorylation sites in the `site` column. Here we use a phospho-specific gene set library, `kinasesubstrates`, +because site-level maps more cleanly to kinase activity than to general pathway collections. + +```{r phospho, echo=TRUE, results='hide', message=FALSE, warning=FALSE, cache=TRUE} res_bio_phospho <- run_leapr_directional_one_cached( drugs = drugs, df_long = plong, @@ -79,13 +173,38 @@ res_bio_phospho <- run_leapr_directional_one_cached( feature_col = "site", value_col = "correctedAbundance", omic_label = "phospho", - cache_path = "leapR_Phospho_kinasesubstrates_enrichment_direction_split_test.Rdata", + cache_path = "../leapR_Phospho_kinasesubstrates_enrichment_direction_split.Rdata", write_csvs = TRUE, always_rerun = FALSE, test_one = FALSE, geneset_name = "kinasesubstrates" ) -# Save plots to figs/pathways___{resistant|sensitive}_{top_n}.pdf -# save_leapr_plots(res_bio_phospho, omic_label = "phospho", drugs = c("Mirdametinib", "olaparib"), top_n = 15) +save_leapr_plots( + res_bio_phospho, + omic_label = "phospho", + drugs = c("Mirdametinib", "olaparib"), + top_n = 15 +) +``` + +Here are the phospho pathway plots + +```{r phospho_plots, echo=FALSE, message=FALSE, warning=FALSE, fig.height=6.5, fig.width=10, out.width="50%", cache=TRUE} +phospho_plot_paths <- c( + "figs/pathways_Mirdametinib_phospho_resistant_top15.pdf", + "figs/pathways_Mirdametinib_phospho_sensitive_top15.pdf", + "figs/pathways_olaparib_phospho_resistant_top15.pdf", + "figs/pathways_olaparib_phospho_sensitive_top15.pdf" +) + +show_plots(phospho_plot_paths, dpi = 240) ``` + +## Notes + +- If you want to force a full rerun (ignore cache), set `always_rerun = TRUE`. +- If you only want to test quickly on one drug, set `test_one = TRUE` (useful for debugging). +- If you want plots for more drugs, add them to the `drugs = c(...)` list in each `save_leapr_plots()` call. +- All pathway plot files are saved under `figs/` with names like: + - figs/pathways___{resistant|sensitive}_top.pdf. diff --git a/analysis/03_pathway_enrichment.html b/analysis/03_pathway_enrichment.html new file mode 100644 index 0000000..82e45f6 --- /dev/null +++ b/analysis/03_pathway_enrichment.html @@ -0,0 +1,590 @@ + + + + + + + + + + + + + + + +Pathway Enrichment + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
+

Purpose

+

This script runs pathway enrichment using leapR, using the drug +viability data plus the long-format omics tables (RNA, global, and +phospho).
+These long-format omics tables are batch corrected when necessary and +have been returned from the Normalize Omics Notebook.

+

The main idea is:
+- Split samples into “sensitive” vs “resistant” for each drug (based on +viability).
+- Run enrichment on each side.
+- Save the enrichment tables and plots so we can quickly scan which +pathways show up for which drugs.

+

This file is written to be simple and repeatable. It loads the inputs +from Synapse, runs the same workflow per modality, and stores results in +a cache file so we do not re-run heavy work unless we need to.

+

Note: This will take a couple of hours to run the +first time. Do not overwrite the cache unless +completely necessary.

+
+
+

Get Helper Scripts

+

These scripts contain the functions that we will call below.

+
    +
  • 00_cNF_helper_code.R is a basic helper script (Synapse helpers, +metadata, and shared utilities).
    +
  • +
  • 03_leapr_biomarker.R contains the leapR wrapper functions, caching, +and plotting helpers.
  • +
+
+
+

Load in Data

+

These are the four main inputs we need - just like in +analyze_modality. Drug viability plus the three long-format omics +tables.

+
# Drugs
+drugs <- readr::read_tsv(synGet("syn69947322")$path)
+
+# RNA 
+rlong <- readr::read_csv(synGet("syn71333780")$path)
+
+# Global
+glong <- readr::read_csv(synGet("syn70078416")$path)
+
+# Phospho
+plong <- readr::read_csv(synGet("syn70078415")$path)
+
+
+

Run Pathway Enrichment for Each Omics Modality

+

Below, we run the same workflow three times, once per modality. The +main things that change are:
+- which omics table we use - which column represents the feature - which +geneset library we test (krbpaths vs kinasesubstrates)

+

Each call to run_leapr_directional_one_cached() does a +few things:
+- Splits samples into “sensitive” vs “resistant” for each drug (based on +viability).
+- Runs enrichment for each drug and direction.
+- Writes CSV outputs (if write_csvs = TRUE).
+- Saves an .Rdata cache so future runs are fast unless inputs or +settings change.

+

One note about plots: run_leapr_directional_one_cached() +computes the enrichment results, but it does not automatically save the +pathway barplots.
+Those are generated by save_leapr_plots(). In this Rmd, I +intentionally save plots for just mirda and +olaparib so we do not generate hundreds of drug PDFs by +accident.

+

So, If you want more plots, just add more drugs to the +drugs = c(...) list for each modality.

+
+

RNA (krbpaths).

+

For RNA, the features come from the feature_id column in +the RNA long table (rlong). In most cases, this is a gene +or transcript ID. Here we use the krbpaths gene set +collection.

+
res_bio_rna <- run_leapr_directional_one_cached(
+  drugs        = drugs,
+  df_long      = rlong,
+  sample_col   = "Specimen",
+  feature_col  = "feature_id",
+  value_col    = "correctedAbundance",
+  omic_label   = "rna",
+  cache_path   = "leapR_RNA_krbpaths_enrichment_direction_split.Rdata",
+  write_csvs   = TRUE,
+  always_rerun = FALSE,
+  test_one     = FALSE,
+  geneset_name = "krbpaths"
+)
+
+# This is where pathway plot PDFs get created.
+# I keep the list short on purpose so the report stays readable.
+save_leapr_plots(
+  res_bio_rna,
+  omic_label = "rna",
+  drugs = c("Mirdametinib", "olaparib"),
+  top_n = 15
+)
+

Below are the RNA pathway plots created by +save_leapr_plots(). These files get written to +figs/.

+

+
+
+

Global Proteomics (krbpaths)

+

For global proteomics, the features are proteins in the +Gene column. We keep the same krbpaths gene +set collection so RNA and global are easier to compare.

+
res_bio_global <- run_leapr_directional_one_cached(
+  drugs        = drugs,
+  df_long      = glong,
+  sample_col   = "Specimen",
+  feature_col  = "Gene",
+  value_col    = "correctedAbundance",
+  omic_label   = "global",
+  cache_path   = "leapR_Global_krbpaths_enrichment_direction_split.Rdata",
+  write_csvs   = TRUE,
+  always_rerun = FALSE,
+  test_one     = FALSE,
+  geneset_name = "krbpaths"
+)
+
+save_leapr_plots(
+  res_bio_global,
+  omic_label = "global",
+  drugs = c("Mirdametinib", "olaparib"),
+  top_n = 15
+)
+

Here are the global pathway plots. Same idea as RNA, just using +protein abundance instead of RNA abundance.

+

+
+
+

Phospho Proteomics (kinasesubstrates)

+

For phospho, the features are phosphorylation sites in the +site column. Here we use a phospho-specific gene set +library, kinasesubstrates, because site-level maps more +cleanly to kinase activity than to general pathway collections.

+
res_bio_phospho <- run_leapr_directional_one_cached(
+  drugs        = drugs,
+  df_long      = plong,
+  sample_col   = "Specimen",
+  feature_col  = "site",
+  value_col    = "correctedAbundance",
+  omic_label   = "phospho",
+  cache_path   = "leapR_Phospho_kinasesubstrates_enrichment_direction_split.Rdata",
+  write_csvs   = TRUE,
+  always_rerun = FALSE,
+  test_one     = FALSE,
+  geneset_name = "kinasesubstrates"
+)
+
+save_leapr_plots(
+  res_bio_phospho,
+  omic_label = "phospho",
+  drugs = c("Mirdametinib", "olaparib"),
+  top_n = 15
+)
+

Here are the phospho pathway plots

+

+
+
+
+

Notes

+
    +
  • If you want to force a full rerun (ignore cache), set +always_rerun = TRUE.
    +
  • +
  • If you only want to test quickly on one drug, set +test_one = TRUE (useful for debugging).
    +
  • +
  • If you want plots for more drugs, add them to the +drugs = c(...) list in each save_leapr_plots() +call.
    +
  • +
  • All pathway plot files are saved under figs/ with names +like: +
      +
    • figs/pathways_{resistant|sensitive}_top.pdf.
    • +
  • +
+
+ + + + +
+ + + + + + + + + + + + + + + diff --git a/analysis/README.md b/analysis/README.md index 6d6edf3..8b6c123 100644 --- a/analysis/README.md +++ b/analysis/README.md @@ -1,504 +1,40 @@ - - - # cNF multi-omics analysis pipeline overview -Purpose: Reference for running the cNF analysis end-to-end and understanding what each stage produces. - -- Run order: Lists the recommended notebook/script sequence and what each file sources (dependencies). - -- Data processing: Summarizes cohort-wise preprocessing, normalization, and ComBat batch correction for RNA, global proteomics and phosphoproteomics. - -- Exploratory outputs: Describes the PCA figures generated from batch-corrected data (global + phospho). - -- Biomarker discovery: Outlines how drug response matrices and omics feature matrices are constructed, filtered to shared samples, and correlated. - -- Pathway interpretation: Summarizes direction-aware enrichment (leapR / GSEA) and the key plots produced across drugs, modalities, and metrics. - +Purpose: Reference for running the cNF analysis end-to-end and what each stage produces. +The notebooks (`.Rmd`) are the primary analysis entry points; the `.R` scripts are sourced helpers. +The `.html` files are the knitted R markdown files from the latest run. + ## Quick run order (and what each file sources) 1) **01_run_normalize_omics.Rmd** - - *Sources:* `cNF_helper_code.R`, `01_normalize_batchcorrect_omics.R` - - Runs per-modality normalization and ComBat; writes long tables/plots if enabled. + - *Sources:* `../00_cNF_helper_code.R`, `../01_normalize_batchcorrect_omics.R` + - *Goal:* Per-modality preprocessing/normalization and batch correction (when needed). + - *Outputs (examples):* + - Batch-corrected long tables written to synapse. + - PCA/QC plots for before and after ComBat batch correction (e.g., `globalCorrectedPCA.pdf`, `phosphoCorrectedPCA.pdf`) 2) **02_analyze_modality.Rmd** - - *Sources:* `cNF_helper_code.R`, `02_analyze_modality_correlations.R` - - Builds drug/feature matrices, modality correlations. + - *Sources:* `../00_cNF_helper_code.R`, `../02_analyze_modality_correlations.R` + - *Goal:* Builds the drug response matrix once, then runs per-modality correlations using the long-format omics tables. + - *Outputs (written to `outdir`, examples):* + - Drug-only plots (written once, based on drug response table): + - `most_efficacious.pdf` + - `most_variable.pdf` + - `drug_heatmap_large_viability.pdf` (heatmap of drugs measured in all samples) + - Modality correlation summary plots (written for each omics type): + - `_cor_features_by_drug.pdf` + - *Notes:* + - RNA typically runs the end-to-end wrapper (`analyze_modality()`), producing drug-only + modality outputs. + - Global/phospho typically run modality-only (`analyze_modality_correlations()`), reusing the RNA-derived `drug_mat`. 3) **03_pathway_enrichment.Rmd** - - *Sources:* `cNF_helper_code.R`, `03_leapr_biomarker.R` - - LeapR pathway enrichment and plots for all. - - -## Normalized Global / Phosphoproteomics Data for Clustering via PCA - -The datasets analyzed in this study included both global proteomics and phosphoproteomics -measurements collected from two experimental cohorts of cNF (cutaneous neurofibroma) organoid -samples. Each cohort comprised multiple tumor specimens derived from different patients. - -For each dataset (global proteins and phosphosites), raw intensity values were preprocessed to -ensure comparability across cohorts: - -- **Missing values and zeros:** Zero measurements were replaced with missing values (NA) to prevent skew during normalization. -- **Feature filtering:** Proteins or phosphosites absent in more than 50% of samples were removed to reduce noise and improve robustness. -- **Normalization:** Remaining values were log2-transformed with a small offset (to prevent log(0)/undefined) and scaled using a modified z-score (median centering and median absolute deviation scaling). This approach places samples on a common scale while down-weighting outliers. -- **Cohort separation:** Each cohort was processed independently to account for technical differences, then merged into a single combined dataset. -- **Batch correction:** To address systematic differences between cohorts, ComBat (sva R package) was used, producing batch-adjusted abundance matrices for both global proteomics and phosphoproteomics. - -The resulting normalized and corrected data provided a harmonized view of protein and phosphosite -abundance across all patients and cohorts, enabling joint exploratory analysis such as PCA (Figures: -phosphoCorrectedPCA.pdf and globalCorrectedPCA.pdf). - -#### Figure: `phosphoCorrectedPCA.pdf` - -This figure shows the results of a principal component analysis (PCA) performed on the -phosphoproteomics data after normalization and batch correction using ComBat. Prior to correction, -samples clustered primarily by cohort, reflecting a strong batch effect. After adjustment, the major -sources of variation correspond more closely to biological factors, such as patient identity and -tumor replicate. Each point represents an individual sample, colored by patient and shaped by tumor -designation, enabling visualization of patient-specific clustering patterns. The correction -substantially reduced separation by cohort, suggesting that technical batch effects were effectively -mitigated and that the remaining signal more likely reflects underlying biological differences in -phosphoproteome profiles. - -#### Figure: `globalCorrectedPCA.pdf` - -This figure presents a PCA of the global proteomics dataset following the same normalization and -ComBat batch correction procedure. Similar to the phosphoproteomics analysis, initial clustering was -dominated by cohort effects; however, correction aligned the data so that patient identity, rather -than cohort, explained the primary axes of variation. Each sample is again colored by patient and -shaped by tumor replicate, allowing assessment of within-patient reproducibility and between-patient -divergence. The improved alignment indicates that batch effects were successfully minimized, and the -corrected data provide a more reliable basis for downstream analyses of patient- and tumor-specific -proteomic signatures. - -### Biomarker Evaluation using Batch-Corrected Global / Phosphoproteomics Data - -For this biomarker evaluation analysis, we brought together four different data types collected on -the same patient-derived samples: - -- **Drug sensitivity measurements:** single-dose viability values (and some full dose–response curves) across a large panel of compounds. For drugs with multiple doses enabling curve fitting, dose-response fit_auc (area under the fitted viability curve; lower AUC = higher sensitivity) was also analyzed. -- **RNA sequencing:** transcript abundance data (to be harmonized into long format in subsequent steps). -- **Global proteomics:** normalized protein abundance values, batch-corrected across cohorts. -- **Phosphoproteomics:** normalized and batch-corrected site-specific phosphorylation abundances. - -To enable cross-modality analyses, all datasets were filtered to retain only those specimens present -in both the drug response data and at least one molecular modality. This produced a shared set of -samples suitable for correlation and biomarker discovery analyses. - -- **Proteomics:** Both global and phosphoproteomic data were restructured into sample-by-feature matrices, where rows represent specimens and columns represent either proteins (global) or phosphosites (phospho). Abundance values correspond to batch-corrected, log-scaled measurements. -- **Drug sensitivity:** Viability data (expressed in relative units of surviving fraction at given doses) were reformatted into a specimen-by-drug matrix, with average values across replicates used where multiple measurements existed. Separately, a fit_auc specimen×drug matrix was constructed by averaging per-drug fit_auc across replicates, retaining only drugs present in syn69947322. -- **Filtering:** Only drugs with complete measurements across all available specimens were retained for certain analyses, yielding a consistent comparison set ("full drugs"). For fit_auc, additional coverage summaries (mean, SD, n specimens per drug) were calculated to support ranking and visualization. -- **Drug Count:** 238 - -
-Drug list - -- Abemaciclib -- ABT-737 -- Adagrasib -- Adavosertib -- Afatinib -- Alectinib -- Alisertib -- Alpelisib -- Alvespimycin -- Alvocidib -- Anlotinib -- Apitolisib -- ARS-1620 -- Avapritinib -- AVL-292 -- Avutometinib -- Axitinib -- Barasertib -- Batoprotafib -- Belinostat -- Belzutifan -- BI-3406 -- BI-847325 -- BI-D1870 -- Bimiralisib -- Binimetinib -- Birinapant -- BLU9931 -- BMS-536924 -- Bosutinib -- Brigatinib -- Brivanib -- Brukinsa -- Brustaol -- Cabozantinib -- Calquence -- Capivasertib -- Capmatinib -- CBL0137 -- CCT-018159 -- Cedazuridine -- Ceralasertib -- Cerdulatinib -- Ceritinib -- CFT8634 -- Cobimetinib -- Copanlisib -- CPI-613 -- Crenolanib -- Crizotinib -- Dabrafenib -- Dacomitinib -- Danusertib -- Daporinad -- Daprodustat -- Dasatinib -- Defactinib -- Derazantinib -- Digoxin -- Dinaciclib -- Dovitinib -- Doxorubicin -- Duvelisib -- Eganelisib -- Elimusertib -- Enasidenib -- Encorafenib -- Enitociclib -- Ensartinib -- Entinostat -- Entrectinib -- Enzastaurin -- Everolimus -- Fadraciclib -- Famotidine -- Fedratinib -- Fimepinostat -- Fisogatinib -- FRAX597 -- Futibatinib -- Galunisertib -- Ganetespib -- Gefitinib -- Geldanamycin -- Gilteritinib -- Glasdegib -- GNE-617 -- GSK2256098 -- H 89 2HCl -- Ibrutinib -- Idasanutlin -- IKK-16 -- Imatinib -- INCB188053 -- INCB191856 -- Infigratinib -- Ipatasertib -- Ivosidenib -- JNJ-7706621 -- Ketotifen -- KO-947 -- KW-2478 -- Lapatinib -- Larotrectinib -- Lenvatinib -- Lestaurtinib -- Letrozole -- Lifirafenib -- Linsitinib -- Lorlatinib -- Losartan -- Losmapimod -- Lovastatin -- Luminespib -- LY231514 -- LY3023414 -- Manumycin A -- Merestinib -- Metformin -- Methotrexate -- Midostaurin -- Mirdametinib -- MK-2206 -- ML 210 -- MLN2480 -- Molibresib -- Napabucasin -- Navitoclax -- Nedisertib -- Nedometinib -- Neratinib -- Nilotinib -- Nintedanib -- Nirogacestat -- NT219 -- NVP-BEZ235 -- Octreotide -- Olaparib -- Olutasidenib -- Omipalisib -- Onalespib -- Onametostat -- Onvansertib -- Osimertinib -- Pacritinib -- Palbociclib -- Pan-RAS-IN-1 -- Panobinostat -- Pazopanib -- PCNA-I1 -- Pelabresib -- Pemigatinib -- Pemrametostat -- PF-562271 -- Pimozide -- Pirtobrutinib -- PLX-3397 -- Ponatinib -- Pralsetinib -- Prexasertib -- PT2385 -- Quizartinib -- Ranitidine -- Rapamycin -- Ravoxertinib -- Regorafenib -- Repotrectinib -- Retaspimycin -- Ribociclib -- Ripretinib -- RMC-6236 -- RO4929097 -- Rogaratinib -- Romidepsin -- Roxadustat -- Rucaparib -- Ruxolitinib -- Sapanisertib -- SAR405838 -- Seclidemstat -- Selinexor -- Selpercatinib -- Selumetinib -- Sertraline -- Siremadlin -- Sitravatinib -- SNX-2112 -- SNX-5422 -- Sonidegib -- Sorafenib -- Sotatercept -- Sotorasib -- Subasumstat -- Sulfasalazine -- Sunitinib -- Surufatinib -- SW106065 -- Tadalafil -- TAE226 -- TAK-243 -- Tanespimycin -- Tazemetostat -- TED-347 -- Tegavivint -- Telaglenastat -- Temozolomide -- Temsirolimus -- Tepotinib -- THZ1 -- Ticlopidine -- Tipifarnib -- Tivantinib -- Tivozanib -- TK216 -- Tofacitinib -- Tomivosertib -- Tovorafenib -- Tozasertib -- Trametinib -- Tretinoin -- Trilaciclib -- Triptolide -- Ulixertinib -- Umbralisib -- UNC2025 -- Unesbulin -- Vactosertib -- Vandetanib -- Vemurafenib -- Venetoclax -- Vismodegib -- Vistusertib -- Vociprotafib -- Volasertib -- Vorinostat -- VS-6766 -- Y-27632 -- Zanzalintinib -
- -### Exploratory Analysis of Drug Responses - -We performed initial exploration of the drug dataset to understand variability and efficacy across -the compound panel. - -### Correlation of molecular features with drug response - -Spearman correlations were computed between every drug (response profile) and every molecular -feature (protein or phosphosite). - -Significance values were estimated via permutation-based correlation tests, with multiple testing -correction applied (FDR). - -Features were separated into positive correlations (higher abundance/phosphorylation associated with -higher viability, i.e. resistance) and negative correlations (higher abundance/phosphorylation -associated with lower viability, i.e. sensitivity). - -#### Figure: `most_efficacious_.png/pdf` - -This scatterplot highlights the subset of drugs that were most efficacious across the patient -samples, defined as compounds with an average cell viability below 0.5 (i.e., less than 50% -survival). Each point represents a drug, positioned by its mean viability (y-axis) and labeled along -the x-axis. Point size reflects the variability in response across specimens, while point color -indicates the number of samples tested. Several compounds demonstrate both strong overall activity -and consistent performance across patients, suggesting broad-spectrum effectiveness. - -#### Figure: `most_variable_.png/pdf` - -This scatterplot highlights drugs with the highest variability in response across specimens, defined -as those with a standard deviation greater than 0.15. Here, each point again represents a drug, with -mean viability on the y-axis and drug identity on the x-axis. Point size encodes variability, and -color denotes the number of samples measured. Unlike the most efficacious plot, these compounds are -not necessarily the most potent but instead show strong heterogeneity between patients. Such drugs -may provide the greatest opportunity for biomarker discovery, as differences in response are more -likely to be explained by underlying molecular features. - -#### Figure: `_drug_heatmap_large_viability.pdf` - -This heatmap displays drug response values (viability) for the subset of drugs measured consistently -across all specimens ("full drugs"). Rows correspond to patient samples and columns to drugs. -Hierarchical clustering of both rows and columns highlights patterns of similarity, revealing groups -of patients with shared sensitivity profiles as well as clusters of compounds with correlated -activity. The visualization provides a global overview of drug response heterogeneity across the -cohort, serving as a baseline reference for linking molecular features to therapeutic sensitivity. - -#### Figure: `_cor_features_by_drug.pdf` - -Bar charts summarizing, for each drug, the number of molecular features (proteins or phosphosites) -that correlate with response at FDR < 0.25, separated by direction (features associated with -resistance vs sensitivity) and faceted by data modality. These counts provide a quick sense of which -drugs show the richest correlational signal. - -**Additional fit_auc figures:** -- Figure: most_sensitive_auc.pdf — scatter plot of drugs ranked by mean fit_auc (y-axis); point size = SD of fit_auc across specimens; color = number of specimens. -- Figure: most_variable_auc.pdf — scatter plot of drugs ranked by SD of fit_auc (y-axis); point size = number of specimens. - -### Functional enrichment of correlated features - -Using the leapR package, correlated features were ranked and tested for enrichment against gene set -collections.. - -Enrichment was conducted in a direction-aware manner: - -"Top" (positively correlated features) highlight pathways enriched among resistance-associated -features. - -"Bottom" (negatively correlated features, flipped in rank) highlight pathways enriched among -sensitivity-associated features. - -Results were summarized at multiple levels: - -- **Pathways per drug:** Number of significant pathways identified for each drug, separated by resistant vs sensitive associations. -- **Pathways across drugs:** Top 15 most recurrent pathways across the full drug panel, faceted by proteome vs phosphoproteome. -- **Drug-specific profiles:** Bar charts of the top 15 pathways per drug and modality, annotated with significance stars (* <0.05, ** <0.01, *** <0.001) - -#### Figure: `pathways_across_drugs_top15.pdf` - -Pathway-level summary of enrichment results across the full drug panel. Bars show the top 15 -pathways (per modality) most frequently enriched among significant feature–response correlations -(FDR < 0.05), with direction indicating whether pathways are associated with relative resistance -(features positively correlated with viability) or sensitivity (features negatively correlated with -viability). This highlights recurrent biological programs linked to drug response. - -Generalized figures: - -Generalized pathway enrichment figures were generated for the two most efficacious drugs, the two -most variable drugs, and Onalespib. - -#### Figure: `pathways_____top15.pdf` - -Each pathway figure (pathways_____top15.pdf) displays the top -15 enriched pathways whose molecular features are most strongly associated with the drug response -profile for . The field indicates whether enrichment was performed on global -proteomics (global) or phosphoproteomics (phospho) data. The field specifies whether the -pathways are associated with relative resistance (resistant, features positively correlated with -drug response) or relative sensitivity (sensitive, features negatively correlated with drug -response). The field denotes whether correlations were computed using single-dose -viability (viability) or dose-response fit_auc (fit_auc). In all cases, pathways are ranked by -−log10(FDR), and significance is annotated using standard thresholds (* <0.05, ** <0.01, *** -<0.001). These figures collectively summarize the biological programs whose protein abundance or -phosphorylation levels track with either enhanced or diminished drug effect across patient samples. - -Note on Onalespib (and other uniformly potent drugs): -Onalespib was one of the most efficacious compounds in the panel, but because nearly all samples -were highly sensitive, the variability across specimens was minimal. Correlation-based enrichment -relies on inter-patient heterogeneity; without it, even strong biological effects cannot be linked -robustly to specific pathways. This explains why Onalespib yields few significant pathways despite -its overall potency. - -### RNA-Seq / Differential Expression Analysis / Gene Set Enrichment - -Samples and preprocessing. -RNA-seq quantifications (Salmon gene-level quant.genes.sf) were retrieved for two conditions from -the same organoid line (NF0021-T1): - -1. untreated organoids -2. Onalespib (1 µM) treated organoids - -Raw read counts were assembled into a gene × sample matrix and filtered to retain genes with total -counts ≥ 10 across the two samples. Size-factor normalization was performed with DESeq2 to obtain -normalized counts. - -### Effect-size estimation (no replication) - -Because we have one sample per condition, formal differential testing is not performed. Instead, we -report exploratory effect sizes: per-gene log2 fold-change (log2FC) computed from normalized counts -as log2FC = log2((Onalespib + 1) / (Untreated + 1)). These values support ranking and visualization, -and hypothesis generation but should be interpreted as descriptive (no p-values). - -### Ranked list for enrichment - -Symbols were converted to ENTREZ ID; when multiple rows mapped to the same ENTREZ, we used the -median log2FC per ENTREZ for enrichment. - -Gene set enrichment analysis (GSEA). -We ran GSEA using clusterProfiler using: - -MSigDB Hallmark (H) via msigdbr (concise, non-redundant "meta-pathways"), - -GO Biological Process (BP) via gseGO - -Both dotplots display GeneRatio on the x-axis (fraction of the pathway's genes that are enriched) -and count as point size. - -#### Figure: `heatmap_top30_all_log2fc.pdf` - -Top 30 transcripts by |log2FC| (including unmapped IDs). -A clustered heatmap of the strongest responders (by absolute log2FC) using normalized counts -centered per gene. Labels use symbols when available, otherwise RefSeq base IDs. This heatmap -preserves completeness and has all of the true top changes. - -#### Figure: `heatmap_top30_mapped_log2fc.pdf` - -Same selection logic but restricted to symbol-mapped genes for the heatmap. This is much more -interpretable and cleaner to share but is technically missing the top differences. - -#### Figure: `gsea_dot_GSEA-MSigDB_Hallmark.pdf` - -Dotplot of MSigDB Hallmark GSEA results -This figure displays enriched Hallmark gene sets ranked by normalized enrichment score, with points -colored by enrichment direction, sized by the number of leading-edge genes, and positioned by -GeneRatio. - -#### Figure: `gsea_dot_GSEA-GO_BP_.pdf` - -Dotplot of GO Biological Processes results -The figure shows enriched GO BP terms from the ranked gene list, with points colored by enrichment -direction, sized by leading-edge gene count, and positioned by GeneRatio. - + - *Sources:* `../00_cNF_helper_code.R`, `../03_leapr_biomarker.R` + - *Goal:* Direction-aware pathway enrichment (leapR) using correlated features (resistant & sensitive). + - *Outputs (examples):* + - Per-drug pathway barplots (e.g., `pathways____top15.pdf`) + - Summary plots across drugs (e.g., top recurrent pathways) + +## Helper scripts (sourced by notebooks) +- **00_cNF_helper_code.R**: Shared utilities (Synapse helpers, plotting helpers, common metadata). +- **01_normalize_batchcorrect_omics.R**: Normalization / Joining Batches / ComBat batch correction code. +- **02_analyze_modality_correlations.R**: Drug summary plots, correlations (includes separated drug-only + modality-only functions). +- **03_leapr_biomarker.R**: Directional feature ranking + leapR enrichment + pathway plotting. From 1e6f9c36738c5ce2b447c5ae1e16ec49f0fc23b5 Mon Sep 17 00:00:00 2001 From: Jeremy Date: Tue, 27 Jan 2026 12:17:58 -0800 Subject: [PATCH 11/13] removed html files --- analysis/01_run_normalize_omics.html | 659 --------------------------- analysis/02_analyze_modality.html | 591 ------------------------ analysis/03_pathway_enrichment.html | 590 ------------------------ 3 files changed, 1840 deletions(-) delete mode 100644 analysis/01_run_normalize_omics.html delete mode 100644 analysis/02_analyze_modality.html delete mode 100644 analysis/03_pathway_enrichment.html diff --git a/analysis/01_run_normalize_omics.html b/analysis/01_run_normalize_omics.html deleted file mode 100644 index e49d4f0..0000000 --- a/analysis/01_run_normalize_omics.html +++ /dev/null @@ -1,659 +0,0 @@ - - - - - - - - - - - - - - - -Normalize Omics - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - -
-

Purpose

-

This notebook is designed to process multiple batches from -transcriptomics, proteomics, and phosphoproteomics data.

-

In its simplest form, it will load the data in from synapse and join -it together in a single matrix. Additional options include batch -correction (needed for global proteomics and phosphoproteomics), batch -correction plots (before and after), and uploading the data back to -synapse.

-
-
-

Get Helper Scripts

-

These scripts contain the functions that we will call below. - -00_cNF_helper_code.R is a basic script that pulls data and sets colors -for samples. - 01_normalize_batchcorrect_omics.R contains all of the -code for retrieving batches from synapse, performing batch correction, -plotting, and uploading.

-
library(synapser)
-syn <- list(get = synapser::synGet, store = synapser::synStore)
-
-# Load helper metadata (00_cNF_helper_code.R defines 'meta' and 'pcols')
-source("00_cNF_helper_code.R")
-
-# Source the pipeline
-source("01_normalize_batchcorrect_omics.R")
-
-
-

Run Batch Correction/Normalization across RNA, Global and Phospho -Data.

-
-

Clean Data

-

Here are the IDs of several samples that were used for protocol -optimization but not designed as experimental samples. We want to remove -these from the subsequent batch correction steps as they could skew the -“real” results. - Add to this list if more protocol optimization, -contaminated samples or anything else we want to remove is present.

-
# Substrings to drop (These were the protocol optimization samples)
-drop_subs <- c(
-  "cNF_organoid_DIA_G_02_11Feb25",
-  "cNF_organoid_DIA_G_05_11Feb25",
-  "cNF_organoid_DIA_G_06_11Feb25",
-  "cNF_organoid_DIA_P_02_29Jan25",
-  "cNF_organoid_DIA_P_05_11Feb25",
-  "cNF_organoid_DIA_P_06_11Feb25"
-)
-
-
-

The Main Function

-

We use run_modality() to perform all of the described -steps.

-

Brief Summary:

-
    -
  • Input: We assign batches to the synapse files using the -batches vector along with several other descriptive details -/ params.
  • -
  • Output: We get out a combined matrix of data that is (optionally) -batch corrected.
  • -
-

Reference guide:

-
-
Inputs
-
    -
  • modality: “global”, “phospho”, or “rna”

  • -
  • batches : list of batch configs, e.g.

    -
      -
    • list( list(syn_id=“syn…”, cohort=1, value_start_col=5, -fname_aliquot_index=8), list(syn_id=“syn…”, cohort=2, value_start_col=5, -fname_aliquot_index=9) )

    • -
    • Required per batch:

      -
        -
      • syn_id : Synapse file ID for the wide feature×sample table
      • -
      • cohort : cohort/batch label (used for joining + ComBat)
      • -
      • fname_aliquot_index: token index (split on “_“) used to parse -aliquot from sample headers
      • -
    • -
    • Optional per batch:

      -
        -
      • value_start_col: first sample column index (auto-detected if -omitted)
      • -
    • -
  • -
  • meta: sample metadata joined by (aliquot, cohort); used to -populate Specimen/Patient/Tumor

  • -
  • syn: synapser client

  • -
-
-
-
Options
-
    -
  • drop_name_substrings: regex pattern(s) to drop unwanted sample -columns
  • -
  • out_dir / out_prefix / save_basename: control output file -locations/names
  • -
  • write_outputs: write CSV/PDF (+upload) vs return in-memory only
  • -
  • upload_parent_id: Synapse folder/project ID for uploads
  • -
  • pcols: optional named colors for PCA (Patient -> color)
  • -
  • do_batch_correct: run ComBat (TRUE) or skip (FALSE)
  • -
-
-
-
What it does
-
    -
  • Normalizes each batch (per modality), combines batches on shared -features, optionally runs ComBat by cohort, then exports long tables + -PCA/hist plots.
  • -
-
-
-
Returns
-
    -
  • SummarizedExperiments: se_batches, se_combined, se_post (and -se_corrected if ComBat ran)
  • -
  • Long tables: long_pre, long_post
  • -
  • PCA data + plots: pca_df_pre/post, plots
  • -
-
-
-
-

RNA Batches

-

I am starting with the RNA batches as these are the simplest ones, -not actually requiring batch correction (seen in the plot titled “rna -samples” after the code runs).

-

I’m going to describe the batches vector/argument that -is required by the run_modality script here.

-
    -
  • A different version of this is needed when more batches are present, -omics changes, or the files are modified.
  • -
  • Each file (even within omics type) may be formatted slightly -differently so value_start_col and -fname_aliquot_index may need to be set differently between -batches.
  • -
-

For the rna_batches vector, files are formatted identically so the -value_start_col are both set to 3 and -fname_aliquot_index is NULL.

-
    -
  • value_start_col: First column where data -begins.
  • -
  • fname_aliquot_index: aliquot index (not used in -RNA)
  • -
  • cohort: this variable is used to set the batch -number.
  • -
-

This code is also written to handle different name formats in the -headers such as NF0018.T1.organoid, NF0018.T1.tissue, NF0018.skin, -NF0018.T2.organoids, NF0018.T2, NF0018.T3.organoids, NF0018.T3. ie: -organoid vs organoids, or mixed up capitalization.

-
rna_batches <- list(
-  list(syn_id = "syn66352931", cohort = 1, value_start_col = 3, fname_aliquot_index = NULL),
-  list(syn_id = "syn70765053", cohort = 2, value_start_col = 3, fname_aliquot_index = NULL)
-)
-
-rna <- run_modality(
-  modality = "rna",
-  batches  = rna_batches,
-  meta     = meta,
-  syn      = syn,
-  drop_name_substrings = drop_subs,
-  out_dir          = "rna_test",
-  out_prefix       = "rna",
-  upload_parent_id = "syn71099587",
-  pcols            = pcols,
-  write_outputs    = FALSE,     # This has already been run so we don't need to write it again unless something changes.
-  save_basename    = "RNA_12_no_batch_correct",
-  do_batch_correct = FALSE      #Note this is set to false right now. Doesn't appear to be needed for RNA 
-)
-

-In the plots above, we see that no batch correction was required or -performed.

-
-
-

Global Proteomic Batches

-

Okay, now we are moving on to global proteomics. This is run in the -same functional method as the RNA data. We set our batch info, assign -modality and other basic info in run_modality and then we -run it. Again, we don’t upload the data (by default atleast). However, -we do batch correct here - the reason why can be seen in the before and -after plots.

-

Here we do use the fname_aliquot_index variable in -batches. This essentially splits by underscore and takes the Xth -value

-

Example for exp2 where fname_aliquot_index is 6:

-
    -
  • D:\DMS_WorkDir1\cNF_organoid_exp2_DIA_G_**01**_01May25_Romulus_BEHCoA-25-02-16.mzML: -01
  • -
  • D:\DMS_WorkDir1\cNF_organoid_exp2_DIA_G_**02**_01May25_Romulus_BEHCoA-25-02-16.mzML: -02
  • -
-
global_batches <- list(
-  list(syn_id = "syn69947355", cohort = 1, value_start_col = 5, fname_aliquot_index = 5),
-  list(syn_id = "syn69947352", cohort = 2, value_start_col = 5, fname_aliquot_index = 6)
-)
-
-global <- run_modality(
-  modality = "Global",
-  batches  = global_batches,
-  meta     = meta,
-  syn      = syn,
-  drop_name_substrings = drop_subs,
-  out_dir          = "global_test",
-  out_prefix       = "global",
-  upload_parent_id = "syn70078365",
-  pcols            = pcols,
-  write_outputs    = FALSE,     # This has already been run so we don't need to write it again unless something changes.
-  save_basename    = "global_batch12_corrected",
-  do_batch_correct = TRUE
-)
-

-
## Found 2 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
-

-Above we can see that the batch correction was successfully performed -for the global proteomic data. If anything changes or if new batches are -added, we can set write_outputs to TRUE and rerun.

-
-
-

Phospho Proteomic Batches

-

Finally, we run this same process for Phosphoproteomics. Again the -run_modality function is used, although the batch info and params are -updated. After this code is run, we will see that batch correction was -required and that it ran successfully.

-

Again we use the fname_aliquot_index variable in -batches. This essentially splits by underscore and takes the Xth -value

-

Example for exp1 where fname_aliquot_index is 8:

-
    -
  • I:\UserData\LeDay\Piehowski_orgonoids_Feb25\RawData\1338217_cNF_organoid_DIA_P_**04**_29Jan25_Ned_BEHCoA-25-01-02.raw: -04
  • -
  • I:\UserData\LeDay\Piehowski_orgonoids_Feb25\RawData\1338241_cNF_organoid_DIA_P_**01**_29Jan25_Ned_BEHCoA-25-01-02.raw: -01
  • -
-
phospho_batches <- list(
-  list(syn_id = "syn69963552", cohort = 1, value_start_col = 7, fname_aliquot_index = 8),
-  list(syn_id = "syn69947351", cohort = 2, value_start_col = 7, fname_aliquot_index = 9)
-)
-
-phospho <- run_modality(
-  modality = "Phospho",
-  batches  = phospho_batches,
-  meta     = meta,
-  syn      = syn,
-  drop_name_substrings = drop_subs,
-  out_dir          = "phospho_test",
-  out_prefix       = "phospho",
-  upload_parent_id = "syn70078365",
-  pcols            = pcols,
-  write_outputs    = FALSE, # This has already been run so we don't need to write it again unless something changes.
-  save_basename    = "phospho_batch12_corrected",
-  do_batch_correct = TRUE
-)
-

-
-
- - - - -
- - - - - - - - - - - - - - - diff --git a/analysis/02_analyze_modality.html b/analysis/02_analyze_modality.html deleted file mode 100644 index 316d8f5..0000000 --- a/analysis/02_analyze_modality.html +++ /dev/null @@ -1,591 +0,0 @@ - - - - - - - - - - - - - - - -Analyze Modality Correlations - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - -
-

Purpose

-

This notebook is designed to take the drug viability data and the -long-format omics tables (RNA, global, and phospho), then compute and -plot correlations between drug response and molecular features. These -long-format omics tables are batch corrected when necessary and have -been returned from the Normalize Omics Notebook.

-

The analyze_modality will load the inputs from Synapse, -build the matrices, save the plots, and return the correlation results -so we can quickly inspect them. This markdown file also uses a -pdf_to_png_if_possible function to help display the plots -directly in the knitted markdown file.

-
-
-

Get Helper Scripts

-

These scripts contain the functions that we will call below.

-
    -
  • 00_cNF_helper_code.R is a basic helper script (Synapse helpers, -metadata, and shared utilities).
  • -
  • 02_analyze_modality_correlations.R contains the code for building -matrices, plotting summaries, and computing correlations.
  • -
-
-
-

Load in Data

-

These are the four main inputs we need. Drug viability plus the three -long-format omics tables.

-
# Drugs
-drugs <- readr::read_tsv(synGet("syn69947322")$path)
-
-# RNA 
-rlong <- readr::read_csv(synGet("syn71333780")$path)
-
-# Global
-glong <- readr::read_csv(synGet("syn70078416")$path)
-
-# Phospho
-plong <- readr::read_csv(synGet("syn70078415")$path)
-
-
-

Run correlations per modality

-

Below, we run the same workflow three times, once per modality. The -only thing that changes is which omics table we use and which column -identifies the feature.

-

The RNA step calls analyze_modality() (end-to-end) -which:
-- Builds a sample x drug matrix and a sample x feature matrix.
-- Saves drug summary plots to the output directory (most_efficacious, -most_variable, and the heatmap).
-- Computes Spearman correlations between drug response and molecular -features.
-- Returns the results as a list so we can inspect them directly in -R.

-

For global and phospho, we call -analyze_modality_correlations() only, which:
-- Reuses the drug matrix from the RNA run.
-- Builds the sample x feature matrix for that modality.
-- Computes Spearman correlations between drug response and molecular -features.
-- Saves the correlation feature summary plot.

-

A few plot notes (these get written to outdir):
-- most_efficacious.pdf: drugs with the lowest mean -viability across samples. This is basically a “top hits” plot.
-- most_variable.pdf: drugs with the highest variability -across samples. These are the ones that may be more -sample-dependent.
-- drug_heatmap_large_viability.pdf: a large -sample-by-drug heatmap, but only for drugs measured in all samples. This -is useful for. clustering and spotting outliers. Samples/patients are -separated by lines, within each are 1-3 tumors.
-- cor_features_by_drug.pdf: counts of significant -correlated features per drug (split by direction). This is a quick way -to see which drugs have real signal in this modality.

-

Note: most_efficacious, -most_variable, and -drug_heatmap_large_viability are based on the drug response -table, not the omics table.
-They get recreated on every end-to-end run, but they are the same plots. -Because of that, we run the end-to-end workflow once (RNA), and then for -global/phospho we only run the modality-only correlation step.

-

Note 2: The end-to-end analyze_modality -function may take a long time to run (~30-45 min).

-
-

RNA

-
corr_rna <- analyze_modality(
-  fits        = drugs,            
-  df_long     = rlong,           
-  sample_col  = "Specimen",
-  feature_col = "feature_id",
-  value_col   = "correctedAbundance",
-  metric      = "uM_viability",
-  heatmap_filename = "rna_drug_heatmap_large_viability.pdf",
-  outdir = "new_figs"
-)
-
-# Copy the shared filenames to RNA-specific names so later modalities don't overwrite what we want to show here
-dir.create("new_figs", showWarnings = FALSE, recursive = TRUE)
-file.copy("new_figs/most_efficacious.pdf", "new_figs/rna_most_efficacious.pdf", overwrite = TRUE)
-file.copy("new_figs/most_variable.pdf",   "new_figs/rna_most_variable.pdf",   overwrite = TRUE)
-file.copy("new_figs/cor_features_by_drug.pdf", "new_figs/rna_cor_features_by_drug.pdf", overwrite = TRUE)
-

For RNA, the “features” are the rows we test in the correlation step, -and they come directly from the feature_id column in the -RNA long table (rlong). In most cases -feature_id is a gene ID (or a transcript ID, depending on -how the RNA file was generated). The workflow builds a sample × feature -matrix from RNA, then correlates every RNA feature against drug -viability for each drug.

-

If a drug shows a large number of significant RNA correlations (after -FDR filtering), that is a strong sign the RNA signal is tracking -response for that drug. In other words, the transcriptome is separating -the sensitive vs resistant samples for that drug in a consistent -way.

-

Below are the plots I show right after running RNA. This is the only -place I show the drug-only plots, because those are based on the drug -response table and are essentially the same no matter which omics table -we are using.

-

Positive Correlation indicates a higher -viability which means that there is lower -effect of the drug / higher resistance to the -drug.

-

Negative Correlation indicates a lower -viability which means that there is greater -effect of the drug / higher sensitivity to the -drug.

-

Metric used: uM_viability.

-

Files shown for RNA:
-- new_figs/rna_most_efficacious.pdf
-- new_figs/rna_most_variable.pdf
-- new_figs/rna_drug_heatmap_large_viability.pdf
-- new_figs/rna_cor_features_by_drug.pdf

-

-
-
-

Global Proteomics

-
corr_global <- analyze_modality_correlations(
-  df_long     = glong,           
-  sample_col  = "Specimen",
-  feature_col = "Gene",
-  value_col   = "correctedAbundance",
-  drug_mat    = corr_rna$drug_mat,
-  outdir      = "new_figs",
-  fdr_thresh  = 0.25
-)
-
-# Copy the shared filenames to global-specific names
-file.copy("new_figs/cor_features_by_drug.pdf", "new_figs/global_cor_features_by_drug.pdf", overwrite = TRUE)
-

For global proteomics, the features are proteins. In theory, these -may be closer to the phenotype than RNA.

-

Global correlation plot

-

-
-
-

Phospho Proteomics

-
corr_phospho <- analyze_modality_correlations(
-  df_long     = plong,           
-  sample_col  = "Specimen",
-  feature_col = "site",
-  value_col   = "correctedAbundance",
-  drug_mat    = corr_rna$drug_mat,
-  outdir      = "new_figs",
-  fdr_thresh  = 0.25
-)
-
-# Copy the shared filenames to phospho-specific names
-file.copy("new_figs/cor_features_by_drug.pdf", "new_figs/phospho_cor_features_by_drug.pdf", overwrite = TRUE)
-

For phospho, the features are phosphorylation sites. It is normal for -this to be noisier and higher-dimensional, but when a drug shows a -cluster of correlated sites, it can be really informative for the -mechanism - however, there is just one correlation here with 2 -features.

-

Here is the phospho correlation plot - for some reason, this one -doesn’t always load, so I’m adding the table too.

-
## NULL
-
corr_phospho$cor_summary
-
## # A tibble: 5 × 4
-##   drug        direction features meanCor
-##   <chr>       <chr>        <int>   <dbl>
-## 1 Daporinad   pos              1   0.847
-## 2 ML 210      neg              1  -0.853
-## 3 NVP-BEZ235  pos              2   0.864
-## 4 Tofacitinib pos              1   0.868
-## 5 Tovorafenib pos              1   0.891
-
-
- - - - -
- - - - - - - - - - - - - - - diff --git a/analysis/03_pathway_enrichment.html b/analysis/03_pathway_enrichment.html deleted file mode 100644 index 82e45f6..0000000 --- a/analysis/03_pathway_enrichment.html +++ /dev/null @@ -1,590 +0,0 @@ - - - - - - - - - - - - - - - -Pathway Enrichment - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - -
-

Purpose

-

This script runs pathway enrichment using leapR, using the drug -viability data plus the long-format omics tables (RNA, global, and -phospho).
-These long-format omics tables are batch corrected when necessary and -have been returned from the Normalize Omics Notebook.

-

The main idea is:
-- Split samples into “sensitive” vs “resistant” for each drug (based on -viability).
-- Run enrichment on each side.
-- Save the enrichment tables and plots so we can quickly scan which -pathways show up for which drugs.

-

This file is written to be simple and repeatable. It loads the inputs -from Synapse, runs the same workflow per modality, and stores results in -a cache file so we do not re-run heavy work unless we need to.

-

Note: This will take a couple of hours to run the -first time. Do not overwrite the cache unless -completely necessary.

-
-
-

Get Helper Scripts

-

These scripts contain the functions that we will call below.

-
    -
  • 00_cNF_helper_code.R is a basic helper script (Synapse helpers, -metadata, and shared utilities).
    -
  • -
  • 03_leapr_biomarker.R contains the leapR wrapper functions, caching, -and plotting helpers.
  • -
-
-
-

Load in Data

-

These are the four main inputs we need - just like in -analyze_modality. Drug viability plus the three long-format omics -tables.

-
# Drugs
-drugs <- readr::read_tsv(synGet("syn69947322")$path)
-
-# RNA 
-rlong <- readr::read_csv(synGet("syn71333780")$path)
-
-# Global
-glong <- readr::read_csv(synGet("syn70078416")$path)
-
-# Phospho
-plong <- readr::read_csv(synGet("syn70078415")$path)
-
-
-

Run Pathway Enrichment for Each Omics Modality

-

Below, we run the same workflow three times, once per modality. The -main things that change are:
-- which omics table we use - which column represents the feature - which -geneset library we test (krbpaths vs kinasesubstrates)

-

Each call to run_leapr_directional_one_cached() does a -few things:
-- Splits samples into “sensitive” vs “resistant” for each drug (based on -viability).
-- Runs enrichment for each drug and direction.
-- Writes CSV outputs (if write_csvs = TRUE).
-- Saves an .Rdata cache so future runs are fast unless inputs or -settings change.

-

One note about plots: run_leapr_directional_one_cached() -computes the enrichment results, but it does not automatically save the -pathway barplots.
-Those are generated by save_leapr_plots(). In this Rmd, I -intentionally save plots for just mirda and -olaparib so we do not generate hundreds of drug PDFs by -accident.

-

So, If you want more plots, just add more drugs to the -drugs = c(...) list for each modality.

-
-

RNA (krbpaths).

-

For RNA, the features come from the feature_id column in -the RNA long table (rlong). In most cases, this is a gene -or transcript ID. Here we use the krbpaths gene set -collection.

-
res_bio_rna <- run_leapr_directional_one_cached(
-  drugs        = drugs,
-  df_long      = rlong,
-  sample_col   = "Specimen",
-  feature_col  = "feature_id",
-  value_col    = "correctedAbundance",
-  omic_label   = "rna",
-  cache_path   = "leapR_RNA_krbpaths_enrichment_direction_split.Rdata",
-  write_csvs   = TRUE,
-  always_rerun = FALSE,
-  test_one     = FALSE,
-  geneset_name = "krbpaths"
-)
-
-# This is where pathway plot PDFs get created.
-# I keep the list short on purpose so the report stays readable.
-save_leapr_plots(
-  res_bio_rna,
-  omic_label = "rna",
-  drugs = c("Mirdametinib", "olaparib"),
-  top_n = 15
-)
-

Below are the RNA pathway plots created by -save_leapr_plots(). These files get written to -figs/.

-

-
-
-

Global Proteomics (krbpaths)

-

For global proteomics, the features are proteins in the -Gene column. We keep the same krbpaths gene -set collection so RNA and global are easier to compare.

-
res_bio_global <- run_leapr_directional_one_cached(
-  drugs        = drugs,
-  df_long      = glong,
-  sample_col   = "Specimen",
-  feature_col  = "Gene",
-  value_col    = "correctedAbundance",
-  omic_label   = "global",
-  cache_path   = "leapR_Global_krbpaths_enrichment_direction_split.Rdata",
-  write_csvs   = TRUE,
-  always_rerun = FALSE,
-  test_one     = FALSE,
-  geneset_name = "krbpaths"
-)
-
-save_leapr_plots(
-  res_bio_global,
-  omic_label = "global",
-  drugs = c("Mirdametinib", "olaparib"),
-  top_n = 15
-)
-

Here are the global pathway plots. Same idea as RNA, just using -protein abundance instead of RNA abundance.

-

-
-
-

Phospho Proteomics (kinasesubstrates)

-

For phospho, the features are phosphorylation sites in the -site column. Here we use a phospho-specific gene set -library, kinasesubstrates, because site-level maps more -cleanly to kinase activity than to general pathway collections.

-
res_bio_phospho <- run_leapr_directional_one_cached(
-  drugs        = drugs,
-  df_long      = plong,
-  sample_col   = "Specimen",
-  feature_col  = "site",
-  value_col    = "correctedAbundance",
-  omic_label   = "phospho",
-  cache_path   = "leapR_Phospho_kinasesubstrates_enrichment_direction_split.Rdata",
-  write_csvs   = TRUE,
-  always_rerun = FALSE,
-  test_one     = FALSE,
-  geneset_name = "kinasesubstrates"
-)
-
-save_leapr_plots(
-  res_bio_phospho,
-  omic_label = "phospho",
-  drugs = c("Mirdametinib", "olaparib"),
-  top_n = 15
-)
-

Here are the phospho pathway plots

-

-
-
-
-

Notes

-
    -
  • If you want to force a full rerun (ignore cache), set -always_rerun = TRUE.
    -
  • -
  • If you only want to test quickly on one drug, set -test_one = TRUE (useful for debugging).
    -
  • -
  • If you want plots for more drugs, add them to the -drugs = c(...) list in each save_leapr_plots() -call.
    -
  • -
  • All pathway plot files are saved under figs/ with names -like: -
      -
    • figs/pathways_{resistant|sensitive}_top.pdf.
    • -
  • -
-
- - - - -
- - - - - - - - - - - - - - - From 44ec712228879061149ee71a3963a4d69a77a636 Mon Sep 17 00:00:00 2001 From: Jeremy Date: Tue, 27 Jan 2026 12:18:53 -0800 Subject: [PATCH 12/13] updated readme --- analysis/README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/analysis/README.md b/analysis/README.md index 8b6c123..d18082a 100644 --- a/analysis/README.md +++ b/analysis/README.md @@ -1,7 +1,6 @@ # cNF multi-omics analysis pipeline overview Purpose: Reference for running the cNF analysis end-to-end and what each stage produces. The notebooks (`.Rmd`) are the primary analysis entry points; the `.R` scripts are sourced helpers. -The `.html` files are the knitted R markdown files from the latest run. ## Quick run order (and what each file sources) From 90838ca558734e505726395b0ab2853582c7c7fa Mon Sep 17 00:00:00 2001 From: Jeremy Date: Thu, 29 Jan 2026 16:46:31 -0800 Subject: [PATCH 13/13] Numerous updates: Readded HTML files. Added date in html, created source directory, added source links in analysis readme, changed source paths in files, added significant features plot in leapr anaysis, doc strings and comments updated --- analysis/01_run_normalize_omics.Rmd | 184 +++-- analysis/01_run_normalize_omics.html | 677 ++++++++++++++++++ analysis/02_analyze_modality.Rmd | 81 +-- analysis/02_analyze_modality.html | 598 ++++++++++++++++ analysis/03_pathway_enrichment.Rmd | 28 +- analysis/03_pathway_enrichment.html | 616 ++++++++++++++++ analysis/README.md | 23 +- .../00_cNF_helper_code.R | 4 +- .../01_normalize_batchcorrect_omics.R | 20 +- .../02_analyze_modality_correlations.R | 12 +- .../03_leapr_biomarker.R | 149 +++- 11 files changed, 2220 insertions(+), 172 deletions(-) create mode 100644 analysis/01_run_normalize_omics.html create mode 100644 analysis/02_analyze_modality.html create mode 100644 analysis/03_pathway_enrichment.html rename 00_cNF_helper_code.R => source/00_cNF_helper_code.R (74%) rename 01_normalize_batchcorrect_omics.R => source/01_normalize_batchcorrect_omics.R (98%) rename 02_analyze_modality_correlations.R => source/02_analyze_modality_correlations.R (97%) rename 03_leapr_biomarker.R => source/03_leapr_biomarker.R (84%) diff --git a/analysis/01_run_normalize_omics.Rmd b/analysis/01_run_normalize_omics.Rmd index 45c17a6..39b30a3 100644 --- a/analysis/01_run_normalize_omics.Rmd +++ b/analysis/01_run_normalize_omics.Rmd @@ -1,10 +1,7 @@ --- -title: "Normalize Omics" +title: "Normalize Omics And Batch Correct" author: "Jeremy Jacobson" -date: "2025-11-04" -output: - html_document: default - pdf_document: default +date: "`r format(Sys.Date(), '%B %d, %Y')`" --- ### Purpose @@ -15,11 +12,11 @@ In its simplest form, it will load the data in from synapse and join it together ### Get Helper Scripts -These scripts contain the functions that we will call below. -- 00_cNF_helper_code.R is a basic script that pulls data and sets colors for samples. +These scripts contain the functions that we will call below. +- 00_cNF_helper_code.R is a basic script that pulls data and sets colors for samples. - 01_normalize_batchcorrect_omics.R contains all of the code for retrieving batches from synapse, - performing batch correction, plotting, and uploading. - + performing batch correction, plotting, and uploading. + ```{r initiate, include=FALSE} knitr::opts_chunk$set(echo = TRUE) @@ -30,10 +27,10 @@ library(synapser) syn <- list(get = synapser::synGet, store = synapser::synStore) # Load helper metadata (00_cNF_helper_code.R defines 'meta' and 'pcols') -source("../00_cNF_helper_code.R") +source("../source/00_cNF_helper_code.R") # Source the pipeline -source("../01_normalize_batchcorrect_omics.R") +source("../source/01_normalize_batchcorrect_omics.R") ``` @@ -42,7 +39,7 @@ source("../01_normalize_batchcorrect_omics.R") #### Clean Data Here are the IDs of several samples that were used for protocol optimization but not designed as experimental samples. We want to remove these from the subsequent batch correction steps as they could skew the "real" results. -- Add to this list if more protocol optimization, contaminated samples or anything else we want to remove is present. +- Add to this list if more protocol optimization, contaminated samples or anything else we want to remove is present. ```{r} # Substrings to drop (These were the protocol optimization samples) @@ -59,72 +56,72 @@ drop_subs <- c( #### The Main Function -We use `run_modality()` to perform all of the described steps. +We use `run_modality()` to perform all of the described steps. -**Brief Summary**: +**Brief Summary**: -- Input: We assign batches to the synapse files using the `batches` vector along with several other descriptive details / params. -- Output: We get out a combined matrix of data that is (optionally) batch corrected. +- Input: We assign batches to the synapse files using the `batches` vector along with several other descriptive details / params. +- Output: We get out a combined matrix of data that is (optionally) batch corrected. -**Reference guide**: +**Reference guide**: -##### Inputs +##### Inputs -- modality: "global", "phospho", or "rna" -- batches : list of batch configs, e.g. +- modality: "global", "phospho", or "rna" +- batches : list of batch configs, e.g. - list( list(syn_id="syn...", cohort=1, value_start_col=5, fname_aliquot_index=8), list(syn_id="syn...", cohort=2, value_start_col=5, fname_aliquot_index=9) - ) - - - Required per batch: - - syn_id : Synapse file ID for the wide feature×sample table - - cohort : cohort/batch label (used for joining + ComBat) - - fname_aliquot_index: token index (split on "_") used to parse aliquot from sample headers - - - Optional per batch: - - value_start_col: first sample column index (auto-detected if omitted) - -- meta: sample metadata joined by (aliquot, cohort); used to populate Specimen/Patient/Tumor -- syn: synapser client - -##### Options - -- drop_name_substrings: regex pattern(s) to drop unwanted sample columns -- out_dir / out_prefix / save_basename: control output file locations/names -- write_outputs: write CSV/PDF (+upload) vs return in-memory only -- upload_parent_id: Synapse folder/project ID for uploads -- pcols: optional named colors for PCA (Patient -> color) -- do_batch_correct: run ComBat (TRUE) or skip (FALSE) - -##### What it does - -- Normalizes each batch (per modality), combines batches on shared features, optionally runs ComBat by cohort, then exports long tables + PCA/hist plots. - -##### Returns - -- SummarizedExperiments: se_batches, se_combined, se_post (and se_corrected if ComBat ran) -- Long tables: long_pre, long_post -- PCA data + plots: pca_df_pre/post, plots - -### RNA Batches - -I am starting with the RNA batches as these are the simplest ones, not actually requiring batch correction (seen in the plot titled "rna samples" after the code runs). - -I'm going to describe the `batches` vector/argument that is required by the `run_modality` script here. - -- A different version of this is needed when more batches are present, omics changes, or the files are modified. -- Each file (even within omics type) may be formatted slightly differently so `value_start_col` and `fname_aliquot_index` may need to be set differently between batches. - -For the rna_batches vector, files are formatted identically so the `value_start_col` are both set to 3 and `fname_aliquot_index` is NULL. - -- **value_start_col**: First column where data begins. -- **fname_aliquot_index**: aliquot index (not used in RNA) -- **cohort**: this variable is used to set the batch number. - -This code is also written to handle different name formats in the headers such as NF0018.T1.organoid, NF0018.T1.tissue, NF0018.skin, NF0018.T2.organoids, NF0018.T2, NF0018.T3.organoids, NF0018.T3. ie: organoid vs organoids, or mixed up capitalization. + ) + + - Required per batch: + - syn_id : Synapse file ID for the wide feature×sample table + - cohort : cohort/batch label (used for joining + ComBat) + - fname_aliquot_index: token index (split on "_") used to parse aliquot from sample headers + + - Optional per batch: + - value_start_col: first sample column index (auto-detected if omitted) + +- meta: sample metadata joined by (aliquot, cohort); used to populate Specimen/Patient/Tumor +- syn: synapser client + +##### Options + +- drop_name_substrings: regex pattern(s) to drop unwanted sample columns +- out_dir / out_prefix / save_basename: control output file locations/names +- write_outputs: write CSV/PDF (+upload) vs return in-memory only +- upload_parent_id: Synapse folder/project ID for uploads +- pcols: optional named colors for PCA (Patient -> color) +- do_batch_correct: run ComBat (TRUE) or skip (FALSE) + +##### What it does + +- Normalizes each batch (per modality), combines batches on shared features, optionally runs ComBat by cohort, then exports long tables + PCA/hist plots. + +##### Returns + +- SummarizedExperiments: se_batches, se_combined, se_post (and se_corrected if ComBat ran) +- Long tables: long_pre, long_post +- PCA data + plots: pca_df_pre/post, plots + +### RNA Batches + +I am starting with the RNA batches as these are the simplest ones, not actually requiring batch correction (seen in the plot titled "rna samples" after the code runs). +I'm going to describe the `batches` vector/argument that is required by the `run_modality` script here. + +- A different version of this is needed when more batches are present, omics changes, or the files are modified. +- Each file (even within omics type) may be formatted slightly differently so `value_start_col` and `fname_aliquot_index` may need to be set differently between batches. + +For the rna_batches vector, files are formatted identically so the `value_start_col` are both set to 3 and `fname_aliquot_index` is NULL. + +- **value_start_col**: First column where data begins. +- **fname_aliquot_index**: aliquot index (not used in RNA) +- **cohort**: this variable is used to set the batch number. + +This code is also written to handle different name formats in the headers such as NF0018.T1.organoid, NF0018.T1.tissue, NF0018.skin, NF0018.T2.organoids, NF0018.T2, NF0018.T3.organoids, NF0018.T3. ie: organoid vs organoids, or mixed up capitalization. + ```{r rna, message=FALSE, warning=FALSE, fig.height=3.5, fig.width=6, out.width="50%"} rna_batches <- list( @@ -147,20 +144,20 @@ rna <- run_modality( do_batch_correct = FALSE #Note this is set to false right now. Doesn't appear to be needed for RNA ) ``` -In the plots above, we see that no batch correction was required or performed. - -### Global Proteomic Batches - -Okay, now we are moving on to global proteomics. This is run in the same functional method as the RNA data. We set our batch info, assign modality and other basic info in `run_modality` and then we run it. Again, we don't upload the data (by default atleast). However, we do batch correct here - the reason why can be seen in the before and after plots. - -Here we do use the `fname_aliquot_index` variable in batches. This essentially splits by underscore and takes the Xth value - -Example for exp2 where fname_aliquot_index is 6: - -- `D:\DMS_WorkDir1\cNF_organoid_exp2_DIA_G_**01**_01May25_Romulus_BEHCoA-25-02-16.mzML`: 01 -- `D:\DMS_WorkDir1\cNF_organoid_exp2_DIA_G_**02**_01May25_Romulus_BEHCoA-25-02-16.mzML`: 02 - +In the plots above, we see that no batch correction was required or performed. + +### Global Proteomic Batches + +Okay, now we are moving on to global proteomics. This is run in the same functional method as the RNA data. We set our batch info, assign modality and other basic info in `run_modality` and then we run it. Again, we don't upload the data (by default atleast). However, we do batch correct here - the reason why can be seen in the before and after plots. + +Here we do use the `fname_aliquot_index` variable in batches. This essentially splits by underscore and takes the Xth value + +Example for exp2 where fname_aliquot_index is 6: + +- `D:\DMS_WorkDir1\cNF_organoid_exp2_DIA_G_**01**_01May25_Romulus_BEHCoA-25-02-16.mzML`: 01 +- `D:\DMS_WorkDir1\cNF_organoid_exp2_DIA_G_**02**_01May25_Romulus_BEHCoA-25-02-16.mzML`: 02 + ```{r global, message=FALSE, warning=FALSE, fig.height=3.5, fig.width=6, out.width="50%"} global_batches <- list( list(syn_id = "syn69947355", cohort = 1, value_start_col = 5, fname_aliquot_index = 5), @@ -184,19 +181,20 @@ global <- run_modality( ``` -Above we can see that the batch correction was successfully performed for the global proteomic data. If anything changes or if new batches are added, we can set write_outputs to TRUE and rerun. - -### Phospho Proteomic Batches - -Finally, we run this same process for Phosphoproteomics. Again the run_modality function is used, although the batch info and params are updated. After this code is run, we will see that batch correction was required and that it ran successfully. - -Again we use the `fname_aliquot_index` variable in batches. This essentially splits by underscore and takes the Xth value - -Example for exp1 where fname_aliquot_index is 8: - -- `I:\UserData\LeDay\Piehowski_orgonoids_Feb25\RawData\1338217_cNF_organoid_DIA_P_**04**_29Jan25_Ned_BEHCoA-25-01-02.raw`: 04 -- `I:\UserData\LeDay\Piehowski_orgonoids_Feb25\RawData\1338241_cNF_organoid_DIA_P_**01**_29Jan25_Ned_BEHCoA-25-01-02.raw`: 01 + +Above we can see that the batch correction was successfully performed for the global proteomic data. If anything changes or if new batches are added, we can set write_outputs to TRUE and rerun. +### Phospho Proteomic Batches + +Finally, we run this same process for Phosphoproteomics. Again the run_modality function is used, although the batch info and params are updated. After this code is run, we will see that batch correction was required and that it ran successfully. + +Again we use the `fname_aliquot_index` variable in batches. This essentially splits by underscore and takes the Xth value. + +Example for exp1 where fname_aliquot_index is 8: + +- `I:\UserData\LeDay\Piehowski_orgonoids_Feb25\RawData\1338217_cNF_organoid_DIA_P_**04**_29Jan25_Ned_BEHCoA-25-01-02.raw`: 04 +- `I:\UserData\LeDay\Piehowski_orgonoids_Feb25\RawData\1338241_cNF_organoid_DIA_P_**01**_29Jan25_Ned_BEHCoA-25-01-02.raw`: 01 + ```{r phospho, message=FALSE, warning=FALSE, fig.height=3.5, fig.width=6, out.width="50%"} diff --git a/analysis/01_run_normalize_omics.html b/analysis/01_run_normalize_omics.html new file mode 100644 index 0000000..95d1a79 --- /dev/null +++ b/analysis/01_run_normalize_omics.html @@ -0,0 +1,677 @@ + + + + + + + + + + + + + + + +Normalize Omics And Batch Correct + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
+

Purpose

+

This notebook is designed to process multiple batches from +transcriptomics, proteomics, and phosphoproteomics data.

+

In its simplest form, it will load the data in from synapse and join +it together in a single matrix. Additional options include batch +correction (needed for global proteomics and phosphoproteomics), batch +correction plots (before and after), and uploading the data back to +synapse.

+
+
+

Get Helper Scripts

+

These scripts contain the functions that we will call below.
+- 00_cNF_helper_code.R is a basic script that pulls data and sets colors +for samples.
+- 01_normalize_batchcorrect_omics.R contains all of the code for +retrieving batches from synapse, performing batch correction, plotting, +and uploading.

+
library(synapser)
+syn <- list(get = synapser::synGet, store = synapser::synStore)
+
+# Load helper metadata (00_cNF_helper_code.R defines 'meta' and 'pcols')
+source("../source/00_cNF_helper_code.R")
+
+# Source the pipeline
+source("../source/01_normalize_batchcorrect_omics.R")
+
+
+

Run Batch Correction/Normalization across RNA, Global and Phospho +Data.

+
+

Clean Data

+

Here are the IDs of several samples that were used for protocol +optimization but not designed as experimental samples. We want to remove +these from the subsequent batch correction steps as they could skew the +“real” results. - Add to this list if more protocol optimization, +contaminated samples or anything else we want to remove is present.

+
# Substrings to drop (These were the protocol optimization samples)
+drop_subs <- c(
+  "cNF_organoid_DIA_G_02_11Feb25",
+  "cNF_organoid_DIA_G_05_11Feb25",
+  "cNF_organoid_DIA_G_06_11Feb25",
+  "cNF_organoid_DIA_P_02_29Jan25",
+  "cNF_organoid_DIA_P_05_11Feb25",
+  "cNF_organoid_DIA_P_06_11Feb25"
+)
+
+
+

The Main Function

+

We use run_modality() to perform all of the described +steps.

+

Brief Summary:

+
    +
  • Input: We assign batches to the synapse files using the +batches vector along with several other descriptive details +/ params.
    +
  • +
  • Output: We get out a combined matrix of data that is (optionally) +batch corrected.
  • +
+

Reference guide:

+
+
Inputs
+
    +
  • modality: “global”, “phospho”, or “rna”

  • +
  • batches : list of batch configs, e.g. 

    +
      +
    • list( list(syn_id=“syn…”, cohort=1, value_start_col=5, +fname_aliquot_index=8), list(syn_id=“syn…”, cohort=2, value_start_col=5, +fname_aliquot_index=9) )

    • +
    • Required per batch:

      +
        +
      • syn_id : Synapse file ID for the wide feature×sample table
        +
      • +
      • cohort : cohort/batch label (used for joining + ComBat)
        +
      • +
      • fname_aliquot_index: token index (split on “_“) used to parse +aliquot from sample headers
      • +
    • +
    • Optional per batch:

      +
        +
      • value_start_col: first sample column index (auto-detected if +omitted)
      • +
    • +
  • +
  • meta: sample metadata joined by (aliquot, cohort); used to +populate Specimen/Patient/Tumor
    +

  • +
  • syn: synapser client

  • +
+
+
+
Options
+
    +
  • drop_name_substrings: regex pattern(s) to drop unwanted sample +columns
    +
  • +
  • out_dir / out_prefix / save_basename: control output file +locations/names
    +
  • +
  • write_outputs: write CSV/PDF (+upload) vs return in-memory +only
    +
  • +
  • upload_parent_id: Synapse folder/project ID for uploads
    +
  • +
  • pcols: optional named colors for PCA (Patient -> color)
    +
  • +
  • do_batch_correct: run ComBat (TRUE) or skip (FALSE)
  • +
+
+
+
What it does
+
    +
  • Normalizes each batch (per modality), combines batches on shared +features, optionally runs ComBat by cohort, then exports long tables + +PCA/hist plots.
  • +
+
+
+
Returns
+
    +
  • SummarizedExperiments: se_batches, se_combined, se_post (and +se_corrected if ComBat ran)
    +
  • +
  • Long tables: long_pre, long_post
    +
  • +
  • PCA data + plots: pca_df_pre/post, plots
  • +
+
+
+
+

RNA Batches

+

I am starting with the RNA batches as these are the simplest ones, +not actually requiring batch correction (seen in the plot titled “rna +samples” after the code runs).

+

I’m going to describe the batches vector/argument that +is required by the run_modality script here.

+
    +
  • A different version of this is needed when more batches are present, +omics changes, or the files are modified.
    +
  • +
  • Each file (even within omics type) may be formatted slightly +differently so value_start_col and +fname_aliquot_index may need to be set differently between +batches.
  • +
+

For the rna_batches vector, files are formatted identically so the +value_start_col are both set to 3 and +fname_aliquot_index is NULL.

+
    +
  • value_start_col: First column where data +begins.
    +
  • +
  • fname_aliquot_index: aliquot index (not used in +RNA)
    +
  • +
  • cohort: this variable is used to set the batch +number.
  • +
+

This code is also written to handle different name formats in the +headers such as NF0018.T1.organoid, NF0018.T1.tissue, NF0018.skin, +NF0018.T2.organoids, NF0018.T2, NF0018.T3.organoids, NF0018.T3. ie: +organoid vs organoids, or mixed up capitalization.

+
rna_batches <- list(
+  list(syn_id = "syn66352931", cohort = 1, value_start_col = 3, fname_aliquot_index = NULL),
+  list(syn_id = "syn70765053", cohort = 2, value_start_col = 3, fname_aliquot_index = NULL)
+)
+
+rna <- run_modality(
+  modality = "rna",
+  batches  = rna_batches,
+  meta     = meta,
+  syn      = syn,
+  drop_name_substrings = drop_subs,
+  out_dir          = "rna_test",
+  out_prefix       = "rna",
+  upload_parent_id = "syn71099587",
+  pcols            = pcols,
+  write_outputs    = FALSE,     # This has already been run so we don't need to write it again unless something changes.
+  save_basename    = "RNA_12_no_batch_correct",
+  do_batch_correct = FALSE      #Note this is set to false right now. Doesn't appear to be needed for RNA 
+)
+

+In the plots above, we see that no batch correction was required or +performed.

+
+
+

Global Proteomic Batches

+

Okay, now we are moving on to global proteomics. This is run in the +same functional method as the RNA data. We set our batch info, assign +modality and other basic info in run_modality and then we +run it. Again, we don’t upload the data (by default atleast). However, +we do batch correct here - the reason why can be seen in the before and +after plots.

+

Here we do use the fname_aliquot_index variable in +batches. This essentially splits by underscore and takes the Xth +value

+

Example for exp2 where fname_aliquot_index is 6:

+
    +
  • D:\DMS_WorkDir1\cNF_organoid_exp2_DIA_G_**01**_01May25_Romulus_BEHCoA-25-02-16.mzML: +01
    +
  • +
  • D:\DMS_WorkDir1\cNF_organoid_exp2_DIA_G_**02**_01May25_Romulus_BEHCoA-25-02-16.mzML: +02
  • +
+
global_batches <- list(
+  list(syn_id = "syn69947355", cohort = 1, value_start_col = 5, fname_aliquot_index = 5),
+  list(syn_id = "syn69947352", cohort = 2, value_start_col = 5, fname_aliquot_index = 6)
+)
+
+global <- run_modality(
+  modality = "Global",
+  batches  = global_batches,
+  meta     = meta,
+  syn      = syn,
+  drop_name_substrings = drop_subs,
+  out_dir          = "global_test",
+  out_prefix       = "global",
+  upload_parent_id = "syn70078365",
+  pcols            = pcols,
+  write_outputs    = FALSE,     # This has already been run so we don't need to write it again unless something changes.
+  save_basename    = "global_batch12_corrected",
+  do_batch_correct = TRUE
+)
+

+
## Found 2 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
+

+

Above we can see that the batch correction was successfully performed +for the global proteomic data. If anything changes or if new batches are +added, we can set write_outputs to TRUE and rerun.

+
+
+

Phospho Proteomic Batches

+

Finally, we run this same process for Phosphoproteomics. Again the +run_modality function is used, although the batch info and params are +updated. After this code is run, we will see that batch correction was +required and that it ran successfully.

+

Again we use the fname_aliquot_index variable in +batches. This essentially splits by underscore and takes the Xth +value.

+

Example for exp1 where fname_aliquot_index is 8:

+
    +
  • I:\UserData\LeDay\Piehowski_orgonoids_Feb25\RawData\1338217_cNF_organoid_DIA_P_**04**_29Jan25_Ned_BEHCoA-25-01-02.raw: +04
    +
  • +
  • I:\UserData\LeDay\Piehowski_orgonoids_Feb25\RawData\1338241_cNF_organoid_DIA_P_**01**_29Jan25_Ned_BEHCoA-25-01-02.raw: +01
  • +
+
phospho_batches <- list(
+  list(syn_id = "syn69963552", cohort = 1, value_start_col = 7, fname_aliquot_index = 8),
+  list(syn_id = "syn69947351", cohort = 2, value_start_col = 7, fname_aliquot_index = 9)
+)
+
+phospho <- run_modality(
+  modality = "Phospho",
+  batches  = phospho_batches,
+  meta     = meta,
+  syn      = syn,
+  drop_name_substrings = drop_subs,
+  out_dir          = "phospho_test",
+  out_prefix       = "phospho",
+  upload_parent_id = "syn70078365",
+  pcols            = pcols,
+  write_outputs    = FALSE, # This has already been run so we don't need to write it again unless something changes.
+  save_basename    = "phospho_batch12_corrected",
+  do_batch_correct = TRUE
+)
+

+
+
+ + + + +
+ + + + + + + + + + + + + + + diff --git a/analysis/02_analyze_modality.Rmd b/analysis/02_analyze_modality.Rmd index 21f777c..1f5611b 100644 --- a/analysis/02_analyze_modality.Rmd +++ b/analysis/02_analyze_modality.Rmd @@ -1,27 +1,22 @@ --- title: "Analyze Modality Correlations" author: "Jeremy Jacobson" -date: "2025-11-12" -output: html_document +date: "`r format(Sys.Date(), '%B %d, %Y')`" --- -### Purpose - -This notebook is designed to take the drug viability data and the long-format omics tables (RNA, global, and phospho), -then compute and plot correlations between drug response and molecular features. These long-format omics tables are batch corrected -when necessary and have been returned from the Normalize Omics Notebook. - -The `analyze_modality` will load the inputs from Synapse, build the matrices, save the plots, and return the -correlation results so we can quickly inspect them. This markdown file also uses a `pdf_to_png_if_possible` function to help display the -plots directly in the knitted markdown file. - -### Get Helper Scripts +### Purpose + +This notebook is designed to take the drug viability data and the long-format omics tables (RNA, global, and phospho), then compute and plot correlations between drug response and molecular features. These long-format omics tables are batch corrected when necessary and have been returned from the Normalize Omics Notebook. + +The `analyze_modality` will load the inputs from Synapse, build the matrices, save the plots, and return the correlation results so we can quickly inspect them. This markdown file also uses a `pdf_to_png_if_possible` function to help display the plots directly in the knitted markdown file. -These scripts contain the functions that we will call below. +### Get Helper Scripts -- 00_cNF_helper_code.R is a basic helper script (Synapse helpers, metadata, and shared utilities). -- 02_analyze_modality_correlations.R contains the code for building matrices, plotting summaries, and computing correlations. +These scripts contain the functions that we will call below. +- 00_cNF_helper_code.R is a basic helper script (Synapse helpers, metadata, and shared utilities). +- 02_analyze_modality_correlations.R contains the code for building matrices, plotting summaries, and computing correlations. + ```{r initiate, include=FALSE} knitr::opts_chunk$set(echo = TRUE) @@ -32,14 +27,14 @@ knitr::opts_chunk$set(echo = TRUE) ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) -source("../00_cNF_helper_code.R") -source("../02_analyze_modality_correlations.R") +source("../source/00_cNF_helper_code.R") +source("../source/02_analyze_modality_correlations.R") ``` -## Load in Data - -These are the four main inputs we need. Drug viability plus the three long-format omics tables. - +## Load in Data + +These are the four main inputs we need. Drug viability plus the three long-format omics tables. + ```{r load, echo=TRUE, results='hide', message=FALSE, warning=FALSE, cache=TRUE} # Drugs drugs <- readr::read_tsv(synGet("syn69947322")$path) @@ -56,8 +51,7 @@ plong <- readr::read_csv(synGet("syn70078415")$path) ## Run correlations per modality -Below, we run the same workflow three times, once per modality. The only thing that changes is which omics table -we use and which column identifies the feature. +Below, we run the same workflow three times, once per modality. The only thing that changes is which omics table we use and which column identifies the feature. The RNA step calls `analyze_modality()` (end-to-end) which: - Builds a sample x drug matrix and a sample x feature matrix. @@ -74,16 +68,15 @@ For global and phospho, we call `analyze_modality_correlations()` only, which: A few plot notes (these get written to `outdir`): - **most_efficacious.pdf**: drugs with the lowest mean viability across samples. This is basically a "top hits" plot. - **most_variable.pdf**: drugs with the highest variability across samples. These are the ones that may be more sample-dependent. -- **drug_heatmap_large_viability.pdf**: a large sample-by-drug heatmap, but only for drugs measured in all samples. This is useful for. clustering and spotting outliers. Samples/patients are separated by lines, within each are 1-3 tumors. +- **drug_heatmap_large_viability.pdf**: a large sample-by-drug heatmap, but only for drugs measured in all samples. This is useful for clustering and spotting outliers. Samples/patients are separated by lines, within each are 1-3 tumors. - **cor_features_by_drug.pdf**: counts of significant correlated features per drug (split by direction). This is a quick way to see which drugs have real signal in this modality. **Note**: `most_efficacious`, `most_variable`, and `drug_heatmap_large_viability` are based on the drug response table, not the omics table. -They get recreated on every end-to-end run, but they are the same plots. Because of that, we run the end-to-end workflow once (RNA), -and then for global/phospho we only run the modality-only correlation step. +They get recreated on every end-to-end run, but they are the same plots. Because of that, we run the end-to-end workflow once (RNA), and then for global/phospho we only run the modality-only correlation step. -**Note 2:** The end-to-end `analyze_modality` function may take a long time to run (~30-45 min). - -### RNA +**Note 2:** The end-to-end `analyze_modality` function may take a long time to run (~30-45 min). + +### RNA ```{r rna, echo=TRUE, results='hide', message=FALSE, warning=FALSE, cache=TRUE} corr_rna <- analyze_modality( @@ -111,12 +104,12 @@ If a drug shows a large number of significant RNA correlations (after FDR filter Below are the plots I show right after running RNA. This is the only place I show the drug-only plots, because those are based on the drug response table and are essentially the same no matter which omics table we are using. -**Positive Correlation** indicates a **higher viability** which means that there is **lower effect** of the drug / **higher resistance** to the drug. - -**Negative Correlation** indicates a **lower viability** which means that there is **greater effect** of the drug / **higher sensitivity** to the drug. - -Metric used: `uM_viability`. +**Positive Correlation** indicates a **higher viability** which means that there is **lower effect** of the drug / **higher resistance** to the drug. + +**Negative Correlation** indicates a **lower viability** which means that there is **greater effect** of the drug / **higher sensitivity** to the drug. +Metric used: `uM_viability` + Files shown for RNA: - new_figs/rna_most_efficacious.pdf - new_figs/rna_most_variable.pdf @@ -151,18 +144,18 @@ corr_global <- analyze_modality_correlations( file.copy("new_figs/cor_features_by_drug.pdf", "new_figs/global_cor_features_by_drug.pdf", overwrite = TRUE) ``` -For global proteomics, the features are proteins. In theory, these may be closer to the phenotype than RNA. - -Global correlation plot +For global proteomics, the features are proteins. In theory, these may be closer to the phenotype than RNA. +Global correlation plot + ```{r global_plots, echo=FALSE, message=FALSE, warning=FALSE, fig.height=6.5, fig.width=10, out.width="95%", cache=TRUE} show_plots(c( "new_figs/global_cor_features_by_drug.pdf" ), dpi = 220) ``` - -### Phospho Proteomics - + +### Phospho Proteomics + ```{r phospho, echo=TRUE, results='hide', message=FALSE, warning=FALSE, cache=TRUE} corr_phospho <- analyze_modality_correlations( df_long = plong, @@ -180,8 +173,8 @@ file.copy("new_figs/cor_features_by_drug.pdf", "new_figs/phospho_cor_features_by For phospho, the features are phosphorylation sites. It is normal for this to be noisier and higher-dimensional, but when a drug shows a cluster of correlated sites, it can be really informative for the mechanism - however, there is just one correlation here with 2 features. -Here is the phospho correlation plot - for some reason, this one doesn't always load, so I'm adding the table too. - +Here is the phospho correlation plot - for some reason, this one doesn't always knit correctly, so I'm adding the table too. + ```{r phospho_plots, echo=FALSE, message=FALSE, warning=FALSE, fig.height=6.5, fig.width=10, out.width="95%", cache=TRUE} show_plots(c( "new_figs/phospho_cor_features_by_drug.pdf" @@ -189,7 +182,7 @@ show_plots(c( ``` - + ```{r} corr_phospho$cor_summary ``` diff --git a/analysis/02_analyze_modality.html b/analysis/02_analyze_modality.html new file mode 100644 index 0000000..a976974 --- /dev/null +++ b/analysis/02_analyze_modality.html @@ -0,0 +1,598 @@ + + + + + + + + + + + + + + + +Analyze Modality Correlations + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
+

Purpose

+

This notebook is designed to take the drug viability data and the +long-format omics tables (RNA, global, and phospho), then compute and +plot correlations between drug response and molecular features. These +long-format omics tables are batch corrected when necessary and have +been returned from the Normalize Omics Notebook.

+

The analyze_modality will load the inputs from Synapse, +build the matrices, save the plots, and return the correlation results +so we can quickly inspect them. This markdown file also uses a +pdf_to_png_if_possible function to help display the plots +directly in the knitted markdown file.

+
+
+

Get Helper Scripts

+

These scripts contain the functions that we will call below.

+
    +
  • 00_cNF_helper_code.R is a basic helper script (Synapse helpers, +metadata, and shared utilities).
    +
  • +
  • 02_analyze_modality_correlations.R contains the code for building +matrices, plotting summaries, and computing correlations.
  • +
+
+
+

Load in Data

+

These are the four main inputs we need. Drug viability plus the three +long-format omics tables.

+
# Drugs
+drugs <- readr::read_tsv(synGet("syn69947322")$path)
+
+# RNA 
+rlong <- readr::read_csv(synGet("syn71333780")$path)
+
+# Global
+glong <- readr::read_csv(synGet("syn70078416")$path)
+
+# Phospho
+plong <- readr::read_csv(synGet("syn70078415")$path)
+
+
+

Run correlations per modality

+

Below, we run the same workflow three times, once per modality. The +only thing that changes is which omics table we use and which column +identifies the feature.

+

The RNA step calls analyze_modality() (end-to-end) +which:
+- Builds a sample x drug matrix and a sample x feature matrix.
+- Saves drug summary plots to the output directory (most_efficacious, +most_variable, and the heatmap).
+- Computes Spearman correlations between drug response and molecular +features.
+- Returns the results as a list so we can inspect them directly in +R.

+

For global and phospho, we call +analyze_modality_correlations() only, which:
+- Reuses the drug matrix from the RNA run.
+- Builds the sample x feature matrix for that modality.
+- Computes Spearman correlations between drug response and molecular +features.
+- Saves the correlation feature summary plot.

+

A few plot notes (these get written to outdir):
+- most_efficacious.pdf: drugs with the lowest mean +viability across samples. This is basically a “top hits” plot.
+- most_variable.pdf: drugs with the highest variability +across samples. These are the ones that may be more +sample-dependent.
+- drug_heatmap_large_viability.pdf: a large +sample-by-drug heatmap, but only for drugs measured in all samples. This +is useful for clustering and spotting outliers. Samples/patients are +separated by lines, within each are 1-3 tumors.
+- cor_features_by_drug.pdf: counts of significant +correlated features per drug (split by direction). This is a quick way +to see which drugs have real signal in this modality.

+

Note: most_efficacious, +most_variable, and +drug_heatmap_large_viability are based on the drug response +table, not the omics table.
+They get recreated on every end-to-end run, but they are the same plots. +Because of that, we run the end-to-end workflow once (RNA), and then for +global/phospho we only run the modality-only correlation step.

+

Note 2: The end-to-end analyze_modality +function may take a long time to run (~30-45 min).

+
+

RNA

+
corr_rna <- analyze_modality(
+  fits        = drugs,            
+  df_long     = rlong,           
+  sample_col  = "Specimen",
+  feature_col = "feature_id",
+  value_col   = "correctedAbundance",
+  metric      = "uM_viability",
+  heatmap_filename = "rna_drug_heatmap_large_viability.pdf",
+  outdir = "new_figs"
+)
+
+# Copy the shared filenames to RNA-specific names so later modalities don't overwrite what we want to show here
+dir.create("new_figs", showWarnings = FALSE, recursive = TRUE)
+file.copy("new_figs/most_efficacious.pdf", "new_figs/rna_most_efficacious.pdf", overwrite = TRUE)
+file.copy("new_figs/most_variable.pdf",   "new_figs/rna_most_variable.pdf",   overwrite = TRUE)
+file.copy("new_figs/cor_features_by_drug.pdf", "new_figs/rna_cor_features_by_drug.pdf", overwrite = TRUE)
+

For RNA, the “features” are the rows we test in the correlation step, +and they come directly from the feature_id column in the +RNA long table (rlong). In most cases +feature_id is a gene ID (or a transcript ID, depending on +how the RNA file was generated). The workflow builds a sample × feature +matrix from RNA, then correlates every RNA feature against drug +viability for each drug.

+

If a drug shows a large number of significant RNA correlations (after +FDR filtering), that is a strong sign the RNA signal is tracking +response for that drug. In other words, the transcriptome is separating +the sensitive vs resistant samples for that drug in a consistent +way.

+

Below are the plots I show right after running RNA. This is the only +place I show the drug-only plots, because those are based on the drug +response table and are essentially the same no matter which omics table +we are using.

+

Positive Correlation indicates a higher +viability which means that there is lower +effect of the drug / higher resistance to the +drug.

+

Negative Correlation indicates a lower +viability which means that there is greater +effect of the drug / higher sensitivity to the +drug.

+

Metric used: uM_viability

+

Files shown for RNA:
+- new_figs/rna_most_efficacious.pdf
+- new_figs/rna_most_variable.pdf
+- new_figs/rna_drug_heatmap_large_viability.pdf
+- new_figs/rna_cor_features_by_drug.pdf

+
## Converting page 1 to rna_most_efficacious_1.png... done!
+## Converting page 1 to rna_most_variable_1.png... done!
+## Converting page 1 to rna_drug_heatmap_large_viability_1.png... done!
+## Converting page 1 to rna_cor_features_by_drug_1.png... done!
+

+
+
+

Global Proteomics

+
corr_global <- analyze_modality_correlations(
+  df_long     = glong,           
+  sample_col  = "Specimen",
+  feature_col = "Gene",
+  value_col   = "correctedAbundance",
+  drug_mat    = corr_rna$drug_mat,
+  outdir      = "new_figs",
+  fdr_thresh  = 0.25
+)
+
+# Copy the shared filenames to global-specific names
+file.copy("new_figs/cor_features_by_drug.pdf", "new_figs/global_cor_features_by_drug.pdf", overwrite = TRUE)
+

For global proteomics, the features are proteins. In theory, these +may be closer to the phenotype than RNA.

+

Global correlation plot

+
## Converting page 1 to global_cor_features_by_drug_1.png... done!
+

+
+
+

Phospho Proteomics

+
corr_phospho <- analyze_modality_correlations(
+  df_long     = plong,           
+  sample_col  = "Specimen",
+  feature_col = "site",
+  value_col   = "correctedAbundance",
+  drug_mat    = corr_rna$drug_mat,
+  outdir      = "new_figs",
+  fdr_thresh  = 0.25
+)
+
+# Copy the shared filenames to phospho-specific names
+file.copy("new_figs/cor_features_by_drug.pdf", "new_figs/phospho_cor_features_by_drug.pdf", overwrite = TRUE)
+

For phospho, the features are phosphorylation sites. It is normal for +this to be noisier and higher-dimensional, but when a drug shows a +cluster of correlated sites, it can be really informative for the +mechanism - however, there is just one correlation here with 2 +features.

+

Here is the phospho correlation plot - for some reason, this one +doesn’t always knit correctly, so I’m adding the table too.

+
## Converting page 1 to phospho_cor_features_by_drug_1.png... done!
+

+
corr_phospho$cor_summary
+
## # A tibble: 5 × 4
+##   drug        direction features meanCor
+##   <chr>       <chr>        <int>   <dbl>
+## 1 Daporinad   pos              1   0.847
+## 2 ML 210      neg              1  -0.853
+## 3 NVP-BEZ235  pos              2   0.864
+## 4 Tofacitinib pos              1   0.868
+## 5 Tovorafenib pos              1   0.891
+
+
+ + + + +
+ + + + + + + + + + + + + + + diff --git a/analysis/03_pathway_enrichment.Rmd b/analysis/03_pathway_enrichment.Rmd index d866367..fad1c1a 100644 --- a/analysis/03_pathway_enrichment.Rmd +++ b/analysis/03_pathway_enrichment.Rmd @@ -1,8 +1,7 @@ --- title: "Pathway Enrichment" author: "Jeremy Jacobson" -date: "2025-11-12" -output: html_document +date: "`r format(Sys.Date(), '%B %d, %Y')`" --- ### Purpose @@ -37,8 +36,8 @@ knitr::opts_chunk$set(echo = TRUE) ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) -source("../00_cNF_helper_code.R") -source("../03_leapr_biomarker.R") +source("../source/00_cNF_helper_code.R") +source("../source/03_leapr_biomarker.R") ``` ## Load in Data @@ -61,7 +60,7 @@ plong <- readr::read_csv(synGet("syn70078415")$path) ## Run Pathway Enrichment for Each Omics Modality -Below, we run the same workflow three times, once per modality. The main things that change are: +Below, we run the same workflow three times, once per (omics) modality. The main things that change are: - which omics table we use - which column represents the feature - which geneset library we test (krbpaths vs kinasesubstrates) @@ -188,7 +187,7 @@ save_leapr_plots( ) ``` -Here are the phospho pathway plots +Here are the phospho pathway plots. For mirdametinib and olaparib, no significant pathways are seen for the phospho data using drug viability. ```{r phospho_plots, echo=FALSE, message=FALSE, warning=FALSE, fig.height=6.5, fig.width=10, out.width="50%", cache=TRUE} phospho_plot_paths <- c( @@ -200,7 +199,22 @@ phospho_plot_paths <- c( show_plots(phospho_plot_paths, dpi = 240) ``` - + +Overall we can see that the number of drugs with significant pathways vary significantly by omics type. RNA demonstrates 231, Global proteomics has 167, while phosphoproteomics only shows 44 with just 1-4 effected pathways per drug. + +```{r sig_pathways_three_plots_paged, echo=TRUE, message=FALSE, warning=FALSE, fig.width=16, fig.height=6} +alpha <- 0.05 #uses SignedBH_pvalue +page_size <- 85 #number of drugs per plot. If too many, the names will overlap. +plot_sig_pathways_one_omic_paged(res_bio_rna, "rna", alpha = alpha, page_size = page_size) +plot_sig_pathways_one_omic_paged(res_bio_global, "global", alpha = alpha, page_size = page_size) +plot_sig_pathways_one_omic_paged(res_bio_phospho,"phospho",alpha = alpha, page_size = page_size) + +``` + + + + + ## Notes - If you want to force a full rerun (ignore cache), set `always_rerun = TRUE`. diff --git a/analysis/03_pathway_enrichment.html b/analysis/03_pathway_enrichment.html new file mode 100644 index 0000000..4dabc37 --- /dev/null +++ b/analysis/03_pathway_enrichment.html @@ -0,0 +1,616 @@ + + + + + + + + + + + + + + + +Pathway Enrichment + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
+

Purpose

+

This script runs pathway enrichment using leapR, using the drug +viability data plus the long-format omics tables (RNA, global, and +phospho).
+These long-format omics tables are batch corrected when necessary and +have been returned from the Normalize Omics Notebook.

+

The main idea is:
+- Split samples into “sensitive” vs “resistant” for each drug (based on +viability).
+- Run enrichment on each side.
+- Save the enrichment tables and plots so we can quickly scan which +pathways show up for which drugs.

+

This file is written to be simple and repeatable. It loads the inputs +from Synapse, runs the same workflow per modality, and stores results in +a cache file so we do not re-run heavy work unless we need to.

+

Note: This will take a couple of hours to run the +first time. Do not overwrite the cache unless +completely necessary.

+
+
+

Get Helper Scripts

+

These scripts contain the functions that we will call below.

+
    +
  • 00_cNF_helper_code.R is a basic helper script (Synapse helpers, +metadata, and shared utilities).
    +
  • +
  • 03_leapr_biomarker.R contains the leapR wrapper functions, caching, +and plotting helpers.
  • +
+
+
+

Load in Data

+

These are the four main inputs we need - just like in +analyze_modality. Drug viability plus the three long-format omics +tables.

+
# Drugs
+drugs <- readr::read_tsv(synGet("syn69947322")$path)
+
+# RNA 
+rlong <- readr::read_csv(synGet("syn71333780")$path)
+
+# Global
+glong <- readr::read_csv(synGet("syn70078416")$path)
+
+# Phospho
+plong <- readr::read_csv(synGet("syn70078415")$path)
+
+
+

Run Pathway Enrichment for Each Omics Modality

+

Below, we run the same workflow three times, once per (omics) +modality. The main things that change are:
+- which omics table we use - which column represents the feature - which +geneset library we test (krbpaths vs kinasesubstrates)

+

Each call to run_leapr_directional_one_cached() does a +few things:
+- Splits samples into “sensitive” vs “resistant” for each drug (based on +viability).
+- Runs enrichment for each drug and direction.
+- Writes CSV outputs (if write_csvs = TRUE).
+- Saves an .Rdata cache so future runs are fast unless inputs or +settings change.

+

One note about plots: run_leapr_directional_one_cached() +computes the enrichment results, but it does not automatically save the +pathway barplots.
+Those are generated by save_leapr_plots(). In this Rmd, I +intentionally save plots for just mirda and +olaparib so we do not generate hundreds of drug PDFs by +accident.

+

So, If you want more plots, just add more drugs to the +drugs = c(...) list for each modality.

+
+

RNA (krbpaths).

+

For RNA, the features come from the feature_id column in +the RNA long table (rlong). In most cases, this is a gene +or transcript ID. Here we use the krbpaths gene set +collection.

+
res_bio_rna <- run_leapr_directional_one_cached(
+  drugs        = drugs,
+  df_long      = rlong,
+  sample_col   = "Specimen",
+  feature_col  = "feature_id",
+  value_col    = "correctedAbundance",
+  omic_label   = "rna",
+  cache_path   = "../leapR_RNA_krbpaths_enrichment_direction_split.Rdata",
+  write_csvs   = TRUE,
+  always_rerun = FALSE,
+  test_one     = FALSE,
+  geneset_name = "krbpaths"
+)
+
+# This is where pathway plot PDFs get created.
+# I keep the list short on purpose so the report stays readable.
+save_leapr_plots(
+  res_bio_rna,
+  omic_label = "rna",
+  drugs = c("Mirdametinib", "olaparib"),
+  top_n = 15
+)
+

Below are the RNA pathway plots created by +save_leapr_plots(). These files get written to +figs/.

+
## Converting page 1 to pathways_Mirdametinib_rna_resistant_top15_1.png... done!
+## Converting page 1 to pathways_Mirdametinib_rna_sensitive_top15_1.png... done!
+## Converting page 1 to pathways_olaparib_rna_resistant_top15_1.png... done!
+## Converting page 1 to pathways_olaparib_rna_sensitive_top15_1.png... done!
+

+
+
+

Global Proteomics (krbpaths)

+

For global proteomics, the features are proteins in the +Gene column. We keep the same krbpaths gene +set collection so RNA and global are easier to compare.

+
res_bio_global <- run_leapr_directional_one_cached(
+  drugs        = drugs,
+  df_long      = glong,
+  sample_col   = "Specimen",
+  feature_col  = "Gene",
+  value_col    = "correctedAbundance",
+  omic_label   = "global",
+  cache_path   = "../leapR_Global_krbpaths_enrichment_direction_split.Rdata",
+  write_csvs   = TRUE,
+  always_rerun = FALSE,
+  test_one     = FALSE,
+  geneset_name = "krbpaths"
+)
+
+save_leapr_plots(
+  res_bio_global,
+  omic_label = "global",
+  drugs = c("Mirdametinib", "olaparib"),
+  top_n = 15
+)
+

Here are the global pathway plots. Same idea as RNA, just using +protein abundance instead of RNA abundance.

+
## Converting page 1 to pathways_Mirdametinib_global_resistant_top15_1.png... done!
+## Converting page 1 to pathways_Mirdametinib_global_sensitive_top15_1.png... done!
+## Converting page 1 to pathways_olaparib_global_resistant_top15_1.png... done!
+## Converting page 1 to pathways_olaparib_global_sensitive_top15_1.png... done!
+

+
+
+

Phospho Proteomics (kinasesubstrates)

+

For phospho, the features are phosphorylation sites in the +site column. Here we use a phospho-specific gene set +library, kinasesubstrates, because site-level maps more +cleanly to kinase activity than to general pathway collections.

+
res_bio_phospho <- run_leapr_directional_one_cached(
+  drugs        = drugs,
+  df_long      = plong,
+  sample_col   = "Specimen",
+  feature_col  = "site",
+  value_col    = "correctedAbundance",
+  omic_label   = "phospho",
+  cache_path   = "../leapR_Phospho_kinasesubstrates_enrichment_direction_split.Rdata",
+  write_csvs   = TRUE,
+  always_rerun = FALSE,
+  test_one     = FALSE,
+  geneset_name = "kinasesubstrates"
+)
+
+save_leapr_plots(
+  res_bio_phospho,
+  omic_label = "phospho",
+  drugs = c("Mirdametinib", "olaparib"),
+  top_n = 15
+)
+

Here are the phospho pathway plots. For mirdametinib and olaparib, no +significant pathways are seen for the phospho data using drug +viability.

+
## Converting page 1 to pathways_Mirdametinib_phospho_resistant_top15_1.png... done!
+## Converting page 1 to pathways_Mirdametinib_phospho_sensitive_top15_1.png... done!
+## Converting page 1 to pathways_olaparib_phospho_resistant_top15_1.png... done!
+## Converting page 1 to pathways_olaparib_phospho_sensitive_top15_1.png... done!
+

+

Overall we can see that the number of drugs with significant pathways +vary significantly by omics type. RNA demonstrates 231, Global +proteomics has 167, while phosphoproteomics only shows 44 with just 1-4 +effected pathways per drug.

+
alpha <- 0.05  #uses SignedBH_pvalue
+page_size <- 85 #number of drugs per plot. If too many, the names will overlap.
+plot_sig_pathways_one_omic_paged(res_bio_rna,    "rna",    alpha = alpha, page_size = page_size)
+

+
plot_sig_pathways_one_omic_paged(res_bio_global, "global", alpha = alpha, page_size = page_size)
+

+
plot_sig_pathways_one_omic_paged(res_bio_phospho,"phospho",alpha = alpha, page_size = page_size)
+

+
+
+
+

Notes

+
    +
  • If you want to force a full rerun (ignore cache), set +always_rerun = TRUE.
    +
  • +
  • If you only want to test quickly on one drug, set +test_one = TRUE (useful for debugging).
    +
  • +
  • If you want plots for more drugs, add them to the +drugs = c(...) list in each save_leapr_plots() +call.
    +
  • +
  • All pathway plot files are saved under figs/ with names +like: +
      +
    • figs/pathways_{resistant|sensitive}_top.pdf.
    • +
  • +
+
+ + + + +
+ + + + + + + + + + + + + + + diff --git a/analysis/README.md b/analysis/README.md index d18082a..b268a43 100644 --- a/analysis/README.md +++ b/analysis/README.md @@ -1,18 +1,19 @@ # cNF multi-omics analysis pipeline overview -Purpose: Reference for running the cNF analysis end-to-end and what each stage produces. +Purpose: cNF batch merging with optional batch correction, drug/omics correlation assessment, pathway enrichment analysis The notebooks (`.Rmd`) are the primary analysis entry points; the `.R` scripts are sourced helpers. +The `.html` files result from the latest run/knit of the R Markdown files. ## Quick run order (and what each file sources) -1) **01_run_normalize_omics.Rmd** - - *Sources:* `../00_cNF_helper_code.R`, `../01_normalize_batchcorrect_omics.R` +1) **[01_run_normalize_omics.Rmd](01_run_normalize_omics.Rmd)** + - *Sources:* [../source/00_cNF_helper_code.R](../source/00_cNF_helper_code.R), [../source/01_normalize_batchcorrect_omics.R](../source/01_normalize_batchcorrect_omics.R) - *Goal:* Per-modality preprocessing/normalization and batch correction (when needed). - *Outputs (examples):* - Batch-corrected long tables written to synapse. - PCA/QC plots for before and after ComBat batch correction (e.g., `globalCorrectedPCA.pdf`, `phosphoCorrectedPCA.pdf`) -2) **02_analyze_modality.Rmd** - - *Sources:* `../00_cNF_helper_code.R`, `../02_analyze_modality_correlations.R` +2) **[02_analyze_modality.Rmd](02_analyze_modality.Rmd)** + - *Sources:* [../source/00_cNF_helper_code.R](../source/00_cNF_helper_code.R), [../source/02_analyze_modality_correlations.R](../source/02_analyze_modality_correlations.R) - *Goal:* Builds the drug response matrix once, then runs per-modality correlations using the long-format omics tables. - *Outputs (written to `outdir`, examples):* - Drug-only plots (written once, based on drug response table): @@ -25,15 +26,15 @@ The notebooks (`.Rmd`) are the primary analysis entry points; the `.R` scripts a - RNA typically runs the end-to-end wrapper (`analyze_modality()`), producing drug-only + modality outputs. - Global/phospho typically run modality-only (`analyze_modality_correlations()`), reusing the RNA-derived `drug_mat`. -3) **03_pathway_enrichment.Rmd** - - *Sources:* `../00_cNF_helper_code.R`, `../03_leapr_biomarker.R` +3) **[03_pathway_enrichment.Rmd](03_pathway_enrichment.Rmd)** + - *Sources:* [../source/00_cNF_helper_code.R](../source/00_cNF_helper_code.R), [../source/03_leapr_biomarker.R](../source/03_leapr_biomarker.R) - *Goal:* Direction-aware pathway enrichment (leapR) using correlated features (resistant & sensitive). - *Outputs (examples):* - Per-drug pathway barplots (e.g., `pathways____top15.pdf`) - Summary plots across drugs (e.g., top recurrent pathways) ## Helper scripts (sourced by notebooks) -- **00_cNF_helper_code.R**: Shared utilities (Synapse helpers, plotting helpers, common metadata). -- **01_normalize_batchcorrect_omics.R**: Normalization / Joining Batches / ComBat batch correction code. -- **02_analyze_modality_correlations.R**: Drug summary plots, correlations (includes separated drug-only + modality-only functions). -- **03_leapr_biomarker.R**: Directional feature ranking + leapR enrichment + pathway plotting. +- **[00_cNF_helper_code.R](../source/00_cNF_helper_code.R)**: Shared utilities (Synapse helpers, plotting helpers, common metadata). +- **[01_normalize_batchcorrect_omics.R](../source/01_normalize_batchcorrect_omics.R)**: Normalization / Joining Batches / ComBat batch correction code. +- **[02_analyze_modality_correlations.R](../source/02_analyze_modality_correlations.R)**: Drug summary plots, correlations (includes separated drug-only + modality-only functions). +- **[03_leapr_biomarker.R](../source/03_leapr_biomarker.R)**: Directional feature ranking + leapR enrichment + pathway plotting. diff --git a/00_cNF_helper_code.R b/source/00_cNF_helper_code.R similarity index 74% rename from 00_cNF_helper_code.R rename to source/00_cNF_helper_code.R index 388c35a..ecea8a4 100644 --- a/00_cNF_helper_code.R +++ b/source/00_cNF_helper_code.R @@ -1,9 +1,9 @@ -#cNF_helper_code.R +#00_cNF_helper_code.R ##standard metadata across all cNFs, including colors if possible library(synapser) -synLogin(authToken = "eyJ0eXAiOiJKV1QiLCJraWQiOiJXN05OOldMSlQ6SjVSSzpMN1RMOlQ3TDc6M1ZYNjpKRU9VOjY0NFI6VTNJWDo1S1oyOjdaQ0s6RlBUSCIsImFsZyI6IlJTMjU2In0.eyJhY2Nlc3MiOnsic2NvcGUiOlsidmlldyIsImRvd25sb2FkIl0sIm9pZGNfY2xhaW1zIjp7fX0sInRva2VuX3R5cGUiOiJQRVJTT05BTF9BQ0NFU1NfVE9LRU4iLCJpc3MiOiJodHRwczovL3JlcG8tcHJvZC5wcm9kLnNhZ2ViYXNlLm9yZy9hdXRoL3YxIiwiYXVkIjoiMCIsIm5iZiI6MTc2MTgzOTMwNCwiaWF0IjoxNzYxODM5MzA0LCJqdGkiOiIyNzg5OSIsInN1YiI6IjM0NTM5NTUifQ.OkzJaWKY7po87mP_iKDNP8ah--BsZpb-mH8U1wP2kj2yqnxi65yi1irAgNNITXB-mQ84ndpdV-9SfNCpkc-pUIenHADpLTMPbh2dMLAiCS77picR-bLlb79c7e005L-7EWuMBPh7RXAXFOcEwY3823Z-RsjKatGYZXCiGaC_I0OPuawh1c3y_c2VVsZUFMdfXJaQH4iaMB5wK9OA_OlI_mnwaYrL1C3ID6ZCuGdO8cR9-fwgu5NzZPq5OOyn11BSblb7NytwaEhhuteHediXBnhGSmfSCUCpQvdpppb03_-Q8DK21oyAgUSygn_h7QFIqtdfzyb8mg300D49al-yAg") +synLogin() syn <- list(get = synapser::synGet, store = synapser::synStore) library(readxl) library(tidyr) diff --git a/01_normalize_batchcorrect_omics.R b/source/01_normalize_batchcorrect_omics.R similarity index 98% rename from 01_normalize_batchcorrect_omics.R rename to source/01_normalize_batchcorrect_omics.R index d263a21..f33c04e 100644 --- a/01_normalize_batchcorrect_omics.R +++ b/source/01_normalize_batchcorrect_omics.R @@ -57,7 +57,7 @@ modified_zscore <- function(x, na.rm = TRUE) { m <- suppressWarnings(stats::median(x, na.rm = na.rm)) md <- suppressWarnings(stats::mad(x, constant = 1, na.rm = na.rm)) if (is.na(md) || md == 0) return(rep(0, length(x))) - 0.6745 * (x - m) / md + 0.6745 * (x - m) / md # used to convert mean absolute deviation to sd } filter_by_missingness <- function(mat) { @@ -118,6 +118,9 @@ make_dropper <- function(substrings) { # x: character vector of file paths # Output: # character vector of basenames + +# NOTE: This is helpful because some "sample columns" are full Windows paths. + basename_only <- function(x) sub("^.*[\\\\/]", "", x) basename_no_ext <- function(x) sub("\\.[^.]+$", "", basename_only(x)) @@ -128,6 +131,10 @@ normalize_specimen_like <- function(x) { # x: character vector # Output: # character vector of normalized specimen-like strings + + # This normalization exists because different sources encode specimen IDs + # in slightly different ways (e.g., "MN-2_T1_organoid" vs "MN.2 T1 Organoids"). + # The goal is to make joins resilient to punctuation/spacing differences. y <- tolower(x) y <- gsub("\\s+", "", y) y <- gsub("\\.", "-", y) @@ -149,6 +156,8 @@ parse_rna_header_triplet <- function(fnames) { # fnames: character vector of RNA sample column names # Output: # data.frame with columns: fname, sample_id, tumor, condition_raw, condition_norm, specimen_norm + + #RNA headers don't have aliquot (unlike global and phospho) toks_list <- strsplit(fnames, "\\.") out <- lapply(seq_along(toks_list), function(i) { toks <- toks_list[[i]] @@ -372,7 +381,7 @@ make_se <- function(wide_df, value_start_col, feature_ids, fnames_df, meta, drop # Output: # SummarizedExperiment with assay "values" and populated colData/rowData all_candidate <- colnames(wide_df)[value_start_col:ncol(wide_df)] - has_pathy <- any(looks_like_sample_header(all_candidate)) + has_pathy <- any(looks_like_sample_header(all_candidate)) # Use only path-like columns as sample measurements. if (has_pathy) { sample_cols <- all_candidate[looks_like_sample_header(all_candidate)] sample_cols <- setdiff(sample_cols, c("Site", "Sequence")) @@ -385,6 +394,7 @@ make_se <- function(wide_df, value_start_col, feature_ids, fnames_df, meta, drop message(" Casting measurement block to numeric") raw_block <- wide_df[, sample_cols, drop = FALSE] + # Clean vals, coerce to numeric, keep original names. clean_block <- as.data.frame( lapply(raw_block, function(col) { if (is.factor(col)) col <- as.character(col) @@ -393,7 +403,7 @@ make_se <- function(wide_df, value_start_col, feature_ids, fnames_df, meta, drop }), check.names = FALSE ) - + # Drop QC/blank samples by regex name patterns. keep <- !drop_name(colnames(clean_block)) if (any(!keep)) { message(" Dropping unwanted sample columns by pattern: ", sum(!keep)) @@ -404,7 +414,7 @@ make_se <- function(wide_df, value_start_col, feature_ids, fnames_df, meta, drop mat <- as.matrix(clean_block) rownames(mat) <- feature_ids colnames(mat) <- sample_cols - + # Sync metadata rows to kept sample columns. fnames_df <- fnames_df %>% dplyr::semi_join(data.frame(fname = sample_cols), by = "fname") parsed_df <- if (tolower(modality) == "rna") parse_rna_header_triplet(fnames_df$fname) else @@ -560,6 +570,7 @@ combine_batches_intersection <- function(se_list) { # SummarizedExperiment containing combined matrix and combined colData message("Combining batches (intersection of features, then cbind samples)") mats <- lapply(se_list, function(se) as.matrix(SummarizedExperiment::assay(se, "values"))) + # Keep shared features across batches for cbind. feats <- Reduce(intersect, lapply(mats, rownames)) feats <- feats[!is.na(feats) & feats != ""] message(" - Intersection feature count: ", length(feats)) @@ -607,6 +618,7 @@ combat_by_cohort <- function(se) { mat[!is.finite(mat)] <- 0 cd <- as.data.frame(SummarizedExperiment::colData(se)) + # Align colData order to matrix column order. cd <- cd[colnames(mat), , drop = FALSE] if (!"cohort" %in% names(cd)) stop("colData must contain 'cohort' for ComBat batching.") diff --git a/02_analyze_modality_correlations.R b/source/02_analyze_modality_correlations.R similarity index 97% rename from 02_analyze_modality_correlations.R rename to source/02_analyze_modality_correlations.R index 893236c..b219816 100644 --- a/02_analyze_modality_correlations.R +++ b/source/02_analyze_modality_correlations.R @@ -89,8 +89,8 @@ make_feature_matrix <- function(df_long, shared_ids, sample_col, feature_col, va tidyr::pivot_wider( names_from = all_of(feature_col), values_from = all_of(value_col), - values_fill = 0, - values_fn = mean + values_fill = 0, # Missing sample-feature pairs become zero for matrix. + values_fn = mean # Average duplicates per sample-feature after pivot. ) %>% as.data.frame(check.names = FALSE) @@ -138,7 +138,7 @@ make_drug_matrix <- function( tidyr::pivot_wider( names_from = all_of(drug_col), values_from = all_of(value_col), - values_fn = mean + values_fn = mean # Average replicate drug responses per sample. ) %>% tibble::column_to_rownames(sample_col) } @@ -223,7 +223,7 @@ compute_cors <- function(drug_mat, feat_mat, shared_samples = NULL) { drug_mat <- drug_mat[shared_samples, , drop = FALSE] feat_mat <- feat_mat[shared_samples, , drop = FALSE] - cres <- suppressWarnings( + cres <- suppressWarnings( # Fast Spearman matrix using pairwise complete observations. stats::cor(drug_mat, feat_mat, use = "pairwise.complete.obs", method = "spearman") ) %>% as.data.frame() %>% @@ -234,7 +234,7 @@ compute_cors <- function(drug_mat, feat_mat, shared_samples = NULL) { do.call(rbind, lapply(colnames(feat_mat), function(f) { dv <- drug_mat[, d]; fv <- feat_mat[, f] p <- NA_real_ - if (sum(is.finite(dv) & is.finite(fv)) >= 3) { + if (sum(is.finite(dv) & is.finite(fv)) >= 3) { # Require ≥3 finite pairs for cor.test p-value. p <- tryCatch( stats::cor.test(dv, fv, method = "spearman", use = "pairwise.complete.obs")$p.value, error = function(e) NA_real_ @@ -328,7 +328,7 @@ analyze_drug_response <- function( # Heatmap (by default) if (!is.null(heatmap_filename) && nrow(drug_mat) > 0 && ncol(drug_mat) > 0) { - fulldrugs <- dsum$summary %>% + fulldrugs <- dsum$summary %>% # Heatmap for only drugs measured in all samples. dplyr::filter(.data$nMeasured == nrow(drug_mat)) %>% pull(.data$improve_drug_id) diff --git a/03_leapr_biomarker.R b/source/03_leapr_biomarker.R similarity index 84% rename from 03_leapr_biomarker.R rename to source/03_leapr_biomarker.R index 32c81e4..a456812 100644 --- a/03_leapr_biomarker.R +++ b/source/03_leapr_biomarker.R @@ -94,8 +94,8 @@ long_to_matrix <- function(df_long, sample_col, feature_col, value_col) { tidyr::pivot_wider( names_from = !!rlang::sym(feature_col), values_from = !!rlang::sym(value_col), - values_fill = 0, - values_fn = mean + values_fill = 0, # Missing sample-feature pairs become zero after pivot. + values_fn = mean # Average duplicates per sample-feature before matrix conversion. ) %>% as.data.frame(check.names = FALSE) @@ -189,7 +189,7 @@ long_to_matrix <- function(df_long, sample_col, feature_col, value_col) { if (is.null(map_site2gene) || !length(cor_named_vec)) return(cor_named_vec) # Align and drop unmapped - genes <- map_site2gene[names(cor_named_vec)] + genes <- map_site2gene[names(cor_named_vec)] # Map each site ID to its gene symbol. keep <- !is.na(genes) & genes != "" v <- cor_named_vec[keep] g <- genes[keep] @@ -325,6 +325,10 @@ long_to_matrix <- function(df_long, sample_col, feature_col, value_col) { # Main # ----------------------------- run_leapr_directional_one_cached <- function( + + # Future Consideration: + # Consider using synapse to store results + # # For each drug, correlate uM_viability with each omics feature across samples, split features # into TOP (positive; more resistant) and BOTTOM (negative; more sensitive), then run leapR # enrichment separately on each direction. Supports phospho-specific site->gene handling, @@ -362,7 +366,7 @@ run_leapr_directional_one_cached <- function( # cache check! If the cached value exists, stop there. if (!always_rerun && is.character(cache_path) && nzchar(cache_path) && file.exists(cache_path)) { - load(cache_path) # loads res_list + load(cache_path) # Skip recompute by loading cached res_list from disk. if (exists("res_list")) return(res_list) } @@ -517,7 +521,7 @@ run_leapr_directional_one_cached <- function( # BOTTOM (sensitive) if (length(neg) >= min_features) { - # flip sign so strong negatives rank to top + # Flip sign so strongest negatives rank highest. neg_flip <- -neg feats_bot <- names(neg_flip) se_bot <- .build_se_from_corvec( @@ -652,3 +656,138 @@ save_leapr_plots <- function( invisible(NULL) } + + + +#Extra plotting functions +count_sig_pathways <- function(res_list, omic_label, alpha = 0.05, pcol = "SignedBH_pvalue") { + if (is.null(res_list) || !length(res_list)) return(tibble()) + + drugs <- names(res_list) + rows <- lapply(drugs, function(drug) { + two <- res_list[[drug]] + if (is.null(two)) return(NULL) + + one_dir <- function(tbl, direction_label) { + if (is.null(tbl) || !nrow(tbl)) { + return(tibble( + omic_label = omic_label, drug = drug, direction = direction_label, + n_sig = 0L, n_total = 0L + )) + } + + df <- as.data.frame(tbl) + + # No fallback: require SignedBH_pvalue + if (!(pcol %in% colnames(df))) { + stop(sprintf( + "[count_sig_pathways] Column '%s' not found for drug='%s', omic='%s', direction='%s'. Columns: %s", + pcol, drug, omic_label, direction_label, paste(colnames(df), collapse = ", ") + )) + } + + pv <- df[[pcol]] + + # Robust whether SignedBH_pvalue is signed or not + tibble( + omic_label = omic_label, + drug = drug, + direction = direction_label, + n_sig = sum(!is.na(pv) & abs(pv) < alpha), + n_total = nrow(df) + ) + } + + bind_rows( + one_dir(two$top, "Resistant (TOP)"), + one_dir(two$bottom, "Sensitive (BOTTOM)") + ) + }) + + bind_rows(rows) +} + + + + + +plot_sig_pathways_one_omic_paged <- function(res_list, omic_label, alpha = 0.05, page_size = 80) { + # Plot (paged) bar charts of the number of significant enriched pathways per drug for a + # single omics modality. Uses a fixed y-axis across all pages, orders drugs globally by + # total significant pathways (desc), and splits drugs into pages to keep plots readable. + # Each page is printed to the current graphics device. + # Inputs: + # res_list: named list of per-drug enrichment results (typically output from leapR runs); + # names(res_list) should be drug names/IDs, and each entry should contain TOP/BOTTOM + # enrichment tables used by count_sig_pathways() + # omic_label: character scalar label for the modality (e.g., "rna", "global", "phospho") + # alpha: numeric significance threshold applied to pcol (default 0.05) + # page_size: integer number of drugs to include per page (default 80) + # Output: + # invisibly returns NULL; produces ggplot2 pages via print(p). If no significant pathways + # are found, prints a message and returns invisibly. + df <- count_sig_pathways(res_list, omic_label, alpha = alpha, pcol = "SignedBH_pvalue") + if (!nrow(df)) return(invisible(NULL)) + + # total sig per drug and drop zeros + totals <- df %>% + group_by(drug) %>% + summarise(sig_total = sum(n_sig, na.rm = TRUE), .groups = "drop") %>% + filter(sig_total > 0) %>% + arrange(desc(sig_total), drug) + + if (!nrow(totals)) { + message("[plot_sig_pathways_one_omic_paged] No drugs with significant pathways for omic='", omic_label, "'.") + return(invisible(NULL)) + } + + # Global ordering + paging + totals <- totals %>% + mutate(global_rank = row_number(), + page = ceiling(global_rank / page_size)) + + df2 <- df %>% + inner_join(totals, by = "drug") + + # Fix y-axis across ALL pages for this omic + ymax <- max(df2$n_sig, na.rm = TRUE) + if (!is.finite(ymax) || ymax < 1) ymax <- 1 + + # Fixed drug order across all pages + ordered_drugs <- totals$drug + n_pages <- max(totals$page, na.rm = TRUE) + + for (pg in seq_len(n_pages)) { + page_drugs <- totals %>% filter(page == pg) %>% pull(drug) + + dpg <- df2 %>% + filter(page == pg) %>% + mutate(drug = factor(drug, levels = page_drugs)) # keep global order within page + + p <- ggplot(dpg, aes(x = drug, y = n_sig, fill = direction)) + + geom_col(position = position_dodge(width = 0.85)) + + scale_y_continuous( + limits = c(0, ymax), + breaks = scales::pretty_breaks(n = 6) + ) + + labs( + title = paste0( + toupper(omic_label), + ": Significant enriched pathways per drug (|SignedBH_pvalue| < ", alpha, ")", + " - page ", pg, " of ", n_pages, " (", length(page_drugs), " drugs)" + ), + x = NULL, + y = "Number of significant pathways", + fill = NULL + ) + + theme_bw() + + theme( + axis.text.x = element_text(angle = 60, hjust = 1, vjust = 1), + legend.position = "right" + ) + + print(p) + } + + invisible(NULL) # This prevents NULL from appearing in the knitted resutls +}