The data were first presented in this publication:
Djomehri, S.I., Gonzalez, M.E., da Veiga Leprevost, F., Tekula, S.R., Chang, H.Y., White, M.J., Cimino-Mathews, A., Burman, B., Basrur, V., Argani, P. and Nesvizhskii, A.I., 2020. Quantitative proteomic landscape of metaplastic breast carcinoma pathological subtypes and their relationship to triple-negative tumors. Nature communications, 11(1), pp.1-15.
The IRS adjusted reporter ion intensities from the PAW pipeline processsing were analyzed with Jupyter notebooks, an R kernel, the Bioconductor package edgeR, and its TMM normalization method. This notebook format allows rich data visualization and quality control checks.
# library imports
library(tidyverse)
library(scales)
library(limma)
library(edgeR)
library(psych)
# ============== CV function ===================================================
CV <- function(df) {
# Computes CVs of data frame rows
# df - data frame,
# returns vector of CVs (%)
ave <- rowMeans(df) # compute averages
sd <- apply(df, 1, sd) # compute standard deviations
cv <- 100 * sd / ave # compute CVs in percent (last thing gets returned)
}
# =========== Boxplot with median label ========================================
labeled_boxplot <- function(df, ylim, title) {
# Makes a box plot with the median value labeled
# df - data frame with data to compute CVs of
# ylim - upper limit for y-axis
# title - plot title
cv = CV(df)
boxplot(cv, ylim = c(0, ylim), notch = TRUE, main = title)
text(x = 0.65, y = boxplot.stats(cv)$stats[3],
labels = round(boxplot.stats(cv)$stats[3], 1))
}
# ================== TMM normalization from DGEList object =====================
apply_tmm_factors <- function(y, color = NULL, plot = TRUE) {
# computes the tmm normalized data from the DGEList object
# y - DGEList object
# returns a dataframe with normalized intensities
# compute and print "Sample loading" normalization factors
lib_facs <- mean(y$samples$lib.size) / y$samples$lib.size
cat("\nLibrary size factors:\n",
sprintf("%-5s -> %f\n", colnames(y$counts), lib_facs))
# compute and print TMM normalization factors
tmm_facs <- 1/y$samples$norm.factors
cat("\nTrimmed mean of M-values (TMM) factors:\n",
sprintf("%-5s -> %f\n", colnames(y$counts), tmm_facs))
# compute and print the final correction factors
norm_facs <- lib_facs * tmm_facs
cat("\nCombined (lib size and TMM) normalization factors:\n",
sprintf("%-5s -> %f\n", colnames(y$counts), norm_facs))
# compute the normalized data as a new data frame
tmt_tmm <- as.data.frame(sweep(y$counts, 2, norm_facs, FUN = "*"))
colnames(tmt_tmm) <- str_c(colnames(y$counts), "_tmm")
# visualize results and return data frame
if(plot == TRUE) {
boxplot(log10(tmt_tmm), col = color, notch = TRUE, main = "TMM Normalized data")
}
tmt_tmm
}
# ================= reformat edgeR test results ================================
collect_results <- function(df, tt, x, xlab, y, ylab) {
# Computes new columns and extracts some columns to make results frame
# df - data in data.frame
# tt - top tags table from edgeR test
# x - columns for first condition
# xlab - label for x
# y - columns for second condition
# ylab - label for y
# returns a new dataframe
# condition average vectors
ave_x <- rowMeans(df[x])
ave_y <- rowMeans(df[y])
# FC, direction, candidates
fc <- ifelse(ave_y > ave_x, (ave_y / ave_x), (-1 * ave_x / ave_y))
direction <- ifelse(ave_y > ave_x, "up", "down")
candidate <- cut(tt$FDR, breaks = c(-Inf, 0.01, 0.05, 0.10, 1.0),
labels = c("high", "med", "low", "no"))
# make data frame
temp <- cbind(df[c(x, y)], data.frame(logFC = tt$logFC, FC = fc,
PValue = tt$PValue, FDR = tt$FDR,
ave_x = ave_x, ave_y = ave_y,
direction = direction, candidate = candidate,
Acc = tt$genes))
# fix column headers for averages
names(temp)[names(temp) %in% c("ave_x", "ave_y")] <- str_c("ave_", c(xlab, ylab))
temp # return the data frame
}
# =============== p-value plots ================================================
pvalue_plots <- function(results, ylim, title) {
# Makes p-value distribution plots
# results - results data frame
# ylim - ymax for expanded view
# title - plot title
p_plot <- ggplot(results, aes(PValue)) +
geom_histogram(bins = 100, fill = "white", color = "black") +
geom_hline(yintercept = mean(hist(results$PValue, breaks = 100,
plot = FALSE)$counts[26:100]))
# we will need an expanded plot
p1 <- p_plot + ggtitle(str_c(title, " p-value distribution"))
p2 <- p_plot + coord_cartesian(xlim = c(0, 1.0), ylim = c(0, ylim)) + ggtitle("p-values expanded")
grid.arrange(p1, p2, nrow = 2) # from gridExtra package
}
# ============= log2 fold-change distributions =================================
log2FC_plots <- function(results, range, title) {
# Makes faceted log2FC plots by candidate
# results - results data frame
# range - plus/minus log2 x-axis limits
# title - plot title
ggplot(results, aes(x = logFC, fill = candidate)) +
geom_histogram(binwidth=0.1, color = "black") +
facet_wrap(~candidate) +
ggtitle(title) +
coord_cartesian(xlim = c(-range, range))
}
# ========== Setup for MA and volcano plots ====================================
transform <- function(results, x, y) {
# Make data frame with some transformed columns
# results - results data frame
# x - columns for x condition
# y - columns for y condition
# return new data frame
df <- data.frame(log10((results[x] + results[y])/2),
log2(results[y] / results[x]),
results$candidate,
-log10(results$FDR))
colnames(df) <- c("A", "M", "candidate", "P")
df # return the data frame
}
# ========== MA plots using ggplot =============================================
MA_plots <- function(results, x, y, title) {
# makes MA-plot DE candidate ggplots
# results - data frame with edgeR results and some condition average columns
# x - string for x-axis column
# y - string for y-axis column
# title - title string to use in plots
# returns a list of plots
# uses transformed data
temp <- transform(results, x, y)
# 2-fold change lines
ma_lines <- list(geom_hline(yintercept = 0.0, color = "black"),
geom_hline(yintercept = 1.0, color = "black", linetype = "dotted"),
geom_hline(yintercept = -1.0, color = "black", linetype = "dotted"))
# make main MA plot
ma <- ggplot(temp, aes(x = A, y = M)) +
geom_point(aes(color = candidate, shape = candidate)) +
scale_y_continuous(paste0("logFC (", y, "/", x, ")")) +
scale_x_continuous("Ave_intensity") +
ggtitle(title) +
ma_lines
# make separate MA plots
ma_facet <- ggplot(temp, aes(x = A, y = M)) +
geom_point(aes(color = candidate, shape = candidate)) +
scale_y_continuous(paste0("log2 FC (", y, "/", x, ")")) +
scale_x_continuous("log10 Ave_intensity") +
ma_lines +
facet_wrap(~ candidate) +
ggtitle(str_c(title, " (separated)"))
# make the plots visible
print(ma)
print(ma_facet)
}
# ========== Scatter plots using ggplot ========================================
scatter_plots <- function(results, x, y, title) {
# makes scatter-plot DE candidate ggplots
# results - data frame with edgeR results and some condition average columns
# x - string for x-axis column
# y - string for y-axis column
# title - title string to use in plots
# returns a list of plots
# 2-fold change lines
scatter_lines <- list(geom_abline(intercept = 0.0, slope = 1.0, color = "black"),
geom_abline(intercept = 0.301, slope = 1.0, color = "black", linetype = "dotted"),
geom_abline(intercept = -0.301, slope = 1.0, color = "black", linetype = "dotted"),
scale_y_log10(),
scale_x_log10())
# make main scatter plot
scatter <- ggplot(results, aes_string(x, y)) +
geom_point(aes(color = candidate, shape = candidate)) +
ggtitle(title) +
scatter_lines
# make separate scatter plots
scatter_facet <- ggplot(results, aes_string(x, y)) +
geom_point(aes(color = candidate, shape = candidate)) +
scatter_lines +
facet_wrap(~ candidate) +
ggtitle(str_c(title, " (separated)"))
# make the plots visible
print(scatter)
print(scatter_facet)
}
# ========== Volcano plots using ggplot ========================================
volcano_plot <- function(results, x, y, title) {
# makes a volcano plot
# results - a data frame with edgeR results
# x - string for the x-axis column
# y - string for y-axis column
# title - plot title string
# uses transformed data
temp <- transform(results, x, y)
# build the plot
ggplot(temp, aes(x = M, y = P)) +
geom_point(aes(color = candidate, shape = candidate)) +
xlab("log2 FC") +
ylab("-log10 FDR") +
ggtitle(str_c(title, " Volcano Plot"))
}
# ============== individual protein expression plots ===========================
# function to extract the identifier part of the accesssion
get_identifier <- function(accession) {
identifier <- str_split(accession, "\\|", simplify = TRUE)
identifier[,3]
}
set_plot_dimensions <- function(width_choice, height_choice) {
options(repr.plot.width=width_choice, repr.plot.height=height_choice)
}
plot_top_tags <- function(results, nleft, nright, top_tags) {
# results should have data first, then test results (two condition summary table)
# nleft, nright are number of data points in each condition
# top_tags is number of up and number of down top DE candidates to plot
# get top ipregulated
up <- results %>%
filter(logFC >= 0) %>%
arrange(FDR)
up <- up[1:top_tags, ]
# get top down regulated
down <- results %>%
filter(logFC < 0) %>%
arrange(FDR)
down <- down[1:top_tags, ]
# pack them
proteins <- rbind(up, down)
color = c(rep("red", nleft), rep("blue", nright))
for (row_num in 1:nrow(proteins)) {
row <- proteins[row_num, ]
vec <- as.vector(unlist(row[1:(nleft + nright)]))
names(vec) <- colnames(row[1:(nleft + nright)])
title <- str_c(get_identifier(row$Acc), ", int: ", scientific(mean(vec), 2),
", FDR: ", scientific(row$FDR, digits = 3),
", FC: ", round(row$FC, digits = 1),
", ", row$candidate)
barplot(vec, col = color, main = title,
cex.main = 1.0, cex.names = 0.7, cex.lab = 0.7)
}
}
The pandas Python script that does the IRS normalization arranges the tab-delimited table so that importing into R is relatively straightforward. Note than this experimental design has only a single reference channel per plex instead of two.
We need to drop contaminant and decoy proteins, and proteins with missing sets of reporter ions. We extract the accessions column, the SL and IRS normed data columns, and collect channels by biological condition. We also grab the reference channels to check the IRS procedure.
# load the IRS-normalized data and check the table
data_import <- read_tsv("labeled_grouped_protein_summary_TMT_9_IRS_normalized.txt", guess_max = 5250)
# "Filter" column flags contams and decoys
# "Missing" column flags proteins without reporter ion intensities (full sets missing)
# the table from pandas is sorted so the rows we want come first
data_all <- filter(data_import, is.na(Filter), is.na(Missing))
data_sl <- data_all %>% select(., contains("SLNorm_")) %>%
select(., -contains("_Unused")) %>%
select(., -contains("_Pool"))
data_irs <- data_all %>% select(., contains("IRSNorm_")) %>%
select(., -contains("_Unused")) %>%
select(., -contains("_Pool"))
data_pool <- data_all %>% select(., contains("_Pool_"))
# save a few columns for the results table
all_results <- data_all %>% select(., ProtGroup, Counter, Accession, Description, starts_with("PSMs_Used"))
# save gene names for edgeR so we can double check that results line up
accessions <- data_all$Accession
# see how many rows of data we have
length(accessions)
We will lump all the metaplastic breast cancer (MBC) samples together to contrast to triple negative breast cancer and normal tissue samples. There will be 14 mbc, 6 triple negative, and 7 normal samples.
# all categories of metaplastic breast cancer tissue
mbc_sl <- select(data_sl, contains("_C"), contains("_SP"), contains("_SQ"))
mbc_irs <- select(data_irs, contains("_C"), contains("_SP"), contains("_SQ"))
# triple negative breast cancer tissue
tn_sl <- select(data_sl, contains("_TN"))
tn_irs <- select(data_irs, contains("_TN"))
# Normal tissue
n_sl <- select(data_sl, contains("_N"))
n_irs <- select(data_irs, contains("_N"))
# collect the biological replicate channels
bio_sl <- cbind(n_sl, tn_sl, mbc_sl)
bio_irs <- cbind(n_irs, tn_irs, mbc_irs)
# set a color vector for plots
color <- c(rep("red", 7), rep("blue", 6), rep("green", 14))
set_plot_dimensions(9, 9)
Run TMM normalization on the IRS data.
# put groups together into a single data frame
tmt_sl <- bio_sl
tmt_irs <- bio_irs
# define the positions of the groups
N <- 1:7
TN <- 8:13
MP <- 14:27
# set some colors by condition
group <- c(rep("N", 7), rep("TN", 6), rep("MBC", 14))
# get the biological sample data into a DGEList object
y <- DGEList(counts = tmt_irs, group = group, genes = accessions)
# run TMM normalization (also includes a library size factor)
y <- calcNormFactors(y)
tmt_tmm <- apply_tmm_factors(y, color = color)
# check the clustering
plotMDS(y, col = color, main = "all samples after TMM")
We will run the glm modeling in edgeR with the quasi-likelihood functions.
We will add the statistical testing results (logFC, p-values, and FDR), condition intensity averages, and candidate status to the results
data frame (which has the TMM-normalized data) and also accumulate all of the three comparisons into all_results
.
# set up the design matrix
group <- as.factor(c(rep("N", length(N)), rep("TN", length(TN)), rep("MP", length(MP))))
group <- factor(group, levels = c("N", "TN", "MP")) # set the factor order
design <- model.matrix(~ 0 + group)
colnames(design) <- c("N", "TN", "MP")
# estimate dispersion in y
y <- estimateDisp(y, design, robust = TRUE)
# fit statistical models (design matrix already in y$design)
fit <- glmQLFit(y, design, robust = TRUE)
plotQLDisp(fit)
# make the contrast
contrast <- makeContrasts(N-TN, levels = design)
contrast
# do the empirical Bayes moderation of the test statistic (with trended variance)
fit2 <- glmQLFTest(fit, contrast = contrast)
# check test results
topTags(fit2)$table
tt <- topTags(fit2, n = Inf, sort.by = "none")$table
# let's see how many up and down candidates, and the top tags
summary(decideTests(fit2, p.value = 0.05))
# get the results summary
results <- collect_results(tmt_tmm, tt, N, "N", TN, "TN")
# check the p-value distribution
ggplot(results, aes(PValue)) +
geom_histogram(bins = 100, fill = "white", color = "black") +
geom_hline(yintercept = mean(hist(results$PValue, breaks = 100,
plot = FALSE)$counts[26:100])) +
ggtitle("N versus TN p-value distribution")
We have a modest number of DE candidates. The p-value distribution suggests that the low p-value candidates are "soft" (not really small p-values) because the "spike" is more of a narrow distribution.
# make column names unique by adding comparison (for the accumulated frame)
results_temp <- results
colnames(results_temp) <- str_c(colnames(results), "_N_TN")
# accumulate the testing results
all_results <- cbind(all_results, results_temp)
We will still make the MA plots, scatter plots, and volcano plot using ggplot2. We will also look at the distribution of log2 expression ratios separated by differential expression (DE) category. The Benjamini-Hochberg corrected edgeR p-values are used to categorize the DE candidates: no > 0.10 > low > 0.05 > med > 0.01 > high.
# see how many candidates by category
results %>% count(candidate)
# plot log2 fold-changes by category
ggplot(results, aes(x = logFC, fill = candidate)) +
geom_histogram(binwidth=0.1, color = "black") +
facet_wrap(~candidate) +
coord_cartesian(xlim = c(-4, 4)) +
ggtitle("N vs TN logFC distributions by candidate")
We have many comparisons to visualize, so we will use functions to generate a series of plots. We will make: an MA plot with candidates highlighted by color, faceted MA plots separated by candidate status, a scatter plot with candidates highlighted by color, faceted scatter plots separated by candidate status, and a volcano plot with candidates highlighted by color. The solid black lines in the MA and scatter plots are the 1-to-1 lines; the dotted lines are 2-fold change lines.
# make MA plots
MA_plots(results, "ave_N", "ave_TN", "N vs TN")
We do have a distinction between the purple plot and the orange plot. Our highly sifnificant expressed proteins (orange) have a distinct expression pattern compared to the not differentially expressed (purple) proteins.
# make scatter plots
scatter_plots(results, "ave_N", "ave_TN", "N vs TN")
# make a volcano plot
volcano_plot(results, "ave_N", "ave_TN", "N vs TN")
We have a good "V" shape in the volvano plot. The magnitudes of the adjusted p-values are not very small.
We can look at the data for the top 20 (by edgeR BH-corrected p-values) up and down regulated proteins.
# look at the top 20 candidates in each direction (up in TN, then down in TN)
set_plot_dimensions(9, 3.5)
plot_top_tags(results, 7, 6, 20)
set_plot_dimensions(9, 9)
We will do the same testing to compare normal tissue to metaplastic breast cancer samples.
# make the contrast
contrast <- makeContrasts(N-MP, levels = design)
contrast
# do the empirical Bayes moderation of the test statistic (with trended variance)
fit2 <- glmQLFTest(fit, contrast = contrast)
# check test results
topTags(fit2)$table
tt <- topTags(fit2, n = Inf, sort.by = "none")$table
# let's see how many up and down candidates, and the top tags
summary(decideTests(fit2, p.value = 0.05))
# get the results summary
results <- collect_results(tmt_tmm, tt, N, "N", MP, "MP")
# check the p-value distribution
ggplot(results, aes(PValue)) +
geom_histogram(bins = 100, fill = "white", color = "black") +
geom_hline(yintercept = mean(hist(results$PValue, breaks = 100,
plot = FALSE)$counts[26:100])) +
ggtitle("N versus MP p-value distribution") # check the p-value distribution
We have 820 candidates in this comparison. The p-value distribution looks better.
We will add the statistical testing results (logFC, p-values, and FDR), condition intensity averages, and candidate status to the results
data frame (which has the TMM-normalized data) and also accumulate all of the three comparisons into all_results
.
# make column names unique by adding comparison
results_temp <- results
colnames(results_temp) <- str_c(colnames(results), "_N_MP")
# accumulate the testing results
all_results <- cbind(all_results, results_temp)
# see how many candidates by category
results %>% count(candidate)
# plot log2 fold-changes by category
ggplot(results, aes(x = logFC, fill = candidate)) +
geom_histogram(binwidth=0.1, color = "black") +
facet_wrap(~candidate) +
coord_cartesian(xlim = c(-4, 4)) +
ggtitle("N vs MP logFC distributions by candidate")
# make MA plots
MA_plots(results, "ave_N", "ave_MP", "N vs MP")
# make scatter plots
scatter_plots(results, "ave_N", "ave_MP", "N vs MP")
# make a volcano plot
volcano_plot(results, "ave_N", "ave_MP", "N vs MP")
We have similar more robust down-regulation in breast cancer compared to normal tissue. We may have more DE candidates compared to the first comparison because we have 14 MBC samples compared to 6 for triple negative. Generally, we should have more statistical power with larger replicate numbers (all things being equal).
# look at the top 20 candidates (up in MP, then down in MP)
set_plot_dimensions(9, 3.5)
plot_top_tags(results, 7, 14, 20)
set_plot_dimensions(9, 9)
There are some sample to sample variations in line with what one expects for human subjects. However, we do have clear differences in expression levels.
We can compare the non-metaplastic breast cancer samples to the metaplastic samples. These samples were not very distinct from each other in the cluster plots.
# make the contrast
contrast <- makeContrasts(TN-MP, levels = design)
contrast
# do the empirical Bayes moderation of the test statistic (with trended variance)
fit2 <- glmQLFTest(fit, contrast = contrast)
# check test results
topTags(fit2)$table
tt <- topTags(fit2, n = Inf, sort.by = "none")$table
# let's see how many up and down candidates, and the top tags
summary(decideTests(fit2, p.value = 0.05))
# get the results summary
results <- collect_results(tmt_tmm, tt, TN, "TN", MP, "MP")
# check the p-value distribution
ggplot(results, aes(PValue)) +
geom_histogram(bins = 100, fill = "white", color = "black") +
geom_hline(yintercept = mean(hist(results$PValue, breaks = 100,
plot = FALSE)$counts[26:100])) +
ggtitle("TN versus MP p-value distribution") # check the p-value distribution
# make column names unique by adding comparison
results_temp <- results
colnames(results_temp) <- str_c(colnames(results), "_TN_MP")
# accumulate the testing results
all_results <- cbind(all_results, results_temp)
# make MA plots
MA_plots(results, "ave_TN", "ave_MP", "TN vs MP")
# make scatter plots
scatter_plots(results, "ave_TN", "ave_MP", "TN vs MP")
# make a volcano plot
volcano_plot(results, "ave_TN", "ave_MP", "TN vs MP")
# look at the top 10 candidates (up in MP, then down in MP)
set_plot_dimensions(9, 3.5)
plot_top_tags(results, 6, 14, 10)
set_plot_dimensions(9, 9)
We had differences between normal tissue samples and either set of cancer samples. The glm modeling in edgeR seems a little more conservative than the exact test.
all_results
frame to TSV file¶# write the results to disk
write.table(all_results, "results_edgeR-glm.txt", sep = "\t",
row.names = FALSE, na = " ")
# log the session details
sessionInfo()