If we label the same sample in more than one channel in an isobaric labeling experiment (one labeling kit of samples will be called a "plex" to avoid confusion with "run" or "experiment") and perform some very basic data normalizations, we will see that the measurement values for those channels at any level (PSM, peptide, or protein) are very nearly identical. We can aggregate (sum) the data (intensities or peak heights) from the individual PSMs into peptide totals or into protein totals (based on protein inference rules). It does not matter if the single plex was analyzed using a single LC run or multiple LC runs (fractions). Data quality improves as a function of aggregation, with protein-level data having lower variation than peptide or PSM data (see this notebook). Different channels of the same sample (technical replicates) can be used to verify that data aggregation steps do not distort the inherent relative precision of isobaric measurements between channels in a single plex.
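For concreteness, the aggregation is just a grouped sum. Here is a minimal sketch using hypothetical names: a psm_table with one row per PSM, a protein Accession column, and reporter ion columns named TMT_126, TMT_127, etc.
# minimal aggregation sketch (hypothetical table and column names)
library(dplyr)
protein_totals <- psm_table %>%
    group_by(Accession) %>% # one group per inferred protein
    summarize(across(starts_with("TMT_"), sum)) # sum each reporter channel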
What happens when we have the same samples in different plexes? Even with 11 channels currently available, more channels may be needed to accommodate all of the biological samples, and more than one isobaric plex will need to be used. If we do basic single factor normalizations (matching median intensities or total intensities), we will find that we do not get similar intensity measurements of the same thing between plexes. The overall intensity of reporter ions depends not only on the abundance of the peptide, but on when the peptide was "sampled". The fragmentation scans that provide the peptide sequence and the reporter ion signals (this can be a single scan or a pair of linked scans) are a few 10s to 100s of milliseconds in duration. The extracted ion chromatograms (XIC) of peptides are most often 10s of seconds (30 seconds to 1 minute are typical). If the fragmentation scan occurs at the apex of the XIC, the reporter ions will be more intense than those from fragmentation scans taken at other points along the XIC (such as near the baseline).
Isobaric labeling (Ref-1, Ref-2) is a precise relative quantification method because reporter ion intensities are simultaneously measured in a single instrument scan. (Isobaric tag and TMT will be used interchangeably.) Shotgun proteomics works by chopping proteins into large numbers of peptides that can be separated by liquid chromatography and sequenced by mass spectrometers. The overwhelmingly large number of measurements from all of the small pieces is far less useful than aggregating the data from the pieces (the peptides) back into the whole (the proteins).
The relative precision of the reporter ions is maintained when the data from the pieces are summed together. The intensities of the pieces will vary depending on many factors, such as analyte abundance and when the analyte is sampled during its elution profile. Isobaric labeling reporter ions come from MS2 "snapshots" of the eluting peptide. MS2 scan selection in shotgun proteomics is kind of a random process (Ref-3) given the sample complexity. MS2 scans are not like integrated MS1 peak heights or areas, which tend to be more stable values. MS2 scans taken from the leading or trailing edge near the baseline will be less intense than MS2 scans taken near the chromatography peak apex. The intensities of ions in MS2 scans (including the reporter ions) are highly variable.
Within a single TMT plex (one set of isobaric labeling tags, a.k.a. one kit), the random nature of MS2 scan selection can be pretty safely ignored. Pretty much any way that data can be analyzed ends up working okay because of the relative precision of the reporter ions within each scan. The situation changes dramatically when data from more than one TMT plex is combined in larger experiments. Now the intensity differences that depend on when the MS2 scans were selected in each TMT experiment will dramatically alter the scales of the intensity values, and the data cannot be combined until these differences are removed.
Ironically, in much the same way that the similarities between reporter ions within the same scan are retained during data aggregations for a single TMT plex, the differences between reporter ions from different MS2 scans persist after data aggregations and conventional normalization methods. A specific algorithm to determine and correct the MS2 sampling effect has to be used.
Adding a common reference channel to each plex has been done in isobaric labeling for many years. The reference channel has mostly been used as a common denominator in ratios with each biological sample. This has the desired effect of normalizing the ratios to a common scale. The downside is that ratios are not very intuitive, have compressed dynamic range, have half of the values between 0 and 1.0 and the other half between 1.0 and infinity, and are not tolerant of zero values (missing data). Because of that asymmetry, logarithms are often necessary, and those further compress the dynamic range of the measurements and are even less intuitive than ratios.
Another drawback with ratios is that they are like the edges in a network. Their number grows much more rapidly than the number of network nodes. In an isobaric labeling study with a single common channel, there is no ambiguity in taking ratios of each channel to the reference. There is only one way to do that. If you have half of the channels as controls and half as treatments, then there is ambiguity about what ratios to form to compare control to treatment. If you have 2 channels (one of each condition), there is one ratio. If you have 2 of each condition, there are 4 different ratios. If you have 3 of each condition, there are 9 ratios. The number of ratios is the product of the number of samples in each condition. 10-plex TMT could accommodate 5 of each condition and result in 25 ways to take ratios. Statistically (mathematically) this is not good. It is an over-determined problem where there are more ratios than degrees of freedom.
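A quick sanity check on the counting argument (plain R, independent of anything in this analysis): with n control channels and m treatment channels, there are n * m possible control-to-treatment ratios.
# the number of possible control-to-treatment ratios is a product
n_ratios <- function(n_control, n_treatment) n_control * n_treatment
sapply(1:5, function(k) n_ratios(k, k)) # 1 4 9 16 25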
Is there another way to make use of reference channels to put reporter ion intensities on a common scale that does not involve ratios? In protein expression studies using TMT, a large number of PSM data points will aggregate into a much more manageable and shorter list of proteins. The reference channels, being the same in each TMT plex, are like yardsticks (or meter sticks) that measure an observed value for each protein in each plex. The main assumption of IRS is that those numbers should have been the same since the reference channel was the same. The measured values are not identical in each plex (mostly) because of random MS2 scan selection. We can mathematically make them all the same between the plexes using scaling factors for each protein, as shown in the diagram below.
Since the non-reference channel(s) in each single plex have high relative precision that is independent of the magnitude of the intensity values, we can scale all of these other channels by the same scale factors that we are using on the reference channels. This adjusts all channels in each TMT plex onto a common intensity scale. This method is called internal reference scaling (IRS) and was first described in (Ref-4). After IRS, any channel can be directly compared to any other channel (within TMT plex and between TMT plexes become equivalent). Importantly, the data used for the scaling factors are independent of the data used in statistical testing.
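The arithmetic of the idea in miniature, with made-up numbers for one protein in two plexes (the real per-protein computation is done by hand later in this notebook):
# one protein's reference-channel value as measured in two plexes
ref_plex1 <- 120000
ref_plex2 <- 40000
# the common target is the geometric mean of the reference measurements
target <- exp(mean(log(c(ref_plex1, ref_plex2))))
fac1 <- target / ref_plex1 # multiply every plex 1 channel by this factor
fac2 <- target / ref_plex2 # multiply every plex 2 channel by this factor
c(ref_plex1 * fac1, ref_plex2 * fac2) # the reference values are now identical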
It is time for a conceptual description of IRS using jazz hands. Think of each plex's channels as the fingers on a hand. If we have an experiment with two plexes, that is like our two hands. Because of pseudo random MS2 sampling we have different overall intensity scales. This is like our two hands being at different heights. The IRS method measures the heights of the hands and determines how much to move one hand down and the other hand up to bring them into alignment. The fingers (hopefully) all move with the hands, so when the hands are in alignment, then all of the fingers are also in alignment. That is all that there is to IRS. Really.
The IRS experimental design outlined in (Ref-4) was to use duplicate channels of the same pooled internal standard in each TMT plex. With only 10 or 11 channels per plex available, this might seem excessive. However, there are several quality control and accuracy arguments to support two channels instead of just one. The average of two measurements will be much better than just one, and everything hangs on the accuracy of the scaling factors. It is possible (but hopefully rare) to accidentally mislabel samples, and it may not be clear which channels were the standards. The strong similarity of the two duplicate standard channels can be used to find them in the presence of the biological samples, which are not usually as similar to each other (see this notebook).
Validation of IRS was done in the Supplemental materials of Ref-4 by using half of the pooled standards for IRS and using the other half as a validation set. A recently completed experiment used seven 11-plex TMT labelings to study exosomes from 3 groups with 20 biological samples each. Each 11-plex TMT experiment used duplicated pooled standard channels. That allowed 9 channels for biological samples per plex. Seven plexes would have 63 available channels. After the 60 biological samples, that left 3 additional channels. Extra pooled standard samples were labeled and analyzed in the three extra channels. Those 3 extra channels were distributed in three of the 7 TMT experiments.
This allows a rigorous independent validation of the IRS method. There are three TMT experiments with triplicate pooled standards run in the same TMT experiment. We also have the three extra pooled standards in different TMT plexes that can be compared before and after a proper IRS normalization based on pairs of pooled standards. Remember that the master pooled standard protein mixture is created from equal amounts of all 60 samples and each pooled standard channel is an independent digestion and labeling of an aliquot of the master pooled standard protein mixture.
The data are from human urine exosomes from the Christina Binder lab at OHSU. Exosome isolation was performed at Ymir Genomics (Boston, MA) by Shannon Pendergrast. eFASP digestion (Ref-5), 11-plex TMT labeling (Thermo Fisher Scientific), liquid chromatography, and mass spectrometry analysis was performed at OHSU by Ashok Reddy. The IRS experimental study design was used to accommodate 20 samples per condition in 7 TMT plexes. 30-fraction online high pH RP/low pH RP LC separations were done. The SPS MS3 method (Ref-6) was used on a Thermo Fusion with the manufacturer's default method.
Data analysis was performed by Phil Wilmarth, OHSU. RAW files were converted to text files using MSConvert (Ref-7). Python scripts created MS2 spectra for database searching and tables of the reporter ion intensities (peak heights) for each MS3 scan. Comet database searches (Ref-8) were performed using a canonical protein database (downloaded with software available here), tryptic cleavage, a wider parent ion mass tolerance search, static TMT label modifications, and variable oxidation of methionine. An extended version of the PAW pipeline (Ref-9) was used for filtering of PSM identifications by accurate masses and by the target/decoy strategy (Ref-10) to obtain 1% PSM FDR. Parsimony (Ref-11) and extended parsimony analyses were used to produce a final list of identified proteins and to determine which peptides were unique to the final protein groups. Total protein intensities were computed as the sums of all unique peptide reporter ion signals (Ref-12). IRS normalization of the TMT data across the 7 TMT experiments was done using Python scripts. The analysis presented here was performed with a Jupyter notebook and an R kernel.
Ref-1. Thompson, A., Schäfer, J., Kuhn, K., Kienle, S., Schwarz, J., Schmidt, G., Neumann, T. and Hamon, C., 2003. Tandem mass tags: a novel quantification strategy for comparative analysis of complex protein mixtures by MS/MS. Analytical chemistry, 75(8), pp.1895-1904.
Ref-2. Ross, P.L., Huang, Y.N., Marchese, J.N., Williamson, B., Parker, K., Hattan, S., Khainovski, N., Pillai, S., Dey, S., Daniels, S. and Purkayastha, S., 2004. Multiplexed protein quantitation in Saccharomyces cerevisiae using amine-reactive isobaric tagging reagents. Molecular & cellular proteomics, 3(12), pp.1154-1169.
Ref-3. Liu, H., Sadygov, R.G. and Yates, J.R., 2004. A model for random sampling and estimation of relative protein abundance in shotgun proteomics. Analytical chemistry, 76(14), pp.4193-4201.
Ref-4. Plubell, D.L., Wilmarth, P.A., Zhao, Y., Fenton, A.M., Minnier, J., Reddy, A.P., Klimek, J., Yang, X., David, L.L. and Pamir, N., 2017. Extended multiplexing of TMT labeling reveals age and high fat diet specific proteome changes in mouse epididymal adipose tissue. Molecular & Cellular Proteomics, 16(5), pp.873-890.
Ref-5. Erde, J., Loo, R.R.O. and Loo, J.A., 2014. Enhanced FASP (eFASP) to increase proteome coverage and sample recovery for quantitative proteomic experiments. Journal of proteome research, 13(4), pp.1885-1895.
Ref-6. McAlister, G.C., Nusinow, D.P., Jedrychowski, M.P., Wühr, M., Huttlin, E.L., Erickson, B.K., Rad, R., Haas, W. and Gygi, S.P., 2014. MultiNotch MS3 enables accurate, sensitive, and multiplexed detection of differential expression across cancer cell line proteomes. Analytical chemistry, 86(14), pp.7150-7158.
Ref-7. Kessner, D., Chambers, M., Burke, R., Agus, D. and Mallick, P., 2008. ProteoWizard: open source software for rapid proteomics tools development. Bioinformatics, 24(21), pp.2534-2536.
Ref-8. Eng, J.K., Jahan, T.A. and Hoopmann, M.R., 2013. Comet: an open-source MS/MS sequence database search tool. Proteomics, 13(1), pp.22-24.
Ref-9. Wilmarth, P.A., Riviere, M.A. and David, L.L., 2009. Techniques for accurate protein identification in shotgun proteomic studies of human, mouse, bovine, and chicken lenses. Journal of ocular biology, diseases, and informatics, 2(4), pp.223-234.
Ref-10. Elias, J.E. and Gygi, S.P., 2007. Target-decoy search strategy for increased confidence in large-scale protein identifications by mass spectrometry. Nature methods, 4(3), p.207.
Ref-11. Nesvizhskii, A.I. and Aebersold, R., 2005. Interpretation of shotgun proteomic data: the protein inference problem. Molecular & cellular proteomics, 4(10), pp.1419-1440.
Ref-12. Wenger, C.D., Phanstiel, D.H., Lee, M.V., Bailey, D.J. and Coon, J.J., 2011. COMPASS: A suite of pre- and post-search proteomics software tools for OMSSA. Proteomics, 11(6), pp.1064-1074.
# library imports
library(tidyverse) # ggplot2 and much more
library(psych) # scatter plot panels
library(edgeR) # TMM normalization
library(robustbase) # column medians
library(preprocessCore) # from Bioconductor - quantile normalization
Warning message:
“replacing previous import ‘lifecycle::last_warnings’ by ‘rlang::last_warnings’ when loading ‘tibble’”
── Attaching packages ──────────────────────────────── tidyverse 1.3.0 ──
✔ ggplot2 3.3.2     ✔ purrr   0.3.4
✔ tibble  3.0.3     ✔ dplyr   1.0.2
✔ tidyr   1.1.1     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.5.0
── Conflicts ─────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()

Attaching package: ‘psych’

The following objects are masked from ‘package:ggplot2’:
    %+%, alpha

Loading required package: limma
The PAW pipeline has a Python script that does IRS normalizations between plexes from the main protein and peptide results summary files. It takes care of normalizing within each TMT plex, globally between plexes, and then uses the designated reference channels to calculate the IRS factors. We will read in that tab-delimited file.
It has the raw reporter ion data, the globally scaled reporter ion data (called sample loading normalized data), and the IRS normalized data. Columns are labeled by plexes, by normalization, and by sample keys. We have already done some QC and double-checked that the pooled internal reference channels were the 131-N and 131-C tags in each plex. The IRS script looks for sample names with "pool" in them to find the reference channels. In this 77-channel experiment, we have a third standard sample in the 130-C channel in experiments 2, 4, and 6. The IRS method uses the 131-N and 131-C standards. This leaves the 130-C standards as independent channels to validate the IRS method.
# load the PAW data file (from IRS script) and check the data frame
# read_tsv is more picky about column types - increase the "guess" range
data_import <- read_tsv("pooled_grouped_protein_summary_TMT_IRS_normalized.txt",
guess_max = 3675)
# get the non-contaminant proteins seen in all 7 plexes (2152 proteins)
# the prepped table from pandas is sorted so these are the upper rows
data_intersect <- filter(data_import, is.na(Filter), is.na(Missing))
nrow(data_import)
nrow(data_intersect)
Parsed with column specification:
cols(
  .default = col_double(),
  Accession = col_character(),
  Identical = col_character(),
  Similar = col_character(),
  OtherLoci = col_character(),
  Filter = col_character(),
  Missing = col_character(),
  Coverage = col_character(),
  SeqLength = col_character(),
  MW = col_character(),
  Description = col_character()
)
See spec(...) for full column specifications.
These data frame reading routines (read_tsv, etc.) always do a lot of automated data parsing to "help" you out. Sometimes it is great, sometimes it is not. It is important to make sure you are getting what you expect. The IRS script told us that there were 2152 proteins seen in all 7 of the plexes. We should have that number of rows after filtering, and we do.
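If you want to see exactly what readr guessed, two quick checks (both functions come with the tidyverse packages loaded above):
# double check the parsing and the filtered table
spec(data_import) # the full column specification that readr used
glimpse(data_intersect) # compact look at the filtered data frame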
To keep things simpler, we will focus on the three plexes where we have three reference channels. Within each of those plexes, we have 3 references to see what the channel-to-channel similarity is like inside a single plex. We can contrast that with the 3 extra reference channels that are in three different plexes.
The first thing is to get the relevant data. We will get the un-normalized data (a.k.a. "raw"). We will make a data frame for each of the three experiments (2, 4, and 6), and each frame will have 3 columns (the extra channel and the two IRS reference channels). We will combine those after we have extracted the reference channels from each plex.
# we will be most interested in the 3 plexes with 3 reference channels
# those are experiments 2, 4, and 6
exp2_raw <- data_intersect %>%
select(matches("TotInt_.*_Exp2")) %>%
select(contains("_130C_"), contains("_131"))
exp4_raw <- data_intersect %>%
select(matches("TotInt_.*_Exp4")) %>%
select(contains("_130C_"), contains("_131"))
exp6_raw <- data_intersect %>%
select(matches("TotInt_.*_Exp6")) %>%
select(contains("_130C_"), contains("_131"))
# each data frame has 3 columns (channels 9, 10, and 11 of each 11-plex)
How do we check if our data needs normalization and evaluate if the normalization has worked? Many normalization methods adjust some aspect of the data distributions to make them more similar. Therefore, computing distribution summary numbers and visualizing distributions will be needed. We will be doing this several times, so we will make a function.
Function steps: print each column's total, median, and interquartile range; show a boxplot of the log10 intensity distributions; compute per-protein CVs for each plex; and print the median CVs and show CV distribution boxplots.
# make a data summary function for some data printouts and data visualizations
# helper CV functions
CV <- function(df) {
# Computes and returns a vector of CV values
# df - data frame: CV computed per row
ave <- rowMeans(df) # compute averages
sd <- apply(df, 1, sd) # compute standard deviations
cv <- 100 * sd / ave # compute CVs in percent (last thing gets returned)
}
check_data <- function(df, title) {
# Checks data normalizations with distribution summaries and plots
# df - data frame (expects 9 or 3 columns for this notebook)
# title - some text to use in print statements and plot labels
# get some summary numbers: column totals and medians
cat(sprintf("\nColumn Summaries (%s):\n", title))
for (i in 1:ncol(df)) {
cat(sprintf(" %s - tot: %s med: %s iqr: %s\n", colnames(df)[i],
format(round(sum(pull(df[i])), digits = 0), big.mark = ','),
format(round(median(pull(df[i])), digits = 0), big.mark = ','),
format(round(IQR(pull(df[i])), digits = 0), big.mark = ',')))
}
# see what the data distribution boxplots look like
boxplot(log10(df),
col = c(rep(c("red", "blue", "green"), each = 3)),
notch = TRUE, main = sprintf("Intensity distributions (%s)", title))
# CV distributions
if (ncol(df) == 9) {
CVs <- data.frame(exp2_CV = CV(df[1:3]), exp4_CV = CV(df[4:6]), exp6_CV = CV(df[7:9]))
ymax = 50
} else {
CVs <- data.frame(Extras = CV(df[1:3]))
ymax = 100
}
# print values
cat(sprintf("\nMedian CVs (%s):\n", title))
for (i in 1:ncol(CVs)) {
cat(sprintf(" %s - med: %s iqr: %s\n", colnames(CVs)[i],
round(median(pull(CVs[i])), digits = 2),
round(IQR(pull(CVs[i])), digits = 2)))
}
# show boxplots
boxplot(CVs, ylim = c(0, ymax), notch = TRUE, main = sprintf("CV distributions (%s)", title))
}
# put the 3 experiments in one data frame and check the data
all_raw <- data.frame(exp2_raw, exp4_raw, exp6_raw)
check_data(all_raw, "raw data")
Column Summaries (raw data):
  TotInt_130C_Exp2 - tot: 1,960,333,996 med: 110,104 iqr: 371,355
  TotInt_131N_Exp2 - tot: 1,839,411,554 med: 105,074 iqr: 351,433
  TotInt_131C_Exp2 - tot: 1,826,650,885 med: 104,814 iqr: 354,742
  TotInt_130C_Exp4 - tot: 1,330,669,102 med: 81,571 iqr: 248,739
  TotInt_131N_Exp4 - tot: 1,237,179,672 med: 77,382 iqr: 228,845
  TotInt_131C_Exp4 - tot: 1,111,150,604 med: 68,496 iqr: 204,376
  TotInt_130C_Exp6 - tot: 1,180,655,739 med: 60,404 iqr: 216,738
  TotInt_131N_Exp6 - tot: 1,219,129,046 med: 64,132 iqr: 227,358
  TotInt_131C_Exp6 - tot: 1,319,485,897 med: 70,959 iqr: 247,425

Median CVs (raw data):
  exp2_CV - med: 5.53 iqr: 4.63
  exp4_CV - med: 10.13 iqr: 5.21
  exp6_CV - med: 8.56 iqr: 5.26
The total intensities and median intensities from experiment 2 are higher than the other two. There are also some differences between channels, particularly for experiments 4 and 6. The centers of the boxes (the median intensities) are not horizontally aligned. There are also some differences in the sizes of the boxes (the interquartile ranges). The median CV values range from 5% to over 10%. We need to do some data normalization.
Many things affect all proteins in a sample by the same factor, such as incorrect protein assays and pipetting errors. Other things like trypsin digestion and isobaric tag labeling may not be quite so unbiased, but we usually assume that they are mostly global effects. LC and mass spec performance also varies run-to-run and probably has some physical peptide property biases. We do not have a good understanding of many of these effects, so we assume that they are also correctable by global factors. We lump all of these effects, from the protein assay to the reporter ion intensity measurement, into single correction factors.
These single, global scaling factors can be computed in a few ways for TMT experiments. The labeling protocol starts with equal total amounts of protein for each channel. The reporter ion intensities are our proxies for protein abundance, so the total intensity of each labeled sample (a column in each plex) should also be equal. That is what we call sample loading normalization. The IRS script does that for us, but we will do it by hand in the cell below, so the steps are demonstrated.
We have already filtered the data we read in to exclude any contaminant proteins (any proteins that we do not consider to be part of our samples). We only want the sample protein totals to be the same across the channels. There are choices for how to make the sample total signals (column totals) the same across the channels. Do we do that individually for each plex or do we do it across all 77 channels?
What do we want the new column totals to be? One way is to make each column sum to 1.0 by dividing each value in the channel by the column total for that channel. I really do not like ratios or fractions as measurement scales. They are not intuitive and may be mathematically different from the original measurement scales. We could make the totals match the maximum column total value, or the smallest column total. We could make them match something in the middle like the average column total. Instead of column totals, we could alternatively pick something like the median intensity in each column and make those the same.
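A sketch of some of those target choices, assuming df is a data frame with one column of intensities per channel (the average column total is what we will actually use below):
# some possible scaling targets (df is a hypothetical channels data frame)
target_max <- max(colSums(df)) # match the largest column total
target_min <- min(colSums(df)) # match the smallest column total
target_ave <- mean(colSums(df)) # match the average column total (used below)
target_med <- mean(colMedians(as.matrix(df))) # center of the column medians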
There are also methods from Bioconductor developed for microarray normalization and for next generation sequencing normalization that we can use. We will use the trimmed mean of M-values (TMM) method developed for next generation sequencing analyses, and quantile normalization developed for microarray analyses.
Sample loading (SL) normalization is a grand total scaling method. We will scale the column totals to something in the middle. That keeps the new values in the same ballpark as the original values. I like taking the arithmetic average of the column totals and doing that experiment-wide (for all 77 channels). The PAW IRS script already does the normalizations, but we want to see all of the steps. We can do the normalizations just for the 9 channels (3 pooled standards in 3 of the plexes) to keep things a little simpler.
# function to do grand total (sample loading) normalization
SL_norm <- function(df, print_factors = TRUE) {
# Normalizes each channel's sum to the average grand total
# df: data frame of TMT data (one column for each channel)
# print_factors: logical to control printing
# compute norm factors and scale columns
norm_facs <- mean(colSums(df)) / colSums(df)
df_sl <- sweep(df, 2, norm_facs, FUN = "*")
# print the normalization factors for QC check
if (print_factors == TRUE) {
cat("\nSample Loading Normalization Factors:\n ")
cat(sprintf("%s - %0.3f\n", colnames(df), norm_facs))
}
df_sl # return normalized data
}
# SL norm the entire data subset
all_sl <- SL_norm(data.frame(exp2_raw, exp4_raw, exp6_raw))
Sample Loading Normalization Factors:
  TotInt_130C_Exp2 - 0.738
  TotInt_131N_Exp2 - 0.787
  TotInt_131C_Exp2 - 0.792
  TotInt_130C_Exp4 - 1.088
  TotInt_131N_Exp4 - 1.170
  TotInt_131C_Exp4 - 1.302
  TotInt_130C_Exp6 - 1.226
  TotInt_131N_Exp6 - 1.187
  TotInt_131C_Exp6 - 1.097
The column sums should now be the same. Matching the sums of the intensity distributions may not make the centers of the intensity distributions the same (they would only match if the distributions all have the same shape).
# check the SL normalized data
check_data(all_sl, "SL norm data")
Column Summaries (SL norm data):
  TotInt_130C_Exp2 - tot: 1,447,185,166 med: 81,282 iqr: 274,147
  TotInt_131N_Exp2 - tot: 1,447,185,166 med: 82,668 iqr: 276,495
  TotInt_131C_Exp2 - tot: 1,447,185,166 med: 83,040 iqr: 281,048
  TotInt_130C_Exp4 - tot: 1,447,185,166 med: 88,713 iqr: 270,519
  TotInt_131N_Exp4 - tot: 1,447,185,166 med: 90,517 iqr: 267,690
  TotInt_131C_Exp4 - tot: 1,447,185,166 med: 89,210 iqr: 266,183
  TotInt_130C_Exp6 - tot: 1,447,185,166 med: 74,040 iqr: 265,665
  TotInt_131N_Exp6 - tot: 1,447,185,166 med: 76,129 iqr: 269,889
  TotInt_131C_Exp6 - tot: 1,447,185,166 med: 77,826 iqr: 271,371

Median CVs (SL norm data):
  exp2_CV - med: 4.11 iqr: 4.77
  exp4_CV - med: 4.45 iqr: 5.26
  exp6_CV - med: 4.85 iqr: 6.04
The column totals are all the same; however, the column median intensities do differ a little by plex. The intensity distribution boxplots are in better horizontal alignment. The median CVs are now all between 4 and 5%.
# function to do median intensity normalization
med_norm <- function(df, print_factors = TRUE) {
# Normalizes each channel's median to the average of the column medians
# df: data frame of TMT data (one column for each channel)
# print_factors: logical to control printing
# compute norm factors and scale columns
norm_facs <- mean(colMedians(as.matrix(df))) / colMedians(as.matrix(df))
df_med <- sweep(df, 2, norm_facs, FUN = "*")
# print the normalization factors for QC check
if (print_factors == TRUE) {
cat("\nMedian Normalization Factors:\n ")
cat(sprintf("%s - %0.3f\n", colnames(df), norm_facs))
}
df_med # return normalized data
}
# normalize columns to same median intensity
all_med <- med_norm(all_raw)
Median Normalization Factors:
  TotInt_130C_Exp2 - 0.750
  TotInt_131N_Exp2 - 0.786
  TotInt_131C_Exp2 - 0.788
  TotInt_130C_Exp4 - 1.012
  TotInt_131N_Exp4 - 1.067
  TotInt_131C_Exp4 - 1.205
  TotInt_130C_Exp6 - 1.367
  TotInt_131N_Exp6 - 1.287
  TotInt_131C_Exp6 - 1.163
# check the median intensity scaled data
check_data(all_med, "Median norm data")
Column Summaries (Median norm data):
  TotInt_130C_Exp2 - tot: 1,469,727,472 med: 82,548 iqr: 278,417
  TotInt_131N_Exp2 - tot: 1,445,083,411 med: 82,548 iqr: 276,093
  TotInt_131C_Exp2 - tot: 1,438,611,995 med: 82,548 iqr: 279,383
  TotInt_130C_Exp4 - tot: 1,346,617,925 med: 82,548 iqr: 251,721
  TotInt_131N_Exp4 - tot: 1,319,778,345 med: 82,548 iqr: 244,123
  TotInt_131C_Exp4 - tot: 1,339,113,775 med: 82,548 iqr: 246,305
  TotInt_130C_Exp6 - tot: 1,613,490,278 med: 82,548 iqr: 296,195
  TotInt_131N_Exp6 - tot: 1,569,218,554 med: 82,548 iqr: 292,647
  TotInt_131C_Exp6 - tot: 1,534,989,099 med: 82,548 iqr: 287,836

Median CVs (Median norm data):
  exp2_CV - med: 4.29 iqr: 4.81
  exp4_CV - med: 4.53 iqr: 5.21
  exp6_CV - med: 4.6 iqr: 5.95
We have identical median intensities in each column, but the column totals now vary a little. The centers of the intensity distribution boxplots are in horizontal alignment. Median CV values are in the 4 to 5% range. Things look pretty equivalent to the SL normalization.
We explored TMM and edgeR normalizations in some detail in the dilution series notebook. This normalization should focus a little more on making the centers of the data distributions similar. Getting the TMM normalized data back into a data frame to work with is a little tricky. This is explained in detail in the dilution series notebook.
# we will load the data into a DGEList object in edgeR then run TMM
all_raw <- data.frame(exp2_raw, exp4_raw, exp6_raw)
y <- DGEList(counts = all_raw, group = rep(1, 9))
y <- calcNormFactors(y)
# check the library sizes and normalization factors
y$samples
                   group    lib.size  norm.factors
TotInt_130C_Exp2       1  1960333996     0.9891373
TotInt_131N_Exp2       1  1839411554     0.9976912
TotInt_131C_Exp2       1  1826650885     1.0007682
TotInt_130C_Exp4       1  1330669102     1.0567614
TotInt_131N_Exp4       1  1237179672     1.0568217
TotInt_131C_Exp4       1  1111150604     1.0543564
TotInt_130C_Exp6       1  1180655739     0.9345814
TotInt_131N_Exp6       1  1219129046     0.9584758
TotInt_131C_Exp6       1  1319485897     0.9599518
# use the TMM and lib.size factors to get a normalized data table
all_tmm <- sweep(all_raw, 2, y$samples$lib.size * y$samples$norm.factors, FUN = "/")
# the data above will sum up to near 1.0; we want the natural intensity scale
all_tmm <- all_tmm * mean(y$samples$lib.size)
# see if column totals are the same, etc.
check_data(all_tmm, "TMM norm data")
Column Summaries (TMM norm data):
  TotInt_130C_Exp2 - tot: 1,463,078,168 med: 82,175 iqr: 277,158
  TotInt_131N_Exp2 - tot: 1,450,534,221 med: 82,860 iqr: 277,135
  TotInt_131C_Exp2 - tot: 1,446,074,328 med: 82,976 iqr: 280,833
  TotInt_130C_Exp4 - tot: 1,369,453,052 med: 83,948 iqr: 255,989
  TotInt_131N_Exp4 - tot: 1,369,374,971 med: 85,650 iqr: 253,297
  TotInt_131C_Exp4 - tot: 1,372,576,879 med: 84,611 iqr: 252,460
  TotInt_130C_Exp6 - tot: 1,548,484,879 med: 79,223 iqr: 284,261
  TotInt_131N_Exp6 - tot: 1,509,881,782 med: 79,427 iqr: 281,581
  TotInt_131C_Exp6 - tot: 1,507,560,217 med: 81,073 iqr: 282,692

Median CVs (TMM norm data):
  exp2_CV - med: 4.17 iqr: 4.82
  exp4_CV - med: 4.48 iqr: 5.31
  exp6_CV - med: 4.46 iqr: 5.95
TMM is really designed to remove compositional bias, so library sizes typically end up different when all is said and done. TMM normalization kind of splits the difference between the other two normalization methods. We do not have identical totals or medians, but they are very similar between channels. CV medians are still in the 4 to 5% range.
Quantile normalization was developed for microarray data and is considered a moderately aggressive method. Quantile normalization does not just make different distributions similar - it makes the distributions exactly the same. It is an interesting idea and you can read more about it here. The dataset should have a lot of rows because the method makes the data more granular: the number of distinct data values after quantile normalization is reduced (divided) by the number of columns (samples) in the dataset.
The assumption that all of the samples in your experiment should have exactly the same distribution of measured values is pretty strong. I think that this is not true enough often enough to recommend this method for proteomics experiments. That said, if I do not show everyone's favorite normalization technique, someone will be unhappy. Using 4 different normalization methods should be sufficient, so we will stop after quantile normalization and get back to the topic of the IRS method.
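A toy demonstration (separate from the analysis) of just how aggressive quantile normalization is - after normalization, the sorted values in every column are exactly the same:
# tiny matrix with three very different columns
toy <- matrix(c(2, 4, 6, 8,
                1, 3, 5, 7,
                10, 20, 30, 40), ncol = 3)
toy_qn <- normalize.quantiles(toy)
apply(toy_qn, 2, sort) # all three columns are now identical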
# this function comes from Bioconductor
all_quantile <- as.data.frame(normalize.quantiles(as.matrix(all_raw)))
colnames(all_quantile) <- colnames(all_raw) # add back the column names
# get the quantile norm data summaries
check_data(all_quantile, "Quantile norm data")
Column Summaries (Quantile norm data):
  TotInt_130C_Exp2 - tot: 1,447,184,649 med: 82,548 iqr: 272,334
  TotInt_131N_Exp2 - tot: 1,447,185,066 med: 82,548 iqr: 272,334
  TotInt_131C_Exp2 - tot: 1,447,185,066 med: 82,548 iqr: 272,334
  TotInt_130C_Exp4 - tot: 1,447,185,166 med: 82,548 iqr: 272,334
  TotInt_131N_Exp4 - tot: 1,447,185,066 med: 82,548 iqr: 272,334
  TotInt_131C_Exp4 - tot: 1,447,183,286 med: 82,548 iqr: 272,334
  TotInt_130C_Exp6 - tot: 1,447,183,662 med: 82,548 iqr: 272,334
  TotInt_131N_Exp6 - tot: 1,447,186,787 med: 82,548 iqr: 272,334
  TotInt_131C_Exp6 - tot: 1,447,184,649 med: 82,548 iqr: 272,334

Median CVs (Quantile norm data):
  exp2_CV - med: 4.21 iqr: 4.76
  exp4_CV - med: 4.71 iqr: 5.76
  exp6_CV - med: 4.35 iqr: 5.69
The column totals are not exactly identical because of how rank ties are handled. The medians are identical, and the boxplots are pretty much identical, too. The median CVs stay in the 4 to 5% range.
We will pick experiment 4 and make some channel-to-channel scatter plot grids for the raw data and the normalized data. We will pick just one plex here so that the scatter plots are nice and large.
# set indices for experiment 4
exp4 <- 4:6
# plot the raw data
pairs.panels(log10(all_raw[exp4]), lm = TRUE, main = "Exp 4 - RAW")
# plot the SL norm data
pairs.panels(log10(all_sl[exp4]), lm = TRUE, main = "Exp 4 - SL norm")
# plot the median norm data
pairs.panels(log10(all_med[exp4]), lm = TRUE, main = "Exp 4 - Median Int norm")
# plot the TMM norm data
pairs.panels(log10(all_tmm[exp4]), lm = TRUE, main = "Exp 4 - TMM norm")
# plot the Quantile norm data
pairs.panels(log10(all_quantile[exp4]), lm = TRUE, main = "Exp 4 - Quantile norm")
What is changing (and what we cannot see too well) are the slopes of the trend lines. They will be closer to 1.0 for the normalized data. These single factor normalization methods do not affect things like correlation coefficients or the general appearance of the scatter plots.
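One way to see the slope effect numerically is to fit through-origin regressions on the linear intensity scale (columns 4 and 5 are two of the experiment 4 channels); the slope should move toward 1.0 after normalization:
# through-origin slope between two channels, before and after SL norm
coef(lm(pull(all_raw[5]) ~ 0 + pull(all_raw[4]))) # raw data
coef(lm(pull(all_sl[5]) ~ 0 + pull(all_sl[4]))) # SL normalized data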
We have the usual phenomena that the quality of the measurements gets worse as the abundance gets lower. There is more scatter in the data in the lower left parts of the scatter plots than in the upper right parts of the plots. We also have a few data points that are outliers (maybe an interfering ion in one channel) and some points where there was a missing value (the points near the x and y axis lines).
Remember, these samples are identical to each other whether they are in the same plex or not. We have extra reference channels in experiments 2, 4, and 6. Let's see how similar those channels are to each other before and after the different normalizations.
We will look at some distribution summaries first, then do some scatter plot grids. We will start with the raw data so we can see the effects of normalization. Note that the CV distribution boxplot now has a maximum value of 100% instead of 50% used above.
# extra channels from RAW data
extras <- c(1, 4, 7)
check_data(all_raw[extras], "Extra channels - raw")
Column Summaries (Extra channels - raw):
  TotInt_130C_Exp2 - tot: 1,960,333,996 med: 110,104 iqr: 371,355
  TotInt_130C_Exp4 - tot: 1,330,669,102 med: 81,571 iqr: 248,739
  TotInt_130C_Exp6 - tot: 1,180,655,739 med: 60,404 iqr: 216,738

Median CVs (Extra channels - raw):
  Extras - med: 39.41 iqr: 30.96
# check the effect of TMM norm on the extra channels
check_data(all_tmm[extras], "Extra channels - TMM norm")
Column Summaries (Extra channels - TMM norm):
  TotInt_130C_Exp2 - tot: 1,463,078,168 med: 82,175 iqr: 277,158
  TotInt_130C_Exp4 - tot: 1,369,453,052 med: 83,948 iqr: 255,989
  TotInt_130C_Exp6 - tot: 1,548,484,879 med: 79,223 iqr: 284,261

Median CVs (Extra channels - TMM norm):
  Extras - med: 28.34 iqr: 29.24
# check effect of quantile norm on extra channels
check_data(all_quantile[extras], "Extra channels - Quantile norm")
Column Summaries (Extra channels - Quantile norm):
  TotInt_130C_Exp2 - tot: 1,447,184,649 med: 82,548 iqr: 272,334
  TotInt_130C_Exp4 - tot: 1,447,185,166 med: 82,548 iqr: 272,334
  TotInt_130C_Exp6 - tot: 1,447,183,662 med: 82,548 iqr: 272,334

Median CVs (Extra channels - Quantile norm):
  Extras - med: 28.1 iqr: 28.59
Normalizations make the channels between plexes more similar (column totals and median intensities) and reduce the median protein CV compared to the raw data. NOTE: a 28% median CV is better than a 40% CV, but it is not good, and it is much worse than the 5% CVs seen between references in the same plex.
We seem to have some sort of fundamental difference in the data between plexes compared to data within a single plex. Let's move on to the scatter plot grids.
# raw data first
pairs.panels(log10(all_raw[extras]), lm = TRUE, main = "Extras - raw data")
# SL norm data
pairs.panels(log10(all_sl[extras]), lm = TRUE, main = "Extras - SL norm data")
# Median norm data
pairs.panels(log10(all_med[extras]), lm = TRUE, main = "Extras - Median norm data")
# TMM norm data
pairs.panels(log10(all_tmm[extras]), lm = TRUE, main = "Extras - TMM norm data")
# quantile norm data
pairs.panels(log10(all_quantile[extras]), lm = TRUE, main = "Extras - Quantile norm data")
The normalization methods are really dealing with global aspects of the data. They match distribution areas or centers or something else along those lines. What is strikingly different about within plex versus between plex data are CV values and the "scatter" in the scatter plots. Things like the intensity distribution boxplots, column totals, or median intensities completely hide the warts in the data when we are comparing between plexes.
Ironically, if we sum the reporter ions from different PSM scans into peptides and then into proteins within the same plex, we do not see any problems. That is because the sums are all done in parallel, and that maintains the relative reporter ion pattern between channels. There is a relationship between the PSMs, peptides, and proteins in each plex. This does not hold at any level between plexes. The PSMs in one plex have no relationship to the PSMs in a different plex. We will not have the same set of peptide features selected for fragmentation in each plex. Even if we have common features, the analyte abundance when the MS2 scan was taken will not be the same.
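A toy numeric illustration of why parallel sums preserve the channel pattern within a plex:
# two PSMs for the same protein: one near the XIC apex, one near the baseline
psm1 <- c(chanA = 100, chanB = 50) # 2:1 channel ratio
psm2 <- c(chanA = 800, chanB = 400) # also 2:1, just more intense
protein <- psm1 + psm2 # parallel sum into the protein total
protein["chanA"] / protein["chanB"] # the 2:1 pattern is preserved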
These dramatic differences in the nature of PSMs in each plex percolate up the food chain and cause differences for peptides and proteins. We know that our plexes are composed of related samples, so they cannot be completely unrelated to each other. We will mostly see the same things between plexes, and the sameness will be better for proteins than for peptides. Things like "match between runs" in MS1 label-free quantification exist because the eluting features are more similar between runs of related samples than the sets of MS2 identifications. Isobaric labeling is really a snapshot mass spec method, and we do not expect the snapshots between analyses of complex peptide digests to be robustly the same.
This is what is behind the ugly scatter plots above. Let's do a large scatter plot grid to emphasize the fundamental difference that happens when we transition from a single plex to multiple plexes.
pairs.panels(log10(all_tmm), lm = TRUE, main = "All 9 channels - TMM norm")
All 9 channels above are IDENTICAL. Let me rephrase that. All 9 channels above are IDENTICAL (sorry, I can't think of any other way to emphasize this). The first 3 channels have scatter plots with the points tight to the trend line. So do the middle 3 and the bottom 3. All scatter plots involving channels between different plexes are big, ugly blobs (BUBs - does giving something an acronym add an air of legitimacy?).
As mentioned at the top of this notebook, reference channels will let us measure how the MS2 sampling changed the data and let us correct that. Think of the reference channels as protein rulers. We will get the protein ruler measures in each plex from the references. We can then compare those measures between plexes and make them the same. This should put all measures on the same scale and make the BUBs go away.
# let's do IRS by hand
# make data frame with averages of the two references per plex
irs_factors <- data.frame(avePoolExp2 = rowMeans(all_sl[2:3]),
avePoolExp4 = rowMeans(all_sl[5:6]),
avePoolExp6 = rowMeans(all_sl[8:9]))
irs_factors$geomean <- apply(irs_factors, 1, function(x) exp(mean(log(x))))
irs_factors$fac2 <- irs_factors$geomean / irs_factors$avePoolExp2
irs_factors$fac4 <- irs_factors$geomean / irs_factors$avePoolExp4
irs_factors$fac6 <- irs_factors$geomean / irs_factors$avePoolExp6
head(irs_factors)
  avePoolExp2  avePoolExp4  avePoolExp6   geomean       fac2       fac4       fac6
1    88241253     90379710     89002630  89203484  1.0109045  0.9869857  1.0022567
2    67426323     51506798     41830805  52569045  0.7796517  1.0206234  1.2567065
3    69321161     50007849     62043810  59914806  0.8643076  1.1981080  0.9656855
4    31291016     53874357     62539103  47240861  1.5097260  0.8768710  0.7553812
5    63555144     36366115     55747537  50507720  0.7947070  1.3888676  0.9060081
6    49918700     48813069     55155104  51222609  1.0261207  1.0493626  0.9287012
# make new data frame with normalized data
all_irs <- all_sl[1:3] * irs_factors$fac2
all_irs <- cbind(all_irs, all_sl[4:6] * irs_factors$fac4)
all_irs <- cbind(all_irs, all_sl[7:9] * irs_factors$fac6)
# check the IRS normalized data
check_data(all_irs, "IRS norm data")
Column Summaries (IRS norm data):
  TotInt_130C_Exp2 - tot: 1,421,516,138 med: 78,398 iqr: 266,598
  TotInt_131N_Exp2 - tot: 1,421,418,210 med: 79,947 iqr: 259,272
  TotInt_131C_Exp2 - tot: 1,419,533,583 med: 78,383 iqr: 263,997
  TotInt_130C_Exp4 - tot: 1,421,224,887 med: 79,653 iqr: 266,126
  TotInt_131N_Exp4 - tot: 1,421,548,850 med: 78,728 iqr: 260,471
  TotInt_131C_Exp4 - tot: 1,419,402,943 med: 79,139 iqr: 262,995
  TotInt_130C_Exp6 - tot: 1,422,358,502 med: 76,292 iqr: 251,794
  TotInt_131N_Exp6 - tot: 1,420,271,663 med: 78,765 iqr: 261,836
  TotInt_131C_Exp6 - tot: 1,420,680,130 med: 79,421 iqr: 262,838

Median CVs (IRS norm data):
  exp2_CV - med: 4.11 iqr: 4.77
  exp4_CV - med: 4.45 iqr: 5.26
  exp6_CV - med: 4.85 iqr: 6.04
# make big scatter plots
pairs.panels(log10(all_irs[extras]), lm = TRUE, main = "Extras - IRS norm data")
# make scatter plots across all 3 plexes
pairs.panels(log10(all_irs), lm = TRUE, main = "All 9 channels - IRS norm")
The IRS method does just one thing - it removes MS2 sampling variation. Even though we took duplicate measures of the reference channel in each plex, the average of those measures still has some uncertainty. The uncertainties in the protein rulers in each plex do propagate into the final IRS-adjusted values. That is just the cost of admission in large isobaric labeling experiments. IRS may not be perfect, but there is really no alternative.
Is the data without IRS completely worthless? That is an interesting question. The BUB (big, ugly blobs) scatter plots still had 0.91 for correlation coefficients, which is a pretty large positive correlation. The BUB appearance and CV values indicate that there is a lot of variance in the data without IRS.
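We can put a number on that with the objects already defined above:
# correlation matrices for the extra channels, before and after IRS
cor(log10(all_tmm[extras])) # off-diagonal values around 0.91 (the BUBs)
cor(log10(all_irs[extras])) # much closer to 1.0 after IRS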
We know that these samples are the same, so we can take intensity ratios and look at the distributions of the log2 R values. Those will be bell shaped and their widths near the bases of the distributions will tell us what kind of ballpark fold-changes might be statistically significant when we compare biological samples. We will need to have bigger differences between biological replicates than we have between technical replicates to actually be able to measure anything.
# form some log2 ratio vectors and pack into a data frame for TMM norm data
log2R_tmm <- data.frame(log2R = log2(pull(all_tmm[1]) / pull(all_tmm[4])), exp = "Exp2.4")
log2R_tmm <- rbind(log2R_tmm, data.frame(log2R = log2(pull(all_tmm[1]) / pull(all_tmm[7])), exp = "Exp2.6"))
log2R_tmm <- rbind(log2R_tmm, data.frame(log2R = log2(pull(all_tmm[4]) / pull(all_tmm[7])), exp = "Exp4.6"))
# get some of the base plot done
hplot <- ggplot(log2R_tmm, aes(x = log2R, color = exp, fill = exp)) +
coord_cartesian(xlim = c(-3, 3)) +
ggtitle("log2R distributions: Extra Channels - TMM")
# look at combined data and separated by plex
hplot + geom_histogram(binwidth=0.1, alpha = 0.4, position = "identity")
hplot + geom_histogram(binwidth=0.3, alpha = 0.4, position = "identity") + facet_wrap(~exp)
We picked TMM normalized data. The distribution of log2 ratios is getting near the baseline at values greater than 2 or less than -2. That is in the ballpark of 4-fold changes or larger before we might be able to detect any differential expression.
Let's make the same plots after IRS.
# make a log2 ratio data frame
log2R_irs <- data.frame(log2R = log2(pull(all_irs[1]) / pull(all_irs[4])), exp = "Exp2.4")
log2R_irs <- rbind(log2R_irs, data.frame(log2R = log2(pull(all_irs[1]) / pull(all_irs[7])), exp = "Exp2.6"))
log2R_irs <- rbind(log2R_irs, data.frame(log2R = log2(pull(all_irs[4]) / pull(all_irs[7])), exp = "Exp4.6"))
# some base parts of plot
hplot <- ggplot(log2R_irs, aes(x = log2R, color = exp, fill = exp)) +
coord_cartesian(xlim = c(-3, 3)) +
ggtitle("log2R distributions: Extra Channels - IRS")
# set a few more parameters and show plots
hplot + geom_histogram(binwidth=0.05, alpha = 0.4, position = "identity")
hplot + geom_histogram(binwidth=0.1, alpha = 0.4, position = "identity") + facet_wrap(~exp)
# expand the x-axis more
ggplot(log2R_irs, aes(x = log2R, color = exp, fill = exp)) +
coord_cartesian(xlim = c(-1, 1)) +
geom_histogram(binwidth=0.05, alpha = 0.4, position = "identity") +
ggtitle("log2R distributions: Extra Channels - IRS")
The base of the distribution for IRS data is getting near the baseline at values of 0.5 or -0.5. That is the square root of 2, or about 1.4-fold changes, for differential expression. The difference between 4-fold and 1.4-fold changes in expression studies is pretty gigantic. Data analysis without IRS will only be able to find differential expression if there are large expression differences.
We were able to see how similar technical replicates were within single TMT plexes and between TMT plexes. Within-TMT replicates were highly similar to each other and had highly correlated, very tight scatter plots. They had median protein CVs of about 4.5% for these data.
Technical replicate channels between different TMT plexes, with a wide range of conventional normalization methods, had much broader scatter plots and a median CV of almost 29%.
IRS effectively removed random sampling effects between TMT plexes while preserving the typical within-TMT data characteristics. Median CVs for the three extra channels after IRS were 5.5%, comparable to within-TMT values. Scatter plots of extra reference channels from different plexes were poor without IRS and looked nearly as good as scatter plots of reference channels within the same plex after the IRS method.
Batch effects are recognized in microarray and next generation sequencing experiments. This is a broad topic that we will not try to review here. Here is how it goes in practice. You do standard normalizations to remove common sources of variation from the data. You do a cluster or PCA analysis to see if your data separates into the expected biological conditions. If the data separates some other way that has some logical interpretation (reagent batch, dates of experiments, etc.), then there are batch effects. Batch effect corrections usually do more micro data adjustments than normalization methods.
There are lots of methods to try to find and correct batch effects. I have used clustering views in other notebooks and in a 2018 ASMS poster. Data without IRS will cluster by TMT plex. In some ways, TMT experiments involving multiple plexes look like a batch effects problem. Many batch correction methods rely on balanced study designs where the average of each plex serves as a proxy for the protein ruler provided by the internal reference channels. This can work for TMT experiments provided the balanced study design assumption is met. The CONSTANd method is along these lines.
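A minimal sketch of that kind of check using the nine reference channels from this notebook (based on the claim above, the channels should separate by plex before IRS and not after):
# PCA of the channels (transposing makes the channels the observations)
pca_sl <- prcomp(t(log10(all_sl)))
plot(pca_sl$x[, 1:2], pch = 19,
     col = rep(c("red", "blue", "green"), each = 3),
     main = "Reference channels before IRS (colored by plex)")
pca_irs <- prcomp(t(log10(all_irs)))
plot(pca_irs$x[, 1:2], pch = 19,
     col = rep(c("red", "blue", "green"), each = 3),
     main = "Reference channels after IRS (colored by plex)")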
IRS has a couple of important differences from balanced study design batch correction. First, allocation of samples to plexes does not have to be balanced. Second, the data used for "batch" correction is mathematically independent from the biological sample data. This reduces confounding in the statistical modeling.
There may be other data analysis methods that will work in this domain. I have not yet seen anything else that I think will work. However, please do not take my word for this. The point of this (long) notebook is that your data is perfectly capable of telling you how to analyze it. Notebooks and R make it possible to communicate with your data and make sure that your analysis reflects reality (do measurements of the same thing give the same values?).
# log session
sessionInfo()
R version 3.5.3 (2019-03-11)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS 10.16

Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats graphics grDevices utils datasets methods base

other attached packages:
 [1] preprocessCore_1.44.0 robustbase_0.93-6     edgeR_3.24.3
 [4] limma_3.38.3          psych_2.0.7           forcats_0.5.0
 [7] stringr_1.4.0         dplyr_1.0.2           purrr_0.3.4
[10] readr_1.3.1           tidyr_1.1.1           tibble_3.0.3
[13] ggplot2_3.3.2         tidyverse_1.3.0

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.5       locfit_1.5-9.4   lubridate_1.7.9  lattice_0.20-41
 [5] assertthat_0.2.1 digest_0.6.25    IRdisplay_0.7.0  R6_2.4.1
 [9] cellranger_1.1.0 repr_1.1.0       backports_1.1.8  reprex_0.3.0
[13] evaluate_0.15    httr_1.4.2       pillar_1.4.6     rlang_1.0.2
[17] uuid_0.1-4       readxl_1.3.1     rstudioapi_0.11  blob_1.2.1
[21] labeling_0.3     munsell_0.5.0    broom_0.7.0      compiler_3.5.3
[25] modelr_0.1.8     pkgconfig_2.0.3  base64enc_0.1-3  mnormt_1.5-6
[29] htmltools_0.4.0  tidyselect_1.1.0 crayon_1.3.4     dbplyr_1.4.4
[33] withr_2.2.0      grid_3.5.3       nlme_3.1-148     jsonlite_1.7.0
[37] gtable_0.3.0     lifecycle_0.2.0  DBI_1.1.0        magrittr_1.5
[41] scales_1.1.1     cli_3.3.0        stringi_1.4.6    farver_2.0.3
[45] fs_1.5.0         xml2_1.3.2       ellipsis_0.3.2   generics_0.0.2
[49] vctrs_0.4.1      IRkernel_1.1.1   tools_3.5.3      glue_1.6.2
[53] DEoptimR_1.0-8   hms_0.5.3        parallel_3.5.3   colorspace_1.4-1
[57] rvest_0.3.6      pbdZMQ_0.3-3     haven_2.3.1