Differential expression analysis with DESeq2 to determine divergence in expression level

library(DESeq2) # DESeq2 depends on IRanges, GenomicRanges,

## Loading required package: S4Vectors

## Loading required package: stats4

## Loading required package: BiocGenerics

## Loading required package: generics

## 
## Attaching package: 'generics'

## The following objects are masked from 'package:igraph':
## 
##     components, union

## The following object is masked from 'package:dplyr':
## 
##     explain

## The following objects are masked from 'package:base':
## 
##     as.difftime, as.factor, as.ordered, intersect,
##     is.element, setdiff, setequal, union

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:huxtable':
## 
##     width, width<-

## The following objects are masked from 'package:igraph':
## 
##     normalize, path

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame,
##     basename, cbind, colnames, dirname, do.call,
##     duplicated, eval, evalq, Filter, Find, get,
##     grep, grepl, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int,
##     pmin, pmin.int, Position, rank, rbind, Reduce,
##     rownames, sapply, saveRDS, table, tapply,
##     unique, unsplit, which.max, which.min

## 
## Attaching package: 'S4Vectors'

## The following objects are masked from 'package:Matrix':
## 
##     expand, unname

## The following objects are masked from 'package:data.table':
## 
##     first, second

## The following objects are masked from 'package:dplyr':
## 
##     first, rename

## The following object is masked from 'package:tidyr':
## 
##     expand

## The following object is masked from 'package:utils':
## 
##     findMatches

## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname

## Loading required package: IRanges

## 
## Attaching package: 'IRanges'

## The following object is masked from 'package:data.table':
## 
##     shift

## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice

## The following object is masked from 'package:purrr':
## 
##     reduce

## Loading required package: GenomicRanges

## Loading required package: GenomeInfoDb

## Loading required package: SummarizedExperiment

## Loading required package: MatrixGenerics

## 
## Attaching package: 'MatrixGenerics'

## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet,
##     colCollapse, colCounts, colCummaxs,
##     colCummins, colCumprods, colCumsums, colDiffs,
##     colIQRDiffs, colIQRs, colLogSumExps,
##     colMadDiffs, colMads, colMaxs, colMeans2,
##     colMedians, colMins, colOrderStats, colProds,
##     colQuantiles, colRanges, colRanks, colSdDiffs,
##     colSds, colSums2, colTabulates, colVarDiffs,
##     colVars, colWeightedMads, colWeightedMeans,
##     colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys,
##     rowAvgsPerColSet, rowCollapse, rowCounts,
##     rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs,
##     rowLogSumExps, rowMadDiffs, rowMads, rowMaxs,
##     rowMeans2, rowMedians, rowMins, rowOrderStats,
##     rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates,
##     rowVarDiffs, rowVars, rowWeightedMads,
##     rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars

## Loading required package: Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view
##     with 'browseVignettes()'. To cite
##     Bioconductor, see 'citation("Biobase")', and
##     for packages 'citation("pkgname")'.

## 
## Attaching package: 'Biobase'

## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians

## The following object is masked from 'package:huxtable':
## 
##     contents

## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians

## 
## Attaching package: 'DESeq2'

## The following object is masked _by_ '.GlobalEnv':
## 
##     collapseReplicates

# and S4Vectors, which include their own functions
# that share names with common tidyverse functions
# such as reduce and rename (which causes me problems)
# So loading it before tidyverse packages so that
# its functions get over-written by purrr::reduce and dplyr::rename
sapply(c("dplyr", "purrr", "tidyr", "ggpubr", "readr",
         "data.table", "ggplot2", "data.table",
         "matrixStats", "ggpubr"), require, character.only=TRUE)

##       dplyr       purrr       tidyr      ggpubr 
##        TRUE        TRUE        TRUE        TRUE 
##       readr  data.table     ggplot2  data.table 
##        TRUE        TRUE        TRUE        TRUE 
## matrixStats      ggpubr 
##        TRUE        TRUE

library(MASS, include.only = "glm.nb")
source("functions_for_figure_scripts.R")
load("data_files/GLM_Counts.RData")
ExperimentNames <- c("CC", "HAP4", "LowN", "LowPi", "Heat", "Cold")

Functions used in this script

# fits one model to detect significant allele effect on one gene from one environment-specific dataset
# @input: single gene df (counts and all columns of sample info) and name of gene in one experiment
# @output: model generated by glm.nb (or NA if any error or warning is thrown)
fitSingleGeneModel <- function(.df, .gene_name) {
  form <- formula(expr ~ time_point_num + allele)
  output <- tryCatch({glm.nb(form, .df, link = log, init.theta = 7)
  }, error = function(e) {
    return(NA)
  })
  return(output)
}

# Tests for fitSingleGeneModel
# re-run this block to your heart's desire
random_gene <- sample(rownames(spcts[[1]]), 1) # YOR222W is a good example of a gene with one outlier measurement that shouldn't be called as diverged
random_dataset_idx <- sample(c(1:length(ExperimentNames)), 1)
random_cts <- spcts[[random_dataset_idx]]
random_info <- spinfo[[random_dataset_idx]]
test_genedf <- bind_cols(tibble(expr = random_cts[random_gene,],
                                gene_name = random_gene,
                                allele_or_species = "species"),
                         random_info)
spmod <- fitSingleGeneModel(test_genedf, random_gene)
summary(spmod)

## 
## Call:
## glm.nb(formula = form, data = .df, init.theta = 9.78619102, link = log)
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)
## (Intercept)     8.922e+00  3.635e-02 245.450  < 2e-16
## time_point_num -4.014e-04  5.281e-05  -7.601 2.93e-14
## allelecer       4.300e-03  4.635e-02   0.093    0.926
##                   
## (Intercept)    ***
## time_point_num ***
## allelecer         
## ---
## Signif. codes:  
## 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(9.7862) family taken to be 1)
## 
##     Null deviance: 249.25  on 190  degrees of freedom
## Residual deviance: 194.26  on 188  degrees of freedom
## AIC: 3458.8
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  9.786 
##           Std. Err.:  0.986 
## 
##  2 x log-likelihood:  -3450.837

# note that the default residuals from glm.nb are working residuals, less intuitive than response residuals (actual - fitted value)
spmod$residuals[1]

##          1 
## -0.2229863

resid(spmod, type = "working")[1]

##          1 
## -0.2229863

resid(spmod, type = "response")[1]

##         1 
## -1638.648

test_genedf$expr[1] - spmod$fitted.values[1]

## WT_TP2_cer_C1_G6_G12_GS1 
##                -1638.648

# given dataframe of all gene's counts, plots Scer expression vs Spar expression 
# for average of replicates at each timepoint (or moving average for LowPi and HAP4)
plotSingleGeneModelExample <- function(.genedf, .gene_name, .fitted_vals = FALSE, .log2 = TRUE,
                                       .size = 3) {
  .mod <- fitSingleGeneModel(.genedf, random_gene)
  sprow <- summary(.mod)$coefficients["allelecer",, drop = FALSE]
  fold_change <- exp(sprow[1])
  # plotting (averaging each timepoint down to a single expr value per species)
  # if no replicates to average, taking a sliding window of 5
  genedf_collapsed <- group_by(.genedf, time_point_num, allele) |> 
    summarise(expr = mean(expr)) |> ungroup()
  if (unique(.genedf$experiment) %in% c("LowPi", "HAP4")) {
    cat("no replicates for this experiment, getting moving average\n")
    genedf_cer <- genedf_collapsed |> filter(allele == "cer") |> 
      arrange(time_point_num)
    genedf_par <- genedf_collapsed |> filter(allele == "par") |> 
      arrange(time_point_num)
    cts_cer <- matrix(as.numeric(genedf_cer$expr), 
                      nrow = 1, ncol = nrow(genedf_cer))
    colnames(cts_cer) <- as.character(genedf_cer$time_point_num)
    cts_par <- matrix(as.numeric(genedf_par$expr), 
                      nrow = 1, ncol = nrow(genedf_par))
    colnames(cts_par) <- as.character(genedf_par$time_point_num)
    movavg_cer <- getMovingAverage(cts_cer)
    movavg_par <- getMovingAverage(cts_par)
    movavg <- bind_rows(tibble(expr = as.numeric(movavg_cer),
                               allele = "cer",
                               time_point_num = as.numeric(colnames(movavg_cer))),
                        tibble(expr = as.numeric(movavg_par),
                               allele = "par",
                               time_point_num = as.numeric(colnames(movavg_par))))
    genedf_collapsed <- movavg
  }
  plotdf <- .genedf %>% mutate(fittedvals = .mod$fitted.values) %>% 
    select(allele, time_point_num, fittedvals) %>% 
    unique() |> 
    left_join(y = genedf_collapsed, by = c("time_point_num", "allele")) |> 
    pivot_longer(cols = c(expr, fittedvals)) %>% 
    pivot_wider(id_cols = c(time_point_num, name), 
                names_from = allele, values_from = value)
  if (!.log2) {
    slope <- 1/fold_change # inverse b/c Scer is conventionally put on the x axis but also conventionally has positive lfc
    norm_func <- identity
    units_text <- "(cpm)"
  }
  if (.log2) { # technically the model is a subtly curved line now,
    # dx/dy = log(x, base = y),
    # but as the fitted values tend to be bunched in one place,
    # we can summarise them as a line
    slope <- map2(plotdf$cer[plotdf$name == "fittedvals"], 
                  plotdf$par[plotdf$name == "fittedvals"], \(x, y) {
      return(log(y, base = x))
    }) |> unlist() |> mean()
    norm_func <- log2
    units_text <- "(log2(cpm))"
  }
  max_expr <-  filter(plotdf, name != "fittedvals") |> select(cer, par) |> 
    unlist() |> as.numeric() |> max(na.rm = TRUE) |> norm_func()
  min_expr <-  filter(plotdf, name != "fittedvals") |> select(cer, par) |> 
    unlist() |> as.numeric() |> min(na.rm = TRUE) |> norm_func()
  cat("min and max expression:", max_expr, min_expr, "\n")
  if (.fitted_vals) {
    p <- ggplot(plotdf, aes(x = norm_func(cer), y = norm_func(par))) + 
      geom_point(aes(color = name), size = .size) +
      geom_abline(color = "gold", slope = slope, intercept = 0) + 
      geom_abline(color = "navy", slope = 1, intercept = 0) + 
      ggtitle(paste("slope = ", round(slope, 2), 
                    "\nl2fc = ", round(log2(fold_change), 2),
                    "\npval = ", round(sprow[4] + 5*10^(-5), 4))) + # this is essentially ceiling(sprow[4], digits = 4) except ceiling doesn't allow decimals
      xlim(c(min_expr, max_expr)) + 
      ylim(c(min_expr, max_expr)) + 
      xlab(paste("Scer expression", units_text)) +
      ylab(paste("Spar expression", units_text)) +
      theme_classic()
  }
  if (!.fitted_vals) {
    # filtering down to 11 tps for scale_color_brewer (ever other one then last 8)
    ntps <- length(unique(plotdf$time_point_num))
    if (ntps > 11) {
      good_tps <- unique(plotdf$time_point_num)[order(unique(plotdf$time_point_num))][c(1, 3, 5, (ntps-7):ntps)]
      plotdf <- filter(plotdf, time_point_num %in% good_tps)
    }
    plotdf$time_point_num <- as.factor(plotdf$time_point_num)
    p <- ggplot(filter(plotdf, name != "fittedvals"), aes(x = norm_func(cer), y = norm_func(par))) + 
      geom_point(aes(color = time_point_num), size = .size) +
      scale_color_brewer(type = "div",
                         palette = "RdYlGn",
                         direction = 1) +
      geom_abline(color = "gold", slope = slope, intercept = 0) + 
      geom_abline(color = "navy", slope = 1, intercept = 0) + 
      ggtitle(paste("slope = ", round(slope, 2), 
                    "\nl2fc = ", round(log2(fold_change), 2),
                    "\npval = ", round(sprow[4] + 5*10^(-5), 4))) + # this is essentially ceiling(sprow[4], digits = 4) except ceiling doesn't allow decimals
      xlim(c(min_expr, max_expr)) + 
      ylim(c(min_expr, max_expr)) + 
      xlab(paste("Scer expression", units_text)) +
      ylab(paste("Spar expression", units_text)) +
      theme_classic()
  }
  return(p)
}
plotSingleGeneModelExample(test_genedf, random_gene, 
                           .log2 = TRUE, .fitted_vals = TRUE)

## `summarise()` has grouped output by 'time_point_num'.
## You can override using the `.groups` argument.

## min and max expression: 13.1152 11.88595

plotSingleGeneModelExample(test_genedf, random_gene, 
                           .log2 = FALSE, .fitted_vals = FALSE)

## `summarise()` has grouped output by 'time_point_num'.
## You can override using the `.groups` argument.

## min and max expression: 8872.971 3784.677

Plotting example genes for figure panel

# plot of example gene diverging in level
gene_idx <- "YBR067C" # decided in Saturated Growth script
test_genedf <- bind_cols(tibble(expr = spcts$HAP4[gene_idx,],
                                gene_name = gene_idx,
                                allele_or_species = "species"),
                         spinfo$HAP4)
p <- plotSingleGeneModelExample(test_genedf, gene_idx, .log2 = FALSE) +
  theme(legend.position = "none")

## `summarise()` has grouped output by 'time_point_num'.
## You can override using the `.groups` argument.

## no replicates for this experiment, getting moving average
## min and max expression: 2213 38.6

## Warning: Removed 1 row containing missing values or values
## outside the scale range (`geom_point()`).

pdf("paper_figures/ExperimentOverview/level_lfc.pdf",
    width = 2.1, height = 2.5)
p

## Warning: Removed 1 row containing missing values or values
## outside the scale range (`geom_point()`).

dev.off()

## quartz_off_screen 
##                 2

# plot of example gene diverging in dynamics, showing high noise but going in different directions at different timepoints
gene_idx <- "YJR001W" # decided in Saturated Growth script
test_genedf <- bind_cols(tibble(expr = spcts$HAP4[gene_idx,],
                                gene_name = gene_idx,
                                allele_or_species = "species"),
                         spinfo$HAP4)
p <- plotSingleGeneModelExample(test_genedf, gene_idx, .log2 = FALSE) +
  theme(legend.position = "none")

## `summarise()` has grouped output by 'time_point_num'.
## You can override using the `.groups` argument.

## no replicates for this experiment, getting moving average
## min and max expression: 163.6 45.33333

## Warning: Removed 1 row containing missing values or values
## outside the scale range (`geom_point()`).

pdf("paper_figures/ExperimentOverview/dynamics_lfc.pdf",
    width = 2.1, height = 2.5)
p

## Warning: Removed 1 row containing missing values or values
## outside the scale range (`geom_point()`).

dev.off()

## quartz_off_screen 
##                 2

fitDatasetModels <- function(.cts, .info) {
  output <- map(rownames(.cts), function(g) {
    #cat(which(rownames(.cts) == g), "/", nrow(.cts), "\n")
    #cat("currently processing", g, "\n")
    gdf <- bind_cols(tibble(expr = .cts[g,]),
                     .info)
    mod <- fitSingleGeneModel(gdf, g)
    return(mod)
  })
  return(output)
}
# test for fitDatasetModels (just taking the first 10 genes from a random dataset)
random_dataset_idx <- sample(c(1:length(ExperimentNames)), 1)
test <- fitDatasetModels(spcts[[random_dataset_idx]][1:10,], spinfo[[random_dataset_idx]]) # if it runs, it passes this test

# convert one model into one row of a dataframe summarizing that model's output (gene name, name of coefficient, effect size of coefficient, pvalue for wald test of that coefficient)
# also changes lfc to l2fc using change of base: log2(fc) = log(fc)/log(2)
parseDfRowFromModel <- function(.mod, .gene_name) {
  #cat("currently processing", .gene_name, "\n")
  allelerow <- summary(.mod)$coefficients["allelecer",, drop = FALSE]
  result <- tibble(gene_name = .gene_name,
                   coefficient = gsub("allele", "", rownames(allelerow)), # should always have the non-reference level (cer if par is set to reference), but this just helps us be more sure
                   effect_size = as.numeric(allelerow[,"Estimate"])/log(2),
                   pvalue = as.numeric(allelerow[,"Pr(>|z|)"]))
  return(result)
}
# test for parseDfRowFromModel
random_dataset_idx <- sample(c(1:length(ExperimentNames)), 1)
random_modslist <- fitDatasetModels(spcts[[random_dataset_idx]][1:10,], spinfo[[random_dataset_idx]]) # same test from fitDatasetModels, we need the output for this test
gene_idx <- sample(c(1:10), 1)
parseDfRowFromModel(random_modslist[[gene_idx]], gene_idx)

gene_name	coefficient	effect_size	pvalue
9	cer	0.37	0.179

parseDfFromModslist <- function(.modslist) {
  good_idxs <- which(!is.na(.modslist))
  result <- map2(.modslist[good_idxs], names(.modslist)[good_idxs], function(mod, gname) {
    return(parseDfRowFromModel(mod, gname))
  }) |> purrr::reduce(.f = bind_rows)
  return(result)
}
# test for parseDfFromModslist
random_dataset_idx <- sample(c(1:length(ExperimentNames)), 1)
random_modslist <- fitDatasetModels(spcts[[random_dataset_idx]][1:10,], spinfo[[random_dataset_idx]]) # same test from fitDatasetModels, we need the output for this test
names(random_modslist) <- c(1:10)
test <- parseDfFromModslist(random_modslist)
test

gene_name	coefficient	effect_size	pvalue
1	cer	0.13	0.409
2	cer	-0.394	0.0516
3	cer	0.0573	0.777
4	cer	0.324	0.0722
5	cer	0.975	0.000115
6	cer	0.401	0.00904
7	cer	0.543	1.47e-07
8	cer	0.123	0.65
9	cer	1.22	3.13e-11
10	cer	0.796	1.58e-15

Full model fitting

# applying to each experiment (takes ~1-2 min per dataset)
spmods <- map2(spcts, spinfo, fitDatasetModels)
spmods <- map2(spmods, spcts, function(x, y) {
  names(x) <- rownames(y)
  return(x)
})

# applying parseDfFromModslist to spmods
spdfs <- lapply(spmods, parseDfFromModslist)

# adding experiment name column so the 4 datasets can be combined
spdfs <- map2(spdfs, names(spdfs), function(x, y) {
  x$experiment <- y
  return(x)
})
spdf <- purrr::reduce(.f = bind_rows, spdfs)

# renaming coefficient so that the species models can be added to allele models in aldf below
spdf$coefficient <- "species"

# applying to each experiment (takes ~1-2 min per dataset)
almods <- map2(alcts, alinfo, fitDatasetModels)
almods <- map2(almods, alcts, function(x, y) {
  names(x) <- rownames(y)
  return(x)
})

# applying parseDfFromModslist and parseAvgCompsFromModslist to spmods
aldfs <- lapply(almods, parseDfFromModslist)

# cleaning some column values so aldfs and alavgs can be combined into one df
# adding experiment name column
aldfs <- map2(aldfs, names(aldfs), function(x, y) {
  x$experiment <- y
  return(x)
})

aldf <- purrr::reduce(.f = bind_rows, aldfs)
# renaming coefficient
aldf$coefficient <- "allele"

spaldf <- full_join(aldf, spdf, by = c("gene_name", "experiment", "coefficient", "effect_size", "pvalue"))

# slightly different nGenes in each experiment b/c different genes were filtered out by low expression in the data script
spaldf %>% filter(coefficient == "species") %>% select(experiment) %>% table()

## experiment
##    CC  Cold  HAP4  Heat  LowN LowPi 
##  3960  4052  4290  4172  4412  4386

spaldf %>% filter(coefficient == "allele") %>% select(experiment) %>% table()

## experiment
##    CC  Cold  HAP4  Heat  LowN LowPi 
##  3960  4052  4289  4172  4412  4387

# creating genedf, where each row is one gene
# (this isn't used much in the final code version, but it is used in a few places, so keeping it for continuity)
genedf <-  spaldf |> 
  pivot_wider(names_from = c("experiment", "coefficient"), values_from = c("effect_size", "pvalue"), id_cols = c("gene_name"))
table(genedf$gene_name) |> table() # checking that every gene is only represented once in genedf

## 
##    1 
## 4863

Saving

There are still some QC/exploration plots below, but none of it affects the output of this script, so we are exporting models here:

save(spcts, spinfo, alcts, alinfo, spaldf, genedf, file = "data_files/single_gene_models.RData")
load(file = "data_files/single_gene_models.RData")

Dataframe cleaning and QC

# Valuable functions for examining individual genes
getExprVector <- function(.gene_name, .organism = "cer", 
                          .experiment = c("LowN", "CC", "HAP4", "LowPi")) {
  if (.organism %in% c("cer", "par")) {
    common_idxs <- purrr::reduce(map(spcts[.experiment], rownames), .f = intersect)
    if(!.gene_name %in% common_idxs) {
      return(NULL)
    }
    info <- spinfo[.experiment] |> purrr::reduce(.f = bind_rows) # allows for multiple experiments
    cts <- map(spcts[.experiment], \(x) x[common_idxs,]) |> purrr::reduce(.f = cbind)
    expr <- cts[.gene_name, info$organism == .organism]
  }
  if (.organism == "hyb") {
    common_idxs <- purrr::reduce(map(alcts[.experiment], rownames), .f = intersect)
    if(!.gene_name %in% common_idxs) {
      return(NULL)
    }
    info <- alinfo[.experiment] |> purrr::reduce(.f = bind_rows)
    cts <- map(alcts[.experiment], \(x) x[common_idxs,]) |> purrr::reduce(.f = cbind)
    expr_hyc <- cts[.gene_name, info$organism == .organism & info$allele == "cer"]
    expr_hyp <- cts[.gene_name, info$organism == .organism & info$allele == "par"]
    expr <- expr_hyc + expr_hyp
  }
  if (.organism == "hyc") {
    common_idxs <- purrr::reduce(map(alcts[.experiment], rownames), .f = intersect)
    if(!.gene_name %in% common_idxs) {
      return(NULL)
    }
    info <- alinfo[.experiment] |> purrr::reduce(.f = bind_rows)
    cts <- map(alcts[.experiment], \(x) x[common_idxs,]) |> purrr::reduce(.f = cbind)
    expr <- cts[.gene_name, info$organism == "hyb" & info$allele == "cer"]
  }
  if (.organism == "hyp") {
    common_idxs <- purrr::reduce(map(alcts[.experiment], rownames), .f = intersect)
    if(!.gene_name %in% common_idxs) {
      return(NULL)
    }
    info <- alinfo[.experiment] |> purrr::reduce(.f = bind_rows)
    cts <- map(alcts[.experiment], \(x) x[common_idxs,]) |> purrr::reduce(.f = cbind)
    expr <- cts[.gene_name, info$organism == "hyb" & info$allele == "par"]
  }
  return(expr)
}
# tests for getExprVector (use .experiment = ExperimentNames to get counts from all 4 experiments)
# this commented-out line should hit an error if run (TDH3 isn't in Heat/Cold):
# cer_expr <- getExprVector(.gene_name = "YGR192C", .organism = "cer", .experiment = ExperimentNames)
# should work:
cer_expr <- getExprVector(.gene_name = "YGR192C", .organism = "cer", .experiment = c("CC", "HAP4", "LowN", "LowPi"))
par_expr <- getExprVector(.gene_name = "YGR192C", .organism = "par", .experiment = c("CC", "HAP4", "LowN", "LowPi"))
hyc_expr <- getExprVector(.gene_name = "YGR192C", .organism = "hyc", .experiment = c("CC", "HAP4", "LowN", "LowPi"))
hyp_expr <- getExprVector(.gene_name = "YGR192C", .organism = "hyp", .experiment = c("CC", "HAP4", "LowN", "LowPi"))
tibble(organism = c("cer", "par", "hyc", "hyp"),
       variance = map_dbl(list(cer_expr, par_expr, hyc_expr, hyp_expr), var, na.rm = TRUE),
       mean = map_dbl(list(cer_expr, par_expr, hyc_expr, hyp_expr), mean, na.rm = TRUE),
       median = map_dbl(list(cer_expr, par_expr, hyc_expr, hyp_expr), median, na.rm = TRUE))

organism	variance	mean	median
cer	5.94e+07	2.13e+04	2.03e+04
par	4.03e+06	4.12e+03	4.01e+03
hyc	5.77e+07	1.86e+04	1.78e+04
hyp	2.65e+06	4.71e+03	4.66e+03

getGeneDf <- function(.gene_name, .mode = c("parents", "hybrid"), 
                      .experiment = c("LowN", "CC", "HAP4", "LowPi")) {
  if (.mode == "parents") {
    info_cer <- spinfo[.experiment] |> purrr::reduce(.f = bind_rows) |> filter(allele == "cer")
    info_par <- spinfo[.experiment] |> purrr::reduce(.f = bind_rows) |> filter(allele == "par")
    cts_cer <- getExprVector(.gene_name, .experiment = .experiment, .organism = "cer")
    cts_par <- getExprVector(.gene_name, .experiment = .experiment, .organism = "par")
  }
  if (.mode == "hybrid") {
    info_cer <- alinfo[.experiment] |> purrr::reduce(.f = bind_rows) |> filter(allele == "cer")
    info_par <- alinfo[.experiment] |> purrr::reduce(.f = bind_rows) |> filter(allele == "par")
    cts_cer <- getExprVector(.gene_name, .experiment = .experiment, .organism = "hyc")
    cts_par <- getExprVector(.gene_name, .experiment = .experiment, .organism = "hyp")
  }
  if (is.null(cts_cer) | is.null(cts_par)) {
    return(NULL)
  }
  genedf <- bind_rows(bind_cols(tibble(expr = cts_cer), 
                                select(info_cer, experiment, time_point_num, allele)), 
                      bind_cols(tibble(expr = cts_par), 
                                select(info_par, experiment, time_point_num, allele))) |> 
    summarise(expr = mean(expr), .by = c("experiment", "time_point_num", "allele")) |> 
    pivot_wider(id_cols = c("experiment", "time_point_num"),
                names_from = allele, values_from = expr)
  return(drop_na(genedf))
}
# tests for getGeneDf
test <- NULL
while (is.null(test)) {
  gene_idx <- sample(rownames(spcts[[1]]), 1)
  test <- getGeneDf(gene_idx, .mode = "parents", .experiment = c("CC", "HAP4", "LowN", "LowPi"))
}
ggplot(test, aes(x = cer, y = par)) + geom_point(aes(color = experiment)) + ggtitle(gene_idx) + geom_abline(color = "gold")

test <- getGeneDf(gene_idx, .mode = "hybrid", .experiment = c("CC", "HAP4", "LowN", "LowPi")) # hybrid should be less variable because cer and par measurements were collected in the same cell
ggplot(test, aes(x = cer, y = par)) + geom_point(aes(color = experiment)) + ggtitle(gene_idx) + geom_abline(color = "gold")

spaldf %>% filter(gene_name == gene_idx)

gene_name	coefficient	effect_size	pvalue	experiment
YGL161C	allele	0.216	0.0115	LowN
YGL161C	allele	0.481	0.0351	CC
YGL161C	allele	0.403	0.117	HAP4
YGL161C	allele	0.692	1.67e-08	LowPi
YGL161C	allele	0.378	0.151	Heat
YGL161C	allele	0.451	0.00669	Cold
YGL161C	species	0.598	1.72e-12	LowN
YGL161C	species	0.457	0.000143	CC
YGL161C	species	0.273	0.325	HAP4
YGL161C	species	0.923	2.16e-06	LowPi
YGL161C	species	0.615	7.56e-05	Heat
YGL161C	species	0.846	6.58e-08	Cold

# Visualizing expression variability and divergence in...
# A) boxplots (divergence readily apparent, variability hardly apparent) 
# B) scatterplots (variability more apparent, divergence also apparent, so might just stick to scatterplots alone)
visualizeGeneExpressionBoxplot <- function(.name, .experiment = c("CC", "HAP4", "LowN", "LowPi")) {
  aldf <- getGeneDf(.name, "hybrid", .experiment = .experiment)
  spdf <- getGeneDf(.name, "parents", .experiment = .experiment)
  aldf$allele_or_species <- "allele"
  spdf$allele_or_species <- "species"
  plotdf <- bind_rows(aldf, spdf) %>% pivot_longer(cols = c("cer", "par"), names_to = "allele", values_to = "expr")
  output <- ggplot(plotdf, aes(x = paste(allele_or_species, allele), y = log(expr))) + 
    geom_boxplot(aes(fill = paste(allele_or_species, allele))) + 
    ggtitle(.name) + xlab(element_blank()) + ylab("log(expression level)") + 
    scale_x_discrete(limits = c("species cer", "species par", "allele cer", "allele par")) + # this switches the order
    theme_classic() + scale_fill_manual(breaks = c("species cer", "species par", "allele cer", "allele par"), # this sets the colors to match (yes you need breaks even though you've just specified the order above)
                                        values = c("gold", "mediumorchid", "lemonchiffon", "plum")) +
    theme(legend.position = "none")
  return(output)
}
visualizeGeneExpressionBoxplot("YGR192C")

visualizeGeneExpressionScatterplot <- function(.gdf, .plotname = "individual gene counts", .color_from = "experiment", .log = FALSE) {
  if (!.log) {
    max_expr <- max(c(.gdf$cer, .gdf$par), na.rm = TRUE)
    output <- ggplot(.gdf, aes(x = cer, y = par)) + 
      geom_point(aes(color = pull(.gdf[, as.character(.color_from)]))) + geom_abline(color = "gold") +
      ggtitle(.plotname) + xlab("expression level in cerevisiae") + ylab("expression level in paradoxus") + 
      theme_classic() + xlim(c(0, max_expr)) + ylim(c(0, max_expr)) + theme(legend.title = element_blank())
    return(output)
  }
  if (.log) {
    max_expr <- max(c(log(.gdf$cer), log(.gdf$par)), na.rm = TRUE)
    output <- ggplot(.gdf, aes(x = log(cer), y = log(par))) + 
      geom_point(aes(color = pull(.gdf[, as.character(.color_from)]))) + geom_abline(color = "gold") +
      ggtitle(.plotname) + xlab("expression level in cerevisiae") + ylab("expression level in paradoxus") + 
      theme_classic() + xlim(c(0, max_expr)) + ylim(c(0, max_expr)) + theme(legend.title = element_blank())
    return(output)
  }
}
# TDH3
getGeneDf("YGR192C", .mode = "parents") %>% visualizeGeneExpressionScatterplot(.plotname = "YGR192C - parents")

getGeneDf("YGR192C", .mode = "hybrid") %>% visualizeGeneExpressionScatterplot(.plotname = "YGR192C - hybrid")

getGeneDf("YGR192C", .mode = "parents") %>% visualizeGeneExpressionScatterplot(.plotname = "YGR192C - parents - log scale", .log = TRUE)

# random gene
test <- NULL
test_hyb <- NULL
while (is.null(test) | is.null(test_hyb)) {
  gene_idx <- sample(rownames(spcts[[1]]), 1)
  test <- getGeneDf(gene_idx, .mode = "parents", .experiment = c("CC", "HAP4", "LowN", "LowPi"))
  test_hyb <- getGeneDf(gene_idx, .mode = "hybrid", .experiment = c("CC", "HAP4", "LowN", "LowPi"))
}
getGeneDf(gene_idx, .mode = "parents") %>% visualizeGeneExpressionScatterplot(.plotname = paste(gene_idx, "parents", sep = " - "))

getGeneDf(gene_idx, .mode = "hybrid") %>% visualizeGeneExpressionScatterplot(.plotname = paste(gene_idx, "hybrid", sep = " - "))

# QC: picking threshold for calling sig diverged between cer and par
# Goal: We know there's no true biological threshold for when a gene is or is not diverged,
# but we want to draw a threshold beyond which we are fairly certain that the gene is truly diverged
# and use genes beyond that threshold as a SAMPLE of diverged genes to ask if there's anything unique about them
p_thresh <- 0.05/(nrow(spcts[[1]])*6) # correcting for multiple tests: 6 experiments + the full model
# eff_thresh <- 1 

# Volcano plot
library(ggExtra)
ggMarginal(ggplot(spaldf, aes(x = effect_size, y = -log(pvalue))) + 
  geom_point(aes(color = pvalue < p_thresh)) +
  theme(legend.position = "none") + geom_vline(xintercept = c(-1, 1), color = "gold"),
  margins = "both", groupColour = TRUE, groupFill = TRUE)

## Warning: Removed 10 rows containing non-finite outside the scale
## range (`stat_density()`).
## Removed 10 rows containing non-finite outside the scale
## range (`stat_density()`).