library(DESeq2) # DESeq2 depends on IRanges, GenomicRanges,
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: generics
##
## Attaching package: 'generics'
## The following objects are masked from 'package:igraph':
##
## components, union
## The following object is masked from 'package:dplyr':
##
## explain
## The following objects are masked from 'package:base':
##
## as.difftime, as.factor, as.ordered, intersect,
## is.element, setdiff, setequal, union
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:huxtable':
##
## width, width<-
## The following objects are masked from 'package:igraph':
##
## normalize, path
## The following object is masked from 'package:dplyr':
##
## combine
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame,
## basename, cbind, colnames, dirname, do.call,
## duplicated, eval, evalq, Filter, Find, get,
## grep, grepl, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int,
## pmin, pmin.int, Position, rank, rbind, Reduce,
## rownames, sapply, saveRDS, table, tapply,
## unique, unsplit, which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:Matrix':
##
## expand, unname
## The following objects are masked from 'package:data.table':
##
## first, second
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following object is masked from 'package:tidyr':
##
## expand
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:data.table':
##
## shift
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## The following object is masked from 'package:purrr':
##
## reduce
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet,
## colCollapse, colCounts, colCummaxs,
## colCummins, colCumprods, colCumsums, colDiffs,
## colIQRDiffs, colIQRs, colLogSumExps,
## colMadDiffs, colMads, colMaxs, colMeans2,
## colMedians, colMins, colOrderStats, colProds,
## colQuantiles, colRanges, colRanks, colSdDiffs,
## colSds, colSums2, colTabulates, colVarDiffs,
## colVars, colWeightedMads, colWeightedMeans,
## colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys,
## rowAvgsPerColSet, rowCollapse, rowCounts,
## rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs,
## rowLogSumExps, rowMadDiffs, rowMads, rowMaxs,
## rowMeans2, rowMedians, rowMins, rowOrderStats,
## rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates,
## rowVarDiffs, rowVars, rowWeightedMads,
## rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view
## with 'browseVignettes()'. To cite
## Bioconductor, see 'citation("Biobase")', and
## for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following object is masked from 'package:huxtable':
##
## contents
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
##
## Attaching package: 'DESeq2'
## The following object is masked _by_ '.GlobalEnv':
##
## collapseReplicates
# and S4Vectors, which include their own functions
# that share names with common tidyverse functions
# such as reduce and rename (which causes me problems)
# So loading it before tidyverse packages so that
# its functions get over-written by purrr::reduce and dplyr::rename
sapply(c("dplyr", "purrr", "tidyr", "ggpubr", "readr",
"data.table", "ggplot2", "data.table",
"matrixStats", "ggpubr"), require, character.only=TRUE)
## dplyr purrr tidyr ggpubr
## TRUE TRUE TRUE TRUE
## readr data.table ggplot2 data.table
## TRUE TRUE TRUE TRUE
## matrixStats ggpubr
## TRUE TRUE
library(MASS, include.only = "glm.nb")
source("functions_for_figure_scripts.R")
load("data_files/GLM_Counts.RData")
ExperimentNames <- c("CC", "HAP4", "LowN", "LowPi", "Heat", "Cold")
# fits one model to detect significant allele effect on one gene from one environment-specific dataset
# @input: single gene df (counts and all columns of sample info) and name of gene in one experiment
# @output: model generated by glm.nb (or NA if any error or warning is thrown)
fitSingleGeneModel <- function(.df, .gene_name) {
form <- formula(expr ~ time_point_num + allele)
output <- tryCatch({glm.nb(form, .df, link = log, init.theta = 7)
}, error = function(e) {
return(NA)
})
return(output)
}
# Tests for fitSingleGeneModel
# re-run this block to your heart's desire
random_gene <- sample(rownames(spcts[[1]]), 1) # YOR222W is a good example of a gene with one outlier measurement that shouldn't be called as diverged
random_dataset_idx <- sample(c(1:length(ExperimentNames)), 1)
random_cts <- spcts[[random_dataset_idx]]
random_info <- spinfo[[random_dataset_idx]]
test_genedf <- bind_cols(tibble(expr = random_cts[random_gene,],
gene_name = random_gene,
allele_or_species = "species"),
random_info)
spmod <- fitSingleGeneModel(test_genedf, random_gene)
summary(spmod)
##
## Call:
## glm.nb(formula = form, data = .df, init.theta = 9.78619102, link = log)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 8.922e+00 3.635e-02 245.450 < 2e-16
## time_point_num -4.014e-04 5.281e-05 -7.601 2.93e-14
## allelecer 4.300e-03 4.635e-02 0.093 0.926
##
## (Intercept) ***
## time_point_num ***
## allelecer
## ---
## Signif. codes:
## 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(9.7862) family taken to be 1)
##
## Null deviance: 249.25 on 190 degrees of freedom
## Residual deviance: 194.26 on 188 degrees of freedom
## AIC: 3458.8
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 9.786
## Std. Err.: 0.986
##
## 2 x log-likelihood: -3450.837
# note that the default residuals from glm.nb are working residuals, less intuitive than response residuals (actual - fitted value)
spmod$residuals[1]
## 1
## -0.2229863
resid(spmod, type = "working")[1]
## 1
## -0.2229863
resid(spmod, type = "response")[1]
## 1
## -1638.648
test_genedf$expr[1] - spmod$fitted.values[1]
## WT_TP2_cer_C1_G6_G12_GS1
## -1638.648
# given dataframe of all gene's counts, plots Scer expression vs Spar expression
# for average of replicates at each timepoint (or moving average for LowPi and HAP4)
plotSingleGeneModelExample <- function(.genedf, .gene_name, .fitted_vals = FALSE, .log2 = TRUE,
.size = 3) {
.mod <- fitSingleGeneModel(.genedf, random_gene)
sprow <- summary(.mod)$coefficients["allelecer",, drop = FALSE]
fold_change <- exp(sprow[1])
# plotting (averaging each timepoint down to a single expr value per species)
# if no replicates to average, taking a sliding window of 5
genedf_collapsed <- group_by(.genedf, time_point_num, allele) |>
summarise(expr = mean(expr)) |> ungroup()
if (unique(.genedf$experiment) %in% c("LowPi", "HAP4")) {
cat("no replicates for this experiment, getting moving average\n")
genedf_cer <- genedf_collapsed |> filter(allele == "cer") |>
arrange(time_point_num)
genedf_par <- genedf_collapsed |> filter(allele == "par") |>
arrange(time_point_num)
cts_cer <- matrix(as.numeric(genedf_cer$expr),
nrow = 1, ncol = nrow(genedf_cer))
colnames(cts_cer) <- as.character(genedf_cer$time_point_num)
cts_par <- matrix(as.numeric(genedf_par$expr),
nrow = 1, ncol = nrow(genedf_par))
colnames(cts_par) <- as.character(genedf_par$time_point_num)
movavg_cer <- getMovingAverage(cts_cer)
movavg_par <- getMovingAverage(cts_par)
movavg <- bind_rows(tibble(expr = as.numeric(movavg_cer),
allele = "cer",
time_point_num = as.numeric(colnames(movavg_cer))),
tibble(expr = as.numeric(movavg_par),
allele = "par",
time_point_num = as.numeric(colnames(movavg_par))))
genedf_collapsed <- movavg
}
plotdf <- .genedf %>% mutate(fittedvals = .mod$fitted.values) %>%
select(allele, time_point_num, fittedvals) %>%
unique() |>
left_join(y = genedf_collapsed, by = c("time_point_num", "allele")) |>
pivot_longer(cols = c(expr, fittedvals)) %>%
pivot_wider(id_cols = c(time_point_num, name),
names_from = allele, values_from = value)
if (!.log2) {
slope <- 1/fold_change # inverse b/c Scer is conventionally put on the x axis but also conventionally has positive lfc
norm_func <- identity
units_text <- "(cpm)"
}
if (.log2) { # technically the model is a subtly curved line now,
# dx/dy = log(x, base = y),
# but as the fitted values tend to be bunched in one place,
# we can summarise them as a line
slope <- map2(plotdf$cer[plotdf$name == "fittedvals"],
plotdf$par[plotdf$name == "fittedvals"], \(x, y) {
return(log(y, base = x))
}) |> unlist() |> mean()
norm_func <- log2
units_text <- "(log2(cpm))"
}
max_expr <- filter(plotdf, name != "fittedvals") |> select(cer, par) |>
unlist() |> as.numeric() |> max(na.rm = TRUE) |> norm_func()
min_expr <- filter(plotdf, name != "fittedvals") |> select(cer, par) |>
unlist() |> as.numeric() |> min(na.rm = TRUE) |> norm_func()
cat("min and max expression:", max_expr, min_expr, "\n")
if (.fitted_vals) {
p <- ggplot(plotdf, aes(x = norm_func(cer), y = norm_func(par))) +
geom_point(aes(color = name), size = .size) +
geom_abline(color = "gold", slope = slope, intercept = 0) +
geom_abline(color = "navy", slope = 1, intercept = 0) +
ggtitle(paste("slope = ", round(slope, 2),
"\nl2fc = ", round(log2(fold_change), 2),
"\npval = ", round(sprow[4] + 5*10^(-5), 4))) + # this is essentially ceiling(sprow[4], digits = 4) except ceiling doesn't allow decimals
xlim(c(min_expr, max_expr)) +
ylim(c(min_expr, max_expr)) +
xlab(paste("Scer expression", units_text)) +
ylab(paste("Spar expression", units_text)) +
theme_classic()
}
if (!.fitted_vals) {
# filtering down to 11 tps for scale_color_brewer (ever other one then last 8)
ntps <- length(unique(plotdf$time_point_num))
if (ntps > 11) {
good_tps <- unique(plotdf$time_point_num)[order(unique(plotdf$time_point_num))][c(1, 3, 5, (ntps-7):ntps)]
plotdf <- filter(plotdf, time_point_num %in% good_tps)
}
plotdf$time_point_num <- as.factor(plotdf$time_point_num)
p <- ggplot(filter(plotdf, name != "fittedvals"), aes(x = norm_func(cer), y = norm_func(par))) +
geom_point(aes(color = time_point_num), size = .size) +
scale_color_brewer(type = "div",
palette = "RdYlGn",
direction = 1) +
geom_abline(color = "gold", slope = slope, intercept = 0) +
geom_abline(color = "navy", slope = 1, intercept = 0) +
ggtitle(paste("slope = ", round(slope, 2),
"\nl2fc = ", round(log2(fold_change), 2),
"\npval = ", round(sprow[4] + 5*10^(-5), 4))) + # this is essentially ceiling(sprow[4], digits = 4) except ceiling doesn't allow decimals
xlim(c(min_expr, max_expr)) +
ylim(c(min_expr, max_expr)) +
xlab(paste("Scer expression", units_text)) +
ylab(paste("Spar expression", units_text)) +
theme_classic()
}
return(p)
}
plotSingleGeneModelExample(test_genedf, random_gene,
.log2 = TRUE, .fitted_vals = TRUE)
## `summarise()` has grouped output by 'time_point_num'.
## You can override using the `.groups` argument.
## min and max expression: 13.1152 11.88595

plotSingleGeneModelExample(test_genedf, random_gene,
.log2 = FALSE, .fitted_vals = FALSE)
## `summarise()` has grouped output by 'time_point_num'.
## You can override using the `.groups` argument.
## min and max expression: 8872.971 3784.677

# plot of example gene diverging in level
gene_idx <- "YBR067C" # decided in Saturated Growth script
test_genedf <- bind_cols(tibble(expr = spcts$HAP4[gene_idx,],
gene_name = gene_idx,
allele_or_species = "species"),
spinfo$HAP4)
p <- plotSingleGeneModelExample(test_genedf, gene_idx, .log2 = FALSE) +
theme(legend.position = "none")
## `summarise()` has grouped output by 'time_point_num'.
## You can override using the `.groups` argument.
## no replicates for this experiment, getting moving average
## min and max expression: 2213 38.6
p
## Warning: Removed 1 row containing missing values or values
## outside the scale range (`geom_point()`).

pdf("paper_figures/ExperimentOverview/level_lfc.pdf",
width = 2.1, height = 2.5)
p
## Warning: Removed 1 row containing missing values or values
## outside the scale range (`geom_point()`).
dev.off()
## quartz_off_screen
## 2
# plot of example gene diverging in dynamics, showing high noise but going in different directions at different timepoints
gene_idx <- "YJR001W" # decided in Saturated Growth script
test_genedf <- bind_cols(tibble(expr = spcts$HAP4[gene_idx,],
gene_name = gene_idx,
allele_or_species = "species"),
spinfo$HAP4)
p <- plotSingleGeneModelExample(test_genedf, gene_idx, .log2 = FALSE) +
theme(legend.position = "none")
## `summarise()` has grouped output by 'time_point_num'.
## You can override using the `.groups` argument.
## no replicates for this experiment, getting moving average
## min and max expression: 163.6 45.33333
p
## Warning: Removed 1 row containing missing values or values
## outside the scale range (`geom_point()`).

pdf("paper_figures/ExperimentOverview/dynamics_lfc.pdf",
width = 2.1, height = 2.5)
p
## Warning: Removed 1 row containing missing values or values
## outside the scale range (`geom_point()`).
dev.off()
## quartz_off_screen
## 2
fitDatasetModels <- function(.cts, .info) {
output <- map(rownames(.cts), function(g) {
#cat(which(rownames(.cts) == g), "/", nrow(.cts), "\n")
#cat("currently processing", g, "\n")
gdf <- bind_cols(tibble(expr = .cts[g,]),
.info)
mod <- fitSingleGeneModel(gdf, g)
return(mod)
})
return(output)
}
# test for fitDatasetModels (just taking the first 10 genes from a random dataset)
random_dataset_idx <- sample(c(1:length(ExperimentNames)), 1)
test <- fitDatasetModels(spcts[[random_dataset_idx]][1:10,], spinfo[[random_dataset_idx]]) # if it runs, it passes this test
# convert one model into one row of a dataframe summarizing that model's output (gene name, name of coefficient, effect size of coefficient, pvalue for wald test of that coefficient)
# also changes lfc to l2fc using change of base: log2(fc) = log(fc)/log(2)
parseDfRowFromModel <- function(.mod, .gene_name) {
#cat("currently processing", .gene_name, "\n")
allelerow <- summary(.mod)$coefficients["allelecer",, drop = FALSE]
result <- tibble(gene_name = .gene_name,
coefficient = gsub("allele", "", rownames(allelerow)), # should always have the non-reference level (cer if par is set to reference), but this just helps us be more sure
effect_size = as.numeric(allelerow[,"Estimate"])/log(2),
pvalue = as.numeric(allelerow[,"Pr(>|z|)"]))
return(result)
}
# test for parseDfRowFromModel
random_dataset_idx <- sample(c(1:length(ExperimentNames)), 1)
random_modslist <- fitDatasetModels(spcts[[random_dataset_idx]][1:10,], spinfo[[random_dataset_idx]]) # same test from fitDatasetModels, we need the output for this test
gene_idx <- sample(c(1:10), 1)
parseDfRowFromModel(random_modslist[[gene_idx]], gene_idx)
| gene_name | coefficient | effect_size | pvalue |
|---|---|---|---|
| 9 | cer | 0.37 | 0.179 |
parseDfFromModslist <- function(.modslist) {
good_idxs <- which(!is.na(.modslist))
result <- map2(.modslist[good_idxs], names(.modslist)[good_idxs], function(mod, gname) {
return(parseDfRowFromModel(mod, gname))
}) |> purrr::reduce(.f = bind_rows)
return(result)
}
# test for parseDfFromModslist
random_dataset_idx <- sample(c(1:length(ExperimentNames)), 1)
random_modslist <- fitDatasetModels(spcts[[random_dataset_idx]][1:10,], spinfo[[random_dataset_idx]]) # same test from fitDatasetModels, we need the output for this test
names(random_modslist) <- c(1:10)
test <- parseDfFromModslist(random_modslist)
test
| gene_name | coefficient | effect_size | pvalue |
|---|---|---|---|
| 1 | cer | 0.13 | 0.409 |
| 2 | cer | -0.394 | 0.0516 |
| 3 | cer | 0.0573 | 0.777 |
| 4 | cer | 0.324 | 0.0722 |
| 5 | cer | 0.975 | 0.000115 |
| 6 | cer | 0.401 | 0.00904 |
| 7 | cer | 0.543 | 1.47e-07 |
| 8 | cer | 0.123 | 0.65 |
| 9 | cer | 1.22 | 3.13e-11 |
| 10 | cer | 0.796 | 1.58e-15 |
# applying to each experiment (takes ~1-2 min per dataset)
spmods <- map2(spcts, spinfo, fitDatasetModels)
spmods <- map2(spmods, spcts, function(x, y) {
names(x) <- rownames(y)
return(x)
})
# applying parseDfFromModslist to spmods
spdfs <- lapply(spmods, parseDfFromModslist)
# adding experiment name column so the 4 datasets can be combined
spdfs <- map2(spdfs, names(spdfs), function(x, y) {
x$experiment <- y
return(x)
})
spdf <- purrr::reduce(.f = bind_rows, spdfs)
# renaming coefficient so that the species models can be added to allele models in aldf below
spdf$coefficient <- "species"
# applying to each experiment (takes ~1-2 min per dataset)
almods <- map2(alcts, alinfo, fitDatasetModels)
almods <- map2(almods, alcts, function(x, y) {
names(x) <- rownames(y)
return(x)
})
# applying parseDfFromModslist and parseAvgCompsFromModslist to spmods
aldfs <- lapply(almods, parseDfFromModslist)
# cleaning some column values so aldfs and alavgs can be combined into one df
# adding experiment name column
aldfs <- map2(aldfs, names(aldfs), function(x, y) {
x$experiment <- y
return(x)
})
aldf <- purrr::reduce(.f = bind_rows, aldfs)
# renaming coefficient
aldf$coefficient <- "allele"
spaldf <- full_join(aldf, spdf, by = c("gene_name", "experiment", "coefficient", "effect_size", "pvalue"))
# slightly different nGenes in each experiment b/c different genes were filtered out by low expression in the data script
spaldf %>% filter(coefficient == "species") %>% select(experiment) %>% table()
## experiment
## CC Cold HAP4 Heat LowN LowPi
## 3960 4052 4290 4172 4412 4386
spaldf %>% filter(coefficient == "allele") %>% select(experiment) %>% table()
## experiment
## CC Cold HAP4 Heat LowN LowPi
## 3960 4052 4289 4172 4412 4387
# creating genedf, where each row is one gene
# (this isn't used much in the final code version, but it is used in a few places, so keeping it for continuity)
genedf <- spaldf |>
pivot_wider(names_from = c("experiment", "coefficient"), values_from = c("effect_size", "pvalue"), id_cols = c("gene_name"))
table(genedf$gene_name) |> table() # checking that every gene is only represented once in genedf
##
## 1
## 4863
There are still some QC/exploration plots below, but none of it affects the output of this script, so we are exporting models here:
save(spcts, spinfo, alcts, alinfo, spaldf, genedf, file = "data_files/single_gene_models.RData")
load(file = "data_files/single_gene_models.RData")
# Valuable functions for examining individual genes
getExprVector <- function(.gene_name, .organism = "cer",
.experiment = c("LowN", "CC", "HAP4", "LowPi")) {
if (.organism %in% c("cer", "par")) {
common_idxs <- purrr::reduce(map(spcts[.experiment], rownames), .f = intersect)
if(!.gene_name %in% common_idxs) {
return(NULL)
}
info <- spinfo[.experiment] |> purrr::reduce(.f = bind_rows) # allows for multiple experiments
cts <- map(spcts[.experiment], \(x) x[common_idxs,]) |> purrr::reduce(.f = cbind)
expr <- cts[.gene_name, info$organism == .organism]
}
if (.organism == "hyb") {
common_idxs <- purrr::reduce(map(alcts[.experiment], rownames), .f = intersect)
if(!.gene_name %in% common_idxs) {
return(NULL)
}
info <- alinfo[.experiment] |> purrr::reduce(.f = bind_rows)
cts <- map(alcts[.experiment], \(x) x[common_idxs,]) |> purrr::reduce(.f = cbind)
expr_hyc <- cts[.gene_name, info$organism == .organism & info$allele == "cer"]
expr_hyp <- cts[.gene_name, info$organism == .organism & info$allele == "par"]
expr <- expr_hyc + expr_hyp
}
if (.organism == "hyc") {
common_idxs <- purrr::reduce(map(alcts[.experiment], rownames), .f = intersect)
if(!.gene_name %in% common_idxs) {
return(NULL)
}
info <- alinfo[.experiment] |> purrr::reduce(.f = bind_rows)
cts <- map(alcts[.experiment], \(x) x[common_idxs,]) |> purrr::reduce(.f = cbind)
expr <- cts[.gene_name, info$organism == "hyb" & info$allele == "cer"]
}
if (.organism == "hyp") {
common_idxs <- purrr::reduce(map(alcts[.experiment], rownames), .f = intersect)
if(!.gene_name %in% common_idxs) {
return(NULL)
}
info <- alinfo[.experiment] |> purrr::reduce(.f = bind_rows)
cts <- map(alcts[.experiment], \(x) x[common_idxs,]) |> purrr::reduce(.f = cbind)
expr <- cts[.gene_name, info$organism == "hyb" & info$allele == "par"]
}
return(expr)
}
# tests for getExprVector (use .experiment = ExperimentNames to get counts from all 4 experiments)
# this commented-out line should hit an error if run (TDH3 isn't in Heat/Cold):
# cer_expr <- getExprVector(.gene_name = "YGR192C", .organism = "cer", .experiment = ExperimentNames)
# should work:
cer_expr <- getExprVector(.gene_name = "YGR192C", .organism = "cer", .experiment = c("CC", "HAP4", "LowN", "LowPi"))
par_expr <- getExprVector(.gene_name = "YGR192C", .organism = "par", .experiment = c("CC", "HAP4", "LowN", "LowPi"))
hyc_expr <- getExprVector(.gene_name = "YGR192C", .organism = "hyc", .experiment = c("CC", "HAP4", "LowN", "LowPi"))
hyp_expr <- getExprVector(.gene_name = "YGR192C", .organism = "hyp", .experiment = c("CC", "HAP4", "LowN", "LowPi"))
tibble(organism = c("cer", "par", "hyc", "hyp"),
variance = map_dbl(list(cer_expr, par_expr, hyc_expr, hyp_expr), var, na.rm = TRUE),
mean = map_dbl(list(cer_expr, par_expr, hyc_expr, hyp_expr), mean, na.rm = TRUE),
median = map_dbl(list(cer_expr, par_expr, hyc_expr, hyp_expr), median, na.rm = TRUE))
| organism | variance | mean | median |
|---|---|---|---|
| cer | 5.94e+07 | 2.13e+04 | 2.03e+04 |
| par | 4.03e+06 | 4.12e+03 | 4.01e+03 |
| hyc | 5.77e+07 | 1.86e+04 | 1.78e+04 |
| hyp | 2.65e+06 | 4.71e+03 | 4.66e+03 |
getGeneDf <- function(.gene_name, .mode = c("parents", "hybrid"),
.experiment = c("LowN", "CC", "HAP4", "LowPi")) {
if (.mode == "parents") {
info_cer <- spinfo[.experiment] |> purrr::reduce(.f = bind_rows) |> filter(allele == "cer")
info_par <- spinfo[.experiment] |> purrr::reduce(.f = bind_rows) |> filter(allele == "par")
cts_cer <- getExprVector(.gene_name, .experiment = .experiment, .organism = "cer")
cts_par <- getExprVector(.gene_name, .experiment = .experiment, .organism = "par")
}
if (.mode == "hybrid") {
info_cer <- alinfo[.experiment] |> purrr::reduce(.f = bind_rows) |> filter(allele == "cer")
info_par <- alinfo[.experiment] |> purrr::reduce(.f = bind_rows) |> filter(allele == "par")
cts_cer <- getExprVector(.gene_name, .experiment = .experiment, .organism = "hyc")
cts_par <- getExprVector(.gene_name, .experiment = .experiment, .organism = "hyp")
}
if (is.null(cts_cer) | is.null(cts_par)) {
return(NULL)
}
genedf <- bind_rows(bind_cols(tibble(expr = cts_cer),
select(info_cer, experiment, time_point_num, allele)),
bind_cols(tibble(expr = cts_par),
select(info_par, experiment, time_point_num, allele))) |>
summarise(expr = mean(expr), .by = c("experiment", "time_point_num", "allele")) |>
pivot_wider(id_cols = c("experiment", "time_point_num"),
names_from = allele, values_from = expr)
return(drop_na(genedf))
}
# tests for getGeneDf
test <- NULL
while (is.null(test)) {
gene_idx <- sample(rownames(spcts[[1]]), 1)
test <- getGeneDf(gene_idx, .mode = "parents", .experiment = c("CC", "HAP4", "LowN", "LowPi"))
}
ggplot(test, aes(x = cer, y = par)) + geom_point(aes(color = experiment)) + ggtitle(gene_idx) + geom_abline(color = "gold")

test <- getGeneDf(gene_idx, .mode = "hybrid", .experiment = c("CC", "HAP4", "LowN", "LowPi")) # hybrid should be less variable because cer and par measurements were collected in the same cell
ggplot(test, aes(x = cer, y = par)) + geom_point(aes(color = experiment)) + ggtitle(gene_idx) + geom_abline(color = "gold")

spaldf %>% filter(gene_name == gene_idx)
| gene_name | coefficient | effect_size | pvalue | experiment |
|---|---|---|---|---|
| YGL161C | allele | 0.216 | 0.0115 | LowN |
| YGL161C | allele | 0.481 | 0.0351 | CC |
| YGL161C | allele | 0.403 | 0.117 | HAP4 |
| YGL161C | allele | 0.692 | 1.67e-08 | LowPi |
| YGL161C | allele | 0.378 | 0.151 | Heat |
| YGL161C | allele | 0.451 | 0.00669 | Cold |
| YGL161C | species | 0.598 | 1.72e-12 | LowN |
| YGL161C | species | 0.457 | 0.000143 | CC |
| YGL161C | species | 0.273 | 0.325 | HAP4 |
| YGL161C | species | 0.923 | 2.16e-06 | LowPi |
| YGL161C | species | 0.615 | 7.56e-05 | Heat |
| YGL161C | species | 0.846 | 6.58e-08 | Cold |
# Visualizing expression variability and divergence in...
# A) boxplots (divergence readily apparent, variability hardly apparent)
# B) scatterplots (variability more apparent, divergence also apparent, so might just stick to scatterplots alone)
visualizeGeneExpressionBoxplot <- function(.name, .experiment = c("CC", "HAP4", "LowN", "LowPi")) {
aldf <- getGeneDf(.name, "hybrid", .experiment = .experiment)
spdf <- getGeneDf(.name, "parents", .experiment = .experiment)
aldf$allele_or_species <- "allele"
spdf$allele_or_species <- "species"
plotdf <- bind_rows(aldf, spdf) %>% pivot_longer(cols = c("cer", "par"), names_to = "allele", values_to = "expr")
output <- ggplot(plotdf, aes(x = paste(allele_or_species, allele), y = log(expr))) +
geom_boxplot(aes(fill = paste(allele_or_species, allele))) +
ggtitle(.name) + xlab(element_blank()) + ylab("log(expression level)") +
scale_x_discrete(limits = c("species cer", "species par", "allele cer", "allele par")) + # this switches the order
theme_classic() + scale_fill_manual(breaks = c("species cer", "species par", "allele cer", "allele par"), # this sets the colors to match (yes you need breaks even though you've just specified the order above)
values = c("gold", "mediumorchid", "lemonchiffon", "plum")) +
theme(legend.position = "none")
return(output)
}
visualizeGeneExpressionBoxplot("YGR192C")

visualizeGeneExpressionScatterplot <- function(.gdf, .plotname = "individual gene counts", .color_from = "experiment", .log = FALSE) {
if (!.log) {
max_expr <- max(c(.gdf$cer, .gdf$par), na.rm = TRUE)
output <- ggplot(.gdf, aes(x = cer, y = par)) +
geom_point(aes(color = pull(.gdf[, as.character(.color_from)]))) + geom_abline(color = "gold") +
ggtitle(.plotname) + xlab("expression level in cerevisiae") + ylab("expression level in paradoxus") +
theme_classic() + xlim(c(0, max_expr)) + ylim(c(0, max_expr)) + theme(legend.title = element_blank())
return(output)
}
if (.log) {
max_expr <- max(c(log(.gdf$cer), log(.gdf$par)), na.rm = TRUE)
output <- ggplot(.gdf, aes(x = log(cer), y = log(par))) +
geom_point(aes(color = pull(.gdf[, as.character(.color_from)]))) + geom_abline(color = "gold") +
ggtitle(.plotname) + xlab("expression level in cerevisiae") + ylab("expression level in paradoxus") +
theme_classic() + xlim(c(0, max_expr)) + ylim(c(0, max_expr)) + theme(legend.title = element_blank())
return(output)
}
}
# TDH3
getGeneDf("YGR192C", .mode = "parents") %>% visualizeGeneExpressionScatterplot(.plotname = "YGR192C - parents")

getGeneDf("YGR192C", .mode = "hybrid") %>% visualizeGeneExpressionScatterplot(.plotname = "YGR192C - hybrid")

getGeneDf("YGR192C", .mode = "parents") %>% visualizeGeneExpressionScatterplot(.plotname = "YGR192C - parents - log scale", .log = TRUE)

# random gene
test <- NULL
test_hyb <- NULL
while (is.null(test) | is.null(test_hyb)) {
gene_idx <- sample(rownames(spcts[[1]]), 1)
test <- getGeneDf(gene_idx, .mode = "parents", .experiment = c("CC", "HAP4", "LowN", "LowPi"))
test_hyb <- getGeneDf(gene_idx, .mode = "hybrid", .experiment = c("CC", "HAP4", "LowN", "LowPi"))
}
getGeneDf(gene_idx, .mode = "parents") %>% visualizeGeneExpressionScatterplot(.plotname = paste(gene_idx, "parents", sep = " - "))

getGeneDf(gene_idx, .mode = "hybrid") %>% visualizeGeneExpressionScatterplot(.plotname = paste(gene_idx, "hybrid", sep = " - "))
# QC: picking threshold for calling sig diverged between cer and par
# Goal: We know there's no true biological threshold for when a gene is or is not diverged,
# but we want to draw a threshold beyond which we are fairly certain that the gene is truly diverged
# and use genes beyond that threshold as a SAMPLE of diverged genes to ask if there's anything unique about them
p_thresh <- 0.05/(nrow(spcts[[1]])*6) # correcting for multiple tests: 6 experiments + the full model
# eff_thresh <- 1
# Volcano plot
library(ggExtra)
ggMarginal(ggplot(spaldf, aes(x = effect_size, y = -log(pvalue))) +
geom_point(aes(color = pvalue < p_thresh)) +
theme(legend.position = "none") + geom_vline(xintercept = c(-1, 1), color = "gold"),
margins = "both", groupColour = TRUE, groupFill = TRUE)
## Warning: Removed 10 rows containing non-finite outside the scale
## range (`stat_density()`).
## Removed 10 rows containing non-finite outside the scale
## range (`stat_density()`).
