Data cleaning for figure scripts

Final data cleaning steps that import level and plasticity divergence information from DESeq2 and Heiarchical clustering. The end result of this script is a dataframe, finaldf, that is used in all the figure scripts (also as Supplementary Table 1)

sapply(c("dplyr", "purrr", "tidyr", "ggpubr", "readr",
         "data.table", "ggplot2", "data.table", 
         "matrixStats"), require, character.only=TRUE)

##       dplyr       purrr       tidyr      ggpubr 
##        TRUE        TRUE        TRUE        TRUE 
##       readr  data.table     ggplot2  data.table 
##        TRUE        TRUE        TRUE        TRUE 
## matrixStats 
##        TRUE

source("functions_for_figure_scripts.R")

Defining our color schemes and category names to be used throughout the paper figures

# Environment Names, so I only have to change them in one place
ExperimentNames <- c("HAP4", "CC", "LowN", "LowPi", "Heat", "Cold")
LongExperimentNames <- c("Diauxic Shift", "Hydroxyurea Shock", "Low Nitrogen", "Low Phosphate", "Heat Stress", "Cold Stress")

# colors used throughout paper
group4_colordf <- tibble(type = c("grey80", "gold", "grey80", "gold", "grey20"),
                         labels = c("conserved level \nand plasticity", 
                                    "conserved level, \ndiverged plasticity", 
                                    "diverged level, \nconserved plasticity", 
                                    "diverged level \nand plasticity",
                                    "lowly expressed"),
                         limits = c("conserved level and plasticity", 
                                    "conserved level, diverged plasticity", 
                                    "diverged level, conserved plasticity", 
                                    "diverged level and plasticity",
                                    "lowly expressed"),
                         pattern = c("none", "none",
                                     "stripe", "stripe",
                                     "none"),
                         scheme = "group4")
experiment_colordf <- tibble(type = c("grey", "yellow", "green", "purple", "red", "blue"),
                                               labels = LongExperimentNames,
                                               limits = ExperimentNames,
                                               scheme = "experiment")
plasticity_colordf <- tibble(type = c("grey80", "grey60", "orange1", "blue2", "purple"),
                           labels = c("conserved plastic", "conserved static", "Scer-unique plasticity", "Spar-unique plasticity", "plasticity reversal"),
                           limits = c("conserved plastic", "conserved static", "Scer-unique", "Spar-unique", "reversal"),
                           scheme = "plasticity")
colordf <- bind_rows(group4_colordf, experiment_colordf, plasticity_colordf)

Creating final dataframe

Loading heiarchical clustering and DESeq2 model outputs:

load("data_files/CorrelationClustering3Disp.RData")
load("data_files/single_gene_models.RData")
load("data_files/Cleaned_Count_Data.RData")
load("data_files/Cleaned_Counts.RData")
load("data_files/Cleaned_Counts_Allele.RData")
p_thresh <- 0.05
eff_thresh <- log2(1.5) # gene needs to be 1.5x higher in one species than the other to be considered DE
eff_thresh0 <- 0
eff_thresh2 <- log2(2)

Benjamini-Hochberg FDR correction:

# Benjamini-Hochberg FDR correction of glm p-values
spaldf$padj <- p.adjust(spaldf$pvalue, method = "BH")
sum(spaldf$pvalue < p_thresh)

## [1] 25607

sum(spaldf$padj < p_thresh)

## [1] 22628

Creating mutually exclusive sets of genes based on level and plasticity divergence in parents each gene is a combination of its level category and its plasticity category ## level categories: 1) conserved: parent padj >= p_thresh 2) diverged: parent padj < p_thresh, not taking into account hybrid yet ## plasticity categories (just looking at parents): 1) conserved: 11, 22, 00 2) diverged: 12, 21, 01, 10, 02, 20

finaldf <- pivot_wider(spaldf, id_cols = c("gene_name", "experiment"), 
                       values_from = c("effect_size", "padj"),
                       names_from = "coefficient") |> 
  drop_na() |> # 1 gene missing from species, YMR107W
  inner_join(y =  clusterdf, 
             by = join_by("gene_name"=="gene_ID",
                          "experiment"))

### Removing hybrid par allele-deleted genes from CC experiment (where the deletion was present)
omit_list <- c("YLR078C", "YLR077W", "YLR074C", "YLR072W", "YLR075W", "YLR073C", # large hybrid CC paradoxus haplotype deletion
               "YNL247W", "YNL244C")
finaldf |> filter(experiment == "CC" & gene_name %in% omit_list)

## # A tibble: 8 × 8
##   gene_name experiment effect_size_allele
##   <chr>     <chr>                   <dbl>
## 1 YLR072W   CC                       8.76
## 2 YLR073C   CC                       9.16
## 3 YLR074C   CC                       9.85
## 4 YLR075W   CC                       8.26
## 5 YLR077W   CC                      42.4 
## 6 YLR078C   CC                      42.9 
## 7 YNL247W   CC                      12.0 
## 8 YNL244C   CC                       8.70
## # ℹ 5 more variables: effect_size_species <dbl>,
## #   padj_allele <dbl>, padj_species <dbl>, cer <dbl>,
## #   par <dbl>

dim(finaldf)

## [1] 25264     8

finaldf <- finaldf |> filter(!(experiment == "CC" & gene_name %in% omit_list))
dim(finaldf)

## [1] 25256     8

Adding plasticity column. If cluster in Scer is the same as cluster in Spar, plasticity is conserved. If clusters are different, plasticity is diverged.

finaldf$plasticity <- map2(finaldf$cer, finaldf$par, \(x, y) {
  if (x == y) {
    return("conserved")
  }
  if (x != y) {
    return("diverged")
  }
}) |> unlist()

Adding level column. If the DEseq2-estimated log2 fold change when you change from an Spar count to an Scer count (Spar is reference) is greater (in magnitude) than the log2 fold change threshold (eff_thresh) and the BH-adjusted p-value is less than 0.05, the gene is considered diverged in level. Using three different log2 fold change thresholds to compare the effect in a Supplemental figure.

finaldf$level <- map(c(1:nrow(finaldf)), \(i) {
  x <- finaldf$padj_species[i] |> as.numeric()
  y <- finaldf$effect_size_species[i] |> as.numeric()
  output <- if_else(x < p_thresh & abs(y) > eff_thresh,
                    true = "diverged",
                    false = "conserved")
  return(output)
}) |> unlist()
finaldf$level0 <- map(c(1:nrow(finaldf)), \(i) {
  x <- finaldf$padj_species[i] |> as.numeric()
  y <- finaldf$effect_size_species[i] |> as.numeric()
  output <- if_else(x < p_thresh & abs(y) > eff_thresh0,
                    true = "diverged",
                    false = "conserved")
  return(output)
}) |> unlist()
finaldf$level2 <- map(c(1:nrow(finaldf)), \(i) {
  x <- finaldf$padj_species[i] |> as.numeric()
  y <- finaldf$effect_size_species[i] |> as.numeric()
  output <- if_else(x < p_thresh & abs(y) > eff_thresh2,
                    true = "diverged",
                    false = "conserved")
  return(output)
}) |> unlist()

Tables of gene counts to report in main paper:

# gene counts to report
table(finaldf$plasticity, finaldf$level, useNA = "always") # checking that all observations are in the 2x2 box

##            
##             conserved diverged  <NA>
##   conserved     10020     5436     0
##   diverged       6244     3556     0
##   <NA>              0        0     0

sum(table(finaldf$level, finaldf$plasticity))

## [1] 25256

nrow(finaldf) # mutually exclusive group categories

## [1] 25256

finaldf |> filter(plasticity == "diverged") |> select(experiment) |> table()

## experiment
##    CC  Cold  HAP4  Heat  LowN LowPi 
##  1766  1786  1455  1743  1305  1745

finaldf |> filter(level == "diverged") |> select(experiment) |> table()

## experiment
##    CC  Cold  HAP4  Heat  LowN LowPi 
##  1532  1563  1238  1415  1657  1587

Tables of number of timepoints per experiment for paper figure:

sample_info |> select(experiment, time_point_str) |> 
  unique() |> 
  group_by(experiment) |> 
  summarise(n = n())

## # A tibble: 6 × 2
##   experiment     n
##   <chr>      <int>
## 1 CC             8
## 2 Cold           4
## 3 HAP4          15
## 4 Heat           4
## 5 LowN           3
## 6 LowPi         28

Calculating ortholog pair correlations for EnvironmentalPatterns figure

# between hybrid alleles and for parent-hybrid allele of the same species (Hyc/Scer and Hyp/Spar)
getCorelation <- function(.gene_name, .experiment, 
                          .cts1, .cts2, .info1, .info2) {
  common_cols <- intersect(colnames(.cts1[,.info1$experiment == .experiment]), 
                           colnames(.cts2[,.info2$experiment == .experiment]))
  output <- cor(as.numeric(.cts1[.gene_name, common_cols]),
                as.numeric(.cts2[.gene_name, common_cols]))
  return(as.numeric(output))
}
# tests for getCorelation
getCorelation("YGR192C", "HAP4",
              .cts1 = collapsed$cer,
              .cts2 = collapsed_allele$cer,
              .info1 = info,
              .info2 = info_allele)

## [1] 0.9640884

Adding correlation columns:

# adding correlation columns
finaldf$cor_hybrid <- map2(finaldf$gene_name, finaldf$experiment, getCorelation,
                           .cts1 = collapsed_allele$cer,
                           .cts2 = collapsed_allele$par,
                           .info1 = info_allele,
                           .info2 = info_allele) |> unlist()
finaldf$cor_parents <- map2(finaldf$gene_name, finaldf$experiment, getCorelation,
                            .cts1 = collapsed$cer,
                            .cts2 = collapsed$par,
                            .info1 = info,
                            .info2 = info) |> unlist()
finaldf$cor_scer <- map2(finaldf$gene_name, finaldf$experiment, getCorelation,
                         .cts1 = collapsed_allele$cer,
                         .cts2 = collapsed$cer,
                         .info1 = info_allele,
                         .info2 = info) |> unlist()
finaldf$cor_spar <- map2(finaldf$gene_name, finaldf$experiment, getCorelation,
                         .cts1 = collapsed_allele$par,
                         .cts2 = collapsed$par,
                         .info1 = info_allele,
                         .info2 = info) |> unlist()

Calculating the mean and var of each gene’s expression across both species, which is mainly relevant in a couple Supplemental/QC plots

mean:

getMean <- function(.gene_name, .experiment, 
                   .cts, .info) {
  output <- rowMeans(.cts[.gene_name, .info$experiment == .experiment, drop = FALSE])
  return(as.numeric(output))
}
# tests for getMean
getMean("YGR192C", "HAP4",
        .cts = counts,
        .info = sample_info)

## [1] 9339.933

var:

getVar <- function(.gene_name, .experiment, 
                    .cts, .info) {
  output <- rowVars(.cts[.gene_name, .info$experiment == .experiment, drop = FALSE])
  return(as.numeric(output))
}
# tests for getVar
getVar("YGR192C", "HAP4",
        .cts = counts,
        .info = sample_info)

## [1] 67159939

calculating for each gene (including both parent’s counts):

# adding mean&var columns
finaldf$mean_parents <- map2(finaldf$gene_name, finaldf$experiment, getMean,
                             .cts = counts,
                             .info = sample_info) |> unlist()
finaldf$var_parents <- map2(finaldf$gene_name, finaldf$experiment, getVar,
                             .cts = counts,
                             .info = sample_info) |> unlist()

Remember those biased genes we identified in the very first importing count data script? These were genes that had a high proportion of counts mapping to the wrong species allele. They become relevant here. We re-label any gene that has been flagged as diverged in level as “biased” instead of “diverged”

finaldf <- finaldf |> mutate(bias = if_else(gene_name %in% cer_biased_genes,
                                           true = "cer",
                                           false = if_else(gene_name %in% par_biased_genes,
                                                           true = "par",
                                                           false = if_else(gene_name %in% both_biased_genes,
                                                                           true = "both",
                                                                           false = "none")))) 
plotdf <- finaldf |> 
  filter(level == "diverged")
# effect of bias on level divergence
ggplot(plotdf, aes(x = effect_size_species)) +
  geom_density(aes(fill = bias), alpha = 0.5)

Re-labeling these biased genes:

finaldf$level <- if_else(finaldf$bias == "none",
                         true = finaldf$level,
                         false = "biased")

# effect of bias on plasticity divergence
p_none <- plotdf |> filter(bias == "none") |> 
  select(cer, par) |> table()
p_cer <- plotdf |> filter(bias == "cer") |> 
  select(cer, par) |> table()
p_par <- plotdf |> filter(bias == "par") |> 
  select(cer, par) |> table()

round(p_none/sum(p_none), digits = 2)

##    par
## cer    0    1    2
##   0 0.10 0.08 0.02
##   1 0.10 0.38 0.05
##   2 0.05 0.09 0.13

round(p_cer/sum(p_cer), digits = 2)

##    par
## cer    0    1    2
##   0 0.05 0.01 0.03
##   1 0.15 0.59 0.05
##   2 0.06 0.03 0.03

round(p_par/sum(p_par), digits = 2)

##    par
## cer    0    1    2
##   0 0.03 0.13 0.09
##   1 0.04 0.52 0.08
##   2 0.02 0.03 0.07

# proportions are the same between the three groups

Numbers for the workflow figure:

### numbers for workflow figure
nrow(finaldf) # for workflow figure E

## [1] 25256

length(unique(finaldf$gene_name)) # total number of genes

## [1] 4863

n_per_experiment <- table(finaldf$experiment)
n_per_experiment # number of genes used in each environment (number expressed > 30cpm)

## 
##    CC  Cold  HAP4  Heat  LowN LowPi 
##  3952  4051  4286  4172  4412  4383

# min filtered out for low expression:
max(n_per_experiment)

## [1] 4412

(nrow(genedf) - max(n_per_experiment))/nrow(genedf)

## [1] 0.09274111

# max filtered out for low expression:
min(n_per_experiment)

## [1] 3952

(nrow(genedf) - min(n_per_experiment))/nrow(genedf)

## [1] 0.1873329

# percent low variance per experiment:
finaldf |> group_by(experiment) |> 
  summarise(n_cer_lowvar = sum(cer == 0),
            n_par_lowvar = sum(par == 0),
            n_genes = n()*2) |> 
  mutate(pct_low_var = (n_cer_lowvar + n_par_lowvar)/n_genes)

## # A tibble: 6 × 5
##   experiment n_cer_lowvar n_par_lowvar n_genes
##   <chr>             <int>        <int>   <dbl>
## 1 CC                  315          617    7904
## 2 Cold               1585         2148    8102
## 3 HAP4                971          850    8572
## 4 Heat                833         1207    8344
## 5 LowN               1300         1196    8824
## 6 LowPi               372          253    8766
## # ℹ 1 more variable: pct_low_var <dbl>

How the choice of effect size threshold influence level divergence

This is slightly sacrilegious to put a figure plot in this data-cleaning script. But it is easier than having a separate script just for this figure. Here we compare how the three different effect size thresholds affect the number of genes considered to be diverging in expression level.

# for testing how eff_threshold affects level divergence
finaldf <- finaldf |> pivot_longer(cols = c("level0", "level", "level2"),
                                   names_to = "eff_threshold",
                                   values_to = "level")

### Barplot of 4 divergence categories
finaldf$group4 <- map2(finaldf$level, finaldf$plasticity, \(l, d) {
  if (l != "diverged" & d == "conserved") {
    return("conserved level and plasticity")
  }
  if (l != "diverged" & d == "diverged") {
    return("conserved level, diverged plasticity")
  }
  if (l == "diverged" & d == "conserved") {
    return("diverged level, conserved plasticity")
  }
  if (l == "diverged" & d == "diverged") {
    return("diverged level and plasticity")
  }
}) |> unlist()

plotdf <- finaldf
plotdf$eff_threshold <- factor(plotdf$eff_threshold, levels = c("level0", "level", "level2"),
                               labels = c("no threshold", "1.5x", "2x"))
p <- ggplot(mutate(plotdf, experiment = factor(experiment,
                                          levels = ExperimentNames)),
       aes(x = group4)) +
  geom_bar_pattern(aes(fill = group4, pattern = group4), pattern_fill = "black") +
  geom_text(stat='count', aes(label = after_stat(count)), vjust=-1) +
  scale_fill_discrete(limits =  colordf[colordf$scheme == "group4",]$limits,
                      type = colordf[colordf$scheme == "group4",]$type) +
  scale_pattern_discrete(limits =  colordf[colordf$scheme == "group4",]$limits,
                         choices = colordf[colordf$scheme == "group4",]$pattern) +
  scale_x_discrete(limits = colordf[colordf$scheme == "group4",]$limits,
                   breaks = colordf[colordf$scheme == "group4",]$limits,
                   labels = colordf[colordf$scheme == "group4",]$labels) +
  theme_classic() +
  theme(axis.text.x = element_blank(),
        legend.position = "bottom") +
  xlab("") +
  ylim(c(0, 3500)) +
  facet_grid(eff_threshold ~ experiment) +
  guides(fill = guide_legend(nrow = 3, byrow=TRUE))
p

pdf("paper_figures/Supplement/bar.pdf",
    width = 11, height = 5)
p
dev.off()

## quartz_off_screen 
##                 2

Removing level thresholds that we don’t use, just keeping the 1.5 threshold

# removing effect size threshold we don't use
finaldf <- filter(finaldf, eff_threshold == "level") |> 
  select(-eff_threshold)

Group codes

I am 99% sure these group codes are no longer used in any of the figure scripts, but keeping them in just in case. These incorporate all the level and plasticiy divergence information from the other columns into one column The idea was that these codes might effectively group certain types of genes together more than the individual properties. As we discovered in the course of the analyses of the paper, however, level and plasticity divergence do not have any relationship with each other, and the types of plasticity divergence observed tended to have more to do with the environment than the particular genes that had that divergence pattern.

### longer group codes
# gene groups factor in a) whether level is conserved, b) whether dynamics are conserved,
# and c) if level or dynamics are diverged, which direction that is happening in (i.e. up in Scer, or 2 in Scer and 1 in Spar)
# group code format: type_direction_cluster(s)
# type: <cons, lev, dyn, or levdyn>
# direction (of level divergence): <uppar, upcer, or empty>
# clusters: <0, 1, 2, 01, 02, etc.>
finaldf$group <- apply(finaldf, 1, \(x) {
  x_cer <- x["cer"] |> as.numeric()
  x_par <- x["par"] |> as.numeric()
  x_effect_size <- x["effect_size_species"] |> as.numeric()
  x_level <- x["level"] |> as.character()
  x_plasticity <- x["plasticity"] |> as.character()
  if (x_level != "diverged" & x_plasticity == "conserved") {
    type_str <- "cons"
  }
  if (x_level == "diverged" & x_plasticity == "conserved") {
    type_str <- "lev"
  }
  if (x_level != "diverged" & x_plasticity == "diverged") {
    type_str <- "dyn"
  }
  if (x_level == "diverged" & x_plasticity == "diverged") {
    type_str <- "levdyn"
  }
  if (x_level != "diverged") {
    dir_str <- NULL
  }
  if (x_level == "diverged") {
    if (x_effect_size > 0) {
      dir_str <- "upcer"
    }
    if (x_effect_size < 0) {
      dir_str <- "uppar"
    }
  }
  if (identical(x_cer, x_par)) {
    full_str <- paste0(type_str, dir_str, x_cer)
  }
  if (!identical(x_cer, x_par)) {
    full_str <- paste0(type_str, dir_str, x_cer, x_par)
  }
  return(full_str)
}) |> unlist()

Saving and exporting final data frame to Supplementary Table 1

save(finaldf, colordf, ExperimentNames,
     LongExperimentNames, file = "data_files/FinalDataframe3Disp.RData")

Renaming some column names and selecting specific finaldf columns for the supplementary table, so that it appears as close to the workflow figure as possible

supp_tab1 <- finaldf |> select(gene_name, experiment, effect_size_species, padj_species, cer, par, level, plasticity) |>
  dplyr::rename(c("gene"="gene_name", "log2_fold_change"="effect_size_species",
                  "padj"="padj_species", "cluster_Scer"="cer", "cluster_Spar"="par"))
# changing cluster names for clarity
# 0 = "static"
# 1 = "increasing"
# 2 = "decreasing"
supp_tab1$cluster_Scer <-
  if_else(supp_tab1$cluster_Scer == 0,
          true = "static",
          false = if_else(supp_tab1$cluster_Scer == 1,
                          true = "increasing",
                          false = "decreasing"))
supp_tab1$cluster_Spar <-
  if_else(supp_tab1$cluster_Spar == 0,
          true = "static",
          false = if_else(supp_tab1$cluster_Spar == 1,
                          true = "increasing",
                          false = "decreasing"))
write_delim(supp_tab1, delim = ",", file = "data_files/Supplementary_Table1.csv",
            col_names = TRUE)

Gene Ontology Enrichment

Creating a GOslim table recording all the gene ontology terms associated with each gene (filtering out a few terms that are a little too vague to be useful, such as “cellular process”).

# table of gene ontology terms associated with each gene
# each row is a term, so this is a very long table
goslim <- read.table("data_files/downloaded_genomes_and_features/go_slim_mapping.tab", header = FALSE, sep = "\t") |>
  as_tibble()
colnames(goslim) <- c("ORF", # (mandatory) - Systematic name of the gene (or gene complex if it starts with CPX-)
                      "gene", # (optional) - Gene name, if one exists
                      "SGDID", # SGDID (mandatory) - the SGDID, unique database identifier for the gene
                      "GO_aspect", # (mandatory) - which ontology: P=Process, F=Function, C=Component
                      "GOslim_term", # (mandatory) - the name of the GO term that was selected as a GO Slim term
                      "GOID", # (optional) - the unique numerical identifier of the GO term
                      "feature_type") # (mandatory) - a description of the sequence feature, such as ORF or tRNA
too_vague_terms <- c("cellular process", "molecular function", "biological process")
goslim <- filter(goslim, !(GOslim_term %in% too_vague_terms) & 
                   (ORF %in% finaldf$gene_name))
save(goslim, file = "data_files/GO_Slim.RData")