RNASeq Navarra

https://irycisbioinfo.github.io/ReportNavarraRNASeq/

RNASeq Report

All the data of this analysis can be found in https://github.com/irycisBioinfo/ReportNavarraRNASeq

library(DESeq2)
library(tidyverse)
library(ggrepel)
library(ggsci)

Load Data

Data from primary analisis workflow https://nf-co.re/rnaseq/3.14.0/ are loaded among metadata.

reads_genes <- read_tsv("/storage2/Hincode/S-60-47-61-2024-RNASeq_0240725_LH00206_0043_A22MLN2LT3/navarra/results_strand/star_salmon/salmon.merged.gene_counts_length_scaled.tsv")

reads_transcripts <- read_tsv("/storage2/Hincode/S-60-47-61-2024-RNASeq_0240725_LH00206_0043_A22MLN2LT3/navarra/results_strand/star_salmon/salmon.merged.transcript_counts.tsv")

ids <- read_tsv("/storage2/Hincode/S-60-47-61-2024-RNASeq_0240725_LH00206_0043_A22MLN2LT3/navarra/results_strand/star_salmon/salmon_tx2gene.tsv", col_names = c("Transcrip_ID","Gene_ID","Gene_Name"))

metadata <- read_csv("/storage2/Hincode/S-60-47-61-2024-RNASeq_0240725_LH00206_0043_A22MLN2LT3/navarra/metadata.csv")

mart <- biomaRt::useMart(biomart = "ENSEMBL_MART_ENSEMBL",
                dataset = "hsapiens_gene_ensembl",
                host = 'https://www.ensembl.org')

ttg <- biomaRt::getBM(
  attributes = c("ensembl_transcript_id","ensembl_gene_id","external_gene_name", "description",
                 "entrezgene_id","gene_biotype","transcript_biotype"),
  mart = mart)

Quality Control from the primary analysis could be found https://irycisbioinfo.github.io/ReportNavarraRNASeq/multiqc/star_salmon/multiqc_report.html

The analysis workflow aligns the reads to the reference genome’s transcripts. This allows us to perform analysis at two levels: the transcript level and the gene level, which provides a summary of all transcripts associated with each gene.

Transcript Level

Preparing data format

table_sum <- reads_transcripts %>% 
  pivot_longer(names_to = "Sample", values_to = "counts", -(tx:gene_id)) %>% 
  group_by(tx,gene_id) %>% 
  mutate(TotalGeneCounts = sum(counts), 
         TotalZerosCounts = sum(counts==0))
  
table_sum %>% ggplot(aes(x = TotalGeneCounts)) + geom_histogram() + geom_vline(xintercept = 10)+scale_x_log10()

Seen the distribution of the read counts we decide to filter those transcript with less than 10 reads or that are 0 in more than 3 samples.

table_counts <- table_sum %>%
  ungroup() %>% 
  filter(TotalGeneCounts > 10) %>% 
  filter(TotalZerosCounts < 3) %>% 
  dplyr::select(tx,Sample,counts) %>% 
  pivot_wider(names_from = Sample, values_from = counts, values_fill = 0) %>% 
  column_to_rownames("tx") %>% 
  round() %>% 
  as.matrix()

Creating a regression model and performing Differential Expression Analysis

dds_transcript <-DESeqDataSetFromMatrix(countData = table_counts, 
                                   colData = metadata %>% column_to_rownames("Sample"), 
                                   design = ~Grupo)

converting counts to integer mode

Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
design formula are characters, converting to factors

dds_transcript <- DESeq(dds_transcript,sfType = "poscounts", fitType = "mean")

estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

Quality Control

Check the normalization and DE results

reads_transcripts %>% pivot_longer(names_to = "Sample", values_to = "counts", -(tx:gene_id)) %>% 
  group_by(Sample) %>% 
  summarise(TotalCounts = sum(counts)) %>% 
  full_join(metadata) %>% 
  ggplot(aes(x= Sample, y = TotalCounts, fill = Grupo)) + geom_col() + coord_flip() + theme_light()+ labs(title = "Raw Data")

Joining with `by = join_by(Sample)`

counts(dds_transcript, normalized = T) %>% as_tibble(rownames = "tx") %>% pivot_longer(names_to = "Sample", values_to = "counts", -tx) %>% 
  group_by(Sample) %>% 
  summarise(TotalCounts = sum(counts)) %>% 
  full_join(metadata) %>% 
  ggplot(aes(x= Sample, y = TotalCounts, fill = Grupo)) + geom_col() + coord_flip() + theme_light() + labs(title = "Normalize Data")

Joining with `by = join_by(Sample)`

mds <- vegan::metaMDS(counts(dds_transcript, normalized = T) %>% t())

Square root transformation
Wisconsin double standardization
Run 0 stress 0 
Run 1 stress 0 
... Procrustes: rmse 0.0291793  max resid 0.04557835 
Run 2 stress 9.548865e-05 
... Procrustes: rmse 0.2081779  max resid 0.290838 
Run 3 stress 0.0003889057 
... Procrustes: rmse 0.1453818  max resid 0.2098872 
Run 4 stress 5.617026e-05 
... Procrustes: rmse 0.2247752  max resid 0.4649492 
Run 5 stress 8.098997e-05 
... Procrustes: rmse 0.2104377  max resid 0.3823652 
Run 6 stress 9.942537e-05 
... Procrustes: rmse 0.08345882  max resid 0.1133512 
Run 7 stress 0 
... Procrustes: rmse 0.06501365  max resid 0.1200745 
Run 8 stress 0 
... Procrustes: rmse 0.07343435  max resid 0.1365268 
Run 9 stress 0 
... Procrustes: rmse 0.1403157  max resid 0.2655851 
Run 10 stress 9.327804e-05 
... Procrustes: rmse 0.2119679  max resid 0.4193939 
Run 11 stress 9.734295e-05 
... Procrustes: rmse 0.07975466  max resid 0.1116435 
Run 12 stress 2.928156e-05 
... Procrustes: rmse 0.1756845  max resid 0.3418646 
Run 13 stress 4.334659e-05 
... Procrustes: rmse 0.1720234  max resid 0.3282976 
Run 14 stress 0 
... Procrustes: rmse 0.1460413  max resid 0.2309672 
Run 15 stress 9.944218e-06 
... Procrustes: rmse 0.1606056  max resid 0.2815027 
Run 16 stress 0 
... Procrustes: rmse 0.09459989  max resid 0.185337 
Run 17 stress 0 
... Procrustes: rmse 0.0997192  max resid 0.1768916 
Run 18 stress 8.52166e-05 
... Procrustes: rmse 0.2131113  max resid 0.3332029 
Run 19 stress 9.190148e-05 
... Procrustes: rmse 0.2257334  max resid 0.461816 
Run 20 stress 9.751302e-05 
... Procrustes: rmse 0.08435188  max resid 0.1137656 
*** Best solution was not repeated -- monoMDS stopping criteria:
     1: no. of iterations >= maxit
    19: stress < smin

Warning in vegan::metaMDS(counts(dds_transcript, normalized = T) %>% t()):
stress is (nearly) zero: you may have insufficient data

mds$points %>% 
  as.data.frame() %>% 
  rownames_to_column("Sample") %>% 
  inner_join(metadata) %>% 
  ggplot(aes(x = MDS1, y = MDS2, fill = Grupo, label =Sample)) + geom_label() +
  theme_light()

Joining with `by = join_by(Sample)`

The X Group seens more heteregenous than the group P. This get worse the quality of the Differential Expression Test.

Differential Expression Test

resultsNames(dds_transcript)

[1] "Intercept"    "Grupo_X_vs_P"

de_test <- results(dds_transcript) %>% 
  as_tibble(rownames = "Transcrip_ID") %>% inner_join(ids)

Joining with `by = join_by(Transcrip_ID)`

de_test%>% 
  mutate(sig = ifelse(padj < 0.005,"Significative","No-Significative")) %>%
  mutate(label = ifelse(sig == "Significative",Gene_Name,NA)) %>% 
  ggplot(aes(x= log2FoldChange, 
             y = -log10(padj), 
             color = sig, 
             size = sqrt(baseMean), 
             label = label)) +
  geom_point(alpha = 0.5) +
  geom_text_repel(size = 2)+
  scale_color_d3() +
  theme_light() + 
  labs(title = "Volcano Plot Grupo_X_vs_P")

Warning: Removed 7693 rows containing missing values or values outside the scale range
(`geom_point()`).

Warning: Removed 94886 rows containing missing values or values outside the scale range
(`geom_text_repel()`).

Warning: ggrepel: 49 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

de_test%>% 
  mutate(sig = ifelse(padj < 0.005,"Significative","No-Significative")) %>%
  mutate(label = ifelse(sig == "Significative",Gene_Name,NA)) %>% 
  ggplot(aes(x= baseMean, 
             y = log2FoldChange, 
             color = sig, 
             size = sqrt(baseMean), 
             label = label)) +
  geom_point(alpha = 0.5) +
  geom_text_repel(size = 1)+
  scale_color_d3() +
  scale_x_log10()+
  theme_light() + 
  labs(title = "MA Plot Grupo_X_vs_P")

Warning: Removed 7693 rows containing missing values or values outside the scale range
(`geom_point()`).

Warning: Removed 94886 rows containing missing values or values outside the scale range
(`geom_text_repel()`).

Warning: ggrepel: 13 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

Volcano and MA plot shown several transcript differential expressed. In order to see better this differences we plot all the comparison individually.

significativos <- de_test%>% 
  filter(padj < 0.005) %>% pull(Transcrip_ID)

counts(dds_transcript, normalized = T) %>% 
  as.data.frame() %>% 
  rownames_to_column("tx") %>% 
  filter(tx %in% significativos) %>% 
  pivot_longer(names_to = "Sample",values_to = "counts", -tx) %>% 
  inner_join(metadata) %>% 
  ggplot(aes(x = Grupo, y = counts, fill = Grupo)) + geom_boxplot() + facet_wrap(~tx, scales = "free_y") + scale_fill_d3() + theme_light()

Joining with `by = join_by(Sample)`

Funtional Annotation

To create a biological context for the differential expressed transcript we are traying to infer if there are some funtional annotation (GO Terms) or pathway (KEGG) over-represented in the transcript-set.

library(clusterProfiler)

clusterProfiler v4.12.6 Learn more at https://yulab-smu.top/contribution-knowledge-mining/

Please cite:

T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan,
X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal
enrichment tool for interpreting omics data. The Innovation. 2021,
2(3):100141


Adjuntando el paquete: 'clusterProfiler'

The following object is masked from 'package:purrr':

    simplify

The following object is masked from 'package:IRanges':

    slice

The following object is masked from 'package:S4Vectors':

    rename

The following object is masked from 'package:stats':

    filter

library(org.Hs.eg.db)

Cargando paquete requerido: AnnotationDbi


Adjuntando el paquete: 'AnnotationDbi'

The following object is masked from 'package:clusterProfiler':

    select

The following object is masked from 'package:dplyr':

    select

sig_transcripts <- de_test %>% 
  inner_join(ttg, by = c("Transcrip_ID" = "ensembl_transcript_id")) %>% filter(padj < 0.005) %>% 
  dplyr::select(entrezgene_id) %>% 
    distinct() %>% 
    drop_na() %>%  
    pull(entrezgene_id)
  
GO_enrichment <- enrichGO(gene =sig_transcripts,
                                   OrgDb         = org.Hs.eg.db,
                                    ont           = "ALL",
                                    pAdjustMethod = "BH",
                                    pvalueCutoff  = 0.01,
                                    qvalueCutoff  = 0.05,
                                    readable      = TRUE)

if ((GO_enrichment %>% as.data.frame() %>% nrow()) > 0  )
{
  barplot(GO_enrichment, showCategory = 20)
} else{
  print("No enriched Pathways")
}

[1] "No enriched Pathways"

KEGG_enrichment <- enrichKEGG(gene = sig_transcripts,
                                        organism     = 'hsa',
                                        pvalueCutoff = 0.05)

Reading KEGG annotation online: "https://rest.kegg.jp/link/hsa/pathway"...

Reading KEGG annotation online: "https://rest.kegg.jp/list/pathway/hsa"...

if ((KEGG_enrichment %>% as.data.frame() %>% nrow()) > 0  )
{
  barplot(KEGG_enrichment, showCategory = 20)
} else{
  print("No enriched Pathways")
}

[1] "No enriched Pathways"

There are not over-represented funtional annotation or pathway.

Gene Level

At this point we repeat de analysis but at Gene Level.

table_sum <- reads_genes %>% 
  pivot_longer(names_to = "Sample", values_to = "counts", -(gene_id:gene_name)) %>% 
  group_by(gene_id) %>% 
  mutate(TotalGeneCounts = sum(counts), 
         TotalZerosCounts = sum(counts==0))
  
table_sum %>% ggplot(aes(x = TotalGeneCounts)) + geom_histogram() + geom_vline(xintercept = 10)+scale_x_log10()

Warning in scale_x_log10(): log-10 transformation introduced infinite values.

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Warning: Removed 144900 rows containing non-finite outside the scale range
(`stat_bin()`).

table_counts <- table_sum %>% 
  ungroup() %>% 
  filter(TotalGeneCounts > 10) %>% 
  filter(TotalZerosCounts < 3) %>% 
  dplyr::select(gene_id,Sample,counts) %>% 
  pivot_wider(names_from = Sample, values_from = counts, values_fill = 0) %>% 
  column_to_rownames("gene_id") %>% 
  round() %>% 
  as.matrix()

We use the same criteria to filter low quality genes.

dds_genes <-DESeqDataSetFromMatrix(countData = table_counts, 
                                   colData = metadata %>% column_to_rownames("Sample"), 
                                   design = ~Grupo)

converting counts to integer mode

Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
design formula are characters, converting to factors

dds_genes <- DESeq(dds_genes,sfType = "poscounts", fitType = "mean")

estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

Quality Control

reads_genes %>% pivot_longer(names_to = "Sample", values_to = "counts", -(gene_id:gene_name)) %>% 
  group_by(Sample) %>% 
  summarise(TotalCounts = sum(counts)) %>% 
  full_join(metadata) %>% 
  ggplot(aes(x= Sample, y = TotalCounts, fill = Grupo)) + geom_col() + coord_flip() + theme_light()+ labs(title = "Raw Data")

Joining with `by = join_by(Sample)`

counts(dds_genes, normalized = T) %>% as_tibble(rownames = "genes_id") %>% pivot_longer(names_to = "Sample", values_to = "counts", -genes_id) %>% 
  group_by(Sample) %>% 
  summarise(TotalCounts = sum(counts)) %>% 
  full_join(metadata) %>% 
  ggplot(aes(x= Sample, y = TotalCounts, fill = Grupo)) + geom_col() + coord_flip() + theme_light() + labs(title = "Normalize Data")

Joining with `by = join_by(Sample)`

pca <- vegan::metaMDS(counts(dds_genes, normalized = T) %>% t())

Square root transformation
Wisconsin double standardization
Run 0 stress 3.011127e-06 
Run 1 stress 6.479562e-06 
... Procrustes: rmse 0.1722707  max resid 0.2337154 
Run 2 stress 9.960007e-05 
... Procrustes: rmse 0.1717837  max resid 0.3605271 
Run 3 stress 9.864728e-05 
... Procrustes: rmse 0.09142828  max resid 0.1268886 
Run 4 stress 9.846717e-05 
... Procrustes: rmse 0.06694584  max resid 0.1107112 
Run 5 stress 0.0002527055 
... Procrustes: rmse 0.2839842  max resid 0.4818229 
Run 6 stress 0 
... New best solution
... Procrustes: rmse 0.1247765  max resid 0.2513806 
Run 7 stress 9.870621e-05 
... Procrustes: rmse 0.1013454  max resid 0.192716 
Run 8 stress 8.199781e-05 
... Procrustes: rmse 0.138157  max resid 0.1901614 
Run 9 stress 9.471227e-05 
... Procrustes: rmse 0.1359267  max resid 0.207019 
Run 10 stress 9.618336e-05 
... Procrustes: rmse 0.217148  max resid 0.3902685 
Run 11 stress 9.473049e-05 
... Procrustes: rmse 0.1148755  max resid 0.2127429 
Run 12 stress 0.0007052933 
Run 13 stress 9.536384e-05 
... Procrustes: rmse 0.1207609  max resid 0.2144415 
Run 14 stress 8.61864e-05 
... Procrustes: rmse 0.1503647  max resid 0.2618885 
Run 15 stress 9.920266e-05 
... Procrustes: rmse 0.2216172  max resid 0.2976934 
Run 16 stress 0.0007465923 
Run 17 stress 0.276134 
Run 18 stress 9.766822e-05 
... Procrustes: rmse 0.1251696  max resid 0.1791651 
Run 19 stress 0.0001227039 
... Procrustes: rmse 0.2613389  max resid 0.4705428 
Run 20 stress 6.484234e-05 
... Procrustes: rmse 0.1479421  max resid 0.2603061 
*** Best solution was not repeated -- monoMDS stopping criteria:
     4: no. of iterations >= maxit
    15: stress < smin
     1: stress ratio > sratmax

Warning in vegan::metaMDS(counts(dds_genes, normalized = T) %>% t()): stress is
(nearly) zero: you may have insufficient data

pca$points %>% 
  as.data.frame() %>% 
  rownames_to_column("Sample") %>% 
  inner_join(metadata) %>% 
  ggplot(aes(x = MDS1, y = MDS2, fill = Grupo, label =Sample)) + geom_label() +
  theme_light()

Joining with `by = join_by(Sample)`

We obtain the same results at heterogeneity level of the samples.

Differential Expression Test

resultsNames(dds_genes)

[1] "Intercept"    "Grupo_X_vs_P"

de_test <- results(dds_genes) %>% 
  as_tibble(rownames = "Gene_ID") %>% inner_join(ids)

Joining with `by = join_by(Gene_ID)`

de_test%>% 
  mutate(sig = ifelse(padj < 0.01,"Significative","No-Significative")) %>%
  mutate(label = ifelse(sig == "Significative",Gene_Name,NA)) %>% 
  ggplot(aes(x= log2FoldChange, 
             y = -log10(padj), 
             color = sig, 
             size = sqrt(baseMean), 
             label = label)) +
  geom_point(alpha = 0.5) +
  geom_text_repel(size = 1)+
  scale_color_d3() +
  theme_light() + 
  labs(title = "Volcano Plot Grupo_X_vs_P")

Warning: Removed 5373 rows containing missing values or values outside the scale range
(`geom_point()`).

Warning: Removed 183875 rows containing missing values or values outside the scale range
(`geom_text_repel()`).

Warning: ggrepel: 29 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

de_test%>% 
  mutate(sig = ifelse(padj < 0.01,"Significative","No-Significative")) %>%
  mutate(label = ifelse(sig == "Significative",Gene_Name,NA)) %>% 
  ggplot(aes(x= baseMean, 
             y = log2FoldChange, 
             color = sig, 
             size = sqrt(baseMean), 
             label = label)) +
  geom_point(alpha = 0.5) +
  geom_text_repel(size = 1)+
  scale_color_d3() +
  scale_x_log10()+
  theme_light() + 
  labs(title = "MA Plot Grupo_X_vs_P")

Warning: Removed 5373 rows containing missing values or values outside the scale range
(`geom_point()`).

Warning: Removed 183875 rows containing missing values or values outside the scale range
(`geom_text_repel()`).

Warning: ggrepel: 29 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

significativos <- de_test%>% 
  filter(padj < 0.01) %>% pull(Gene_Name)

counts(dds_genes, normalized = T) %>% 
  as.data.frame() %>% 
  rownames_to_column("Gene_Name") %>% 
  filter( Gene_Name %in% significativos) %>% 
  pivot_longer(names_to = "Sample",values_to = "counts", -Gene_Name) %>% 
  inner_join(metadata) %>% 
  ggplot(aes(x = Grupo, y = counts, fill = Grupo)) + geom_boxplot() + facet_wrap(~Gene_Name, scales = "free_y") + scale_fill_d3() + theme_light()

Joining with `by = join_by(Sample)`

In this case we only see two genes deferentially expresed.