MDD_GWAS_Analysis

Karin Funaki

library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.4.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   4.0.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(cluster)
## Warning: package 'cluster' was built under R version 4.4.3

GWAS Data Preparation

data <- read.csv("MDDsymptoms GSEM 2024-02.xlsx - S8 Genetic multiple regre.csv")
head(data)
##   Phenotype    Factor    Model STD_Genotype STD_Genotype_SE      p_value
## 1    AlcDep  Clinical Multiple   0.12084261      0.37047696 7.440459e-01
## 2    AlcDep  Clinical   Single   0.47274439      0.07271519 7.960000e-11
## 3    AlcDep Community Multiple   0.55399853      0.45147597 2.197182e-01
## 4    AlcDep Community   Single   0.47709732      0.08223088 6.560000e-09
## 5    AlcDep    Gating Multiple   0.02508362      0.13430285 8.517884e-01
## 6    AlcDep    Gating   Single   0.56751584      0.13250806 1.850000e-05
##        fdr
## 1 1.00e+00
## 2 1.05e-09
## 3 1.00e+00
## 4 7.41e-08
## 5 1.00e+00
## 6 1.70e-04

Single Models

data_single <- data %>% filter(Model == "Single")
head(data_single)
##   Phenotype          Factor  Model STD_Genotype STD_Genotype_SE  p_value
## 1    AlcDep        Clinical Single    0.4727444      0.07271519 7.96e-11
## 2    AlcDep       Community Single    0.4770973      0.08223088 6.56e-09
## 3    AlcDep          Gating Single    0.5675158      0.13250806 1.85e-05
## 4    AlcDep Appetite/Weight Single    0.5459750      0.09422033 6.85e-09
## 5    AlcDep      Depression Single    0.4723776      0.07280862 8.69e-11
## 6   Anxiety        Clinical Single    0.7067274      0.05026003 6.56e-45
##        fdr
## 1 1.05e-09
## 2 7.41e-08
## 3 1.70e-04
## 4 7.61e-08
## 5 1.12e-09
## 6 6.04e-43
#Matrix of standardized genotype slope in the linear regression model in Factor vs. Phenotype 
data_genotype <- data_single %>% select("Factor", "Phenotype", "STD_Genotype", "fdr") %>% mutate(fdr = as.numeric(fdr), sig_label = case_when(fdr < 0.001 ~ "***", fdr < 0.01 ~ "**", fdr < 0.05 ~ "*", TRUE ~ ""))

ggplot(data_genotype, aes(x = Phenotype, y = Factor, fill = STD_Genotype)) + geom_tile() +  scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0, limits = c(-max(data_genotype$STD_Genotype), max(data_genotype$STD_Genotype))) + ggtitle("Heatmap of Standardized Slope of Single Regression Models") + geom_text(aes(label = sig_label))

#For Multiple Regression Models:
data_multiple <- data %>% filter(Model == "Multiple")
head(data_single)
##   Phenotype          Factor  Model STD_Genotype STD_Genotype_SE  p_value
## 1    AlcDep        Clinical Single    0.4727444      0.07271519 7.96e-11
## 2    AlcDep       Community Single    0.4770973      0.08223088 6.56e-09
## 3    AlcDep          Gating Single    0.5675158      0.13250806 1.85e-05
## 4    AlcDep Appetite/Weight Single    0.5459750      0.09422033 6.85e-09
## 5    AlcDep      Depression Single    0.4723776      0.07280862 8.69e-11
## 6   Anxiety        Clinical Single    0.7067274      0.05026003 6.56e-45
##        fdr
## 1 1.05e-09
## 2 7.41e-08
## 3 1.70e-04
## 4 7.61e-08
## 5 1.12e-09
## 6 6.04e-43
#Matrix of standardized genotype slope in the linear regression model in Factor vs. Phenotype 
data_genotype_multiple <- data_multiple %>% select("Factor", "Phenotype", "STD_Genotype", "fdr") %>% mutate(fdr = as.numeric(fdr), sig_label = case_when(fdr < 0.001 ~ "***", fdr < 0.01 ~ "**", fdr < 0.05 ~ "*", TRUE ~ ""))

ggplot(data_genotype_multiple, aes(x = Phenotype, y = Factor, fill = STD_Genotype)) + geom_tile() +  scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0, limits = c(-max(data_genotype_multiple$STD_Genotype), max(data_genotype_multiple$STD_Genotype))) + ggtitle("Heatmap of Standardized Slope of Multile Regression Models") + geom_text(aes(label = sig_label))

data_genotype_multiple %>% filter(sig_label != "") %>% ggplot(aes(x = Phenotype, y = STD_Genotype, fill = Phenotype)) + geom_col() + facet_wrap(~ Factor, scales = "free_y") + theme(axis.text.x = element_text(angle = 45, hjust = 1))

multiple_matrix <- data_multiple %>% select(Phenotype, Factor, STD_Genotype) %>% pivot_wider(names_from = Phenotype, values_from = STD_Genotype, id_cols = Factor) %>% column_to_rownames(var = "Factor")
correlation_matrix <- cor(multiple_matrix, method = "spearman")
distance_matrix <- as.dist(1 - correlation_matrix)
hier_clust_ward <- hclust(distance_matrix, method = "ward.D2")
plot(hier_clust_ward,main = "Hierarchical Clustering of MDD Phenotypes with Ward Method")

hier_clust_avg <- hclust(distance_matrix, method = "average")
plot(hier_clust_avg,main = "Hierarchical Clustering of MDD Phenotypes with Average Method")

Both ward and average linkage methods produced dendrograms with the same order.

#Deciding the Number of Clusters

heights_ward <- hier_clust_ward$height
plot(seq_along(heights_ward), rev(heights_ward), type = "b", xlab = "Number of Clusters", ylab = "Height", main = "Elbow Plot for the Ward Method")

heights_avg <- hier_clust_avg$height
plot(seq_along(heights_avg), rev(heights_avg), type = "b", xlab = "Number of Clusters", ylab = "Height", main = "Elbow Plot for the Average Method")

I chose k=3 since that is where the elbow plots start to become flat.

#Comparing the Ward Method with the Average Method

clusters_ward <- cutree(hier_clust_ward, k=3)
clusters_avg <- cutree(hier_clust_avg, k=3)

sil_ward <- silhouette(clusters_ward, distance_matrix)
avg_sil_ward <- summary(sil_ward)$avg.width
avg_sil_ward
## [1] 0.9125348
sil_avg <- silhouette(clusters_avg, distance_matrix)
avg_sil_avg <- summary(sil_avg)$avg.width
avg_sil_avg
## [1] 0.9125348

The average silhouette widths are the same for both ward and average methods, indicating that the clusters in this data are not sensitive to the linkage criterion. A silhouette width of 0.91 suggests strong clusters.

print(clusters_ward)
##  AlcDep Anxiety     BIP     BMI      EA      MD     MDD     Neu    PTSD    Pain 
##       1       2       2       3       2       2       2       2       2       1 
##   Sleep Smoking 
##       1       3
print(clusters_avg)
##  AlcDep Anxiety     BIP     BMI      EA      MD     MDD     Neu    PTSD    Pain 
##       1       2       2       3       2       2       2       2       2       1 
##   Sleep Smoking 
##       1       3

Affective and psychiatric phenotypes such as Anxiety, PTSD, Bipolar Disorder, Neuroticism, and Major Depression are in the same cluster as the Major Depressive Disorder as well as Educational Attainment. Some behavioral and external phenotypes including Chronic Pain, Alcohol Dependence, and Sleep are in another cluster. BMI and Smoking are in their own cluster, separated from other behavioral and external health-related phenotypes.Because I used correlation matrix for hierarchical clustering, the results show that the phenotypes in each cluster share similar patterns of genetic influence on the MDD latent factors, suggesting polygenic structures among them.