###This section for drafting up the script students will run
require(tidyverse)
Loading required package: tidyverse
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
-- Attaching packages ----------------------------------------------------------------------------- tidyverse 1.3.1 --
√ ggplot2 3.3.5 √ purrr 0.3.4
√ tibble 3.1.6 √ dplyr 1.0.8
√ tidyr 1.2.0 √ stringr 1.4.0
√ readr 2.1.2 √ forcats 0.5.1
-- Conflicts -------------------------------------------------------------------------------- tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
require(openxlsx)
Loading required package: openxlsx
require(ggplot2)
require(ggpubr)
Loading required package: ggpubr
PhenoGeno<-read.xlsx("Data_final.xlsx",1 )%>%
as.data.frame(stringsAsFactors=FALSE) ## Read the phenotype and genotypes within 1Mb of the CXCL10 gene into memory.
##Explore the dataset:
#what is the dataset size?
dim(PhenoGeno)
[1] 527 4107
#which populations are included in the study?
unique(PhenoGeno$Population)
[1] "IBS" "KHV" "GWD" "ESN"
#are they equally represented?
table(PhenoGeno$Population)
ESN GWD IBS KHV
129 179 157 62
#what's the frequency/range of CXCL10 values?
ggplot(data = PhenoGeno)+
geom_boxplot(aes(y= Chlamydia_IP10, x = Population, fill = Sex))+
#geom_histogram(aes(x=Chlamydia_IP10), binwidth = 10)+
theme_bw()
#how frequently do variants occur in the region?
PhenoGeno[-c(1:4)] %>%
colSums() %>%
enframe() %>%
na.omit()-> variant_freq
ggplot(data = variant_freq)+
geom_histogram(aes(x=value), binwidth = 10)+
xlim(100,600)+
theme_bw()
Warning: Removed 2 rows containing missing values (geom_bar).
#what is the most common variant?
max(variant_freq$value)
[1] 533
variant_freq[variant_freq$value == 533,]
#How many cells lines from men and women are there?
table(PhenoGeno$Sex)
female male
248 279
#Is that equally represented by population? Modify/add to the below code to calculate the % of cell lines from male or female individuals in each population.
PhenoGeno %>%
group_by(Population) %>% add_count(name = "cell lines") %>%
group_by(Population, `cell lines`) %>%
count(Sex, name = "donors") %>%
mutate("%" = donors/`cell lines`)
##Use the below data.frame to answer the following questions.
PhenoGeno.long <- pivot_longer(PhenoGeno[-4], cols = !c(LCL_ID, Sex, Population), names_to = "variant", values_to = "value")
#Which population has the most variants? Describe the data transformations that occur to come to an answer.
PhenoGeno.long[PhenoGeno.long == 2] <- 1
PhenoGeno.long %>%
group_by(Population, LCL_ID) %>%
summarise("variant.sum" = sum(value)) %>%
group_by(Population) %>%
summarise("avg.variant" = mean(variant.sum),
"sd" = sd(variant.sum))
`summarise()` has grouped output by 'Population'. You can override using the `.groups` argument.
#How many unique variants occur in each population? How many occur in 2 of 4 populations?
PhenoGeno.long[PhenoGeno.long == 0] <- NA
na.omit(PhenoGeno.long) %>%
group_by(variant) %>%
count(Population) %>%
pivot_wider(id_cols = Population, names_from = variant, values_from = n) -> variants_by_pop
variants_by_pop[,colSums(is.na(variants_by_pop))==3]
variants_by_pop[,colSums(is.na(variants_by_pop))==2]
Explore other data analysis questions you have! Be careful not to overwrite the PhenoGeno data frame.
##Determine if the phenotype should be used for association testing [why we check for normality]
#add density plot of normal distribution overlap to plots.
ggplot(PhenoGeno, aes(x = Chlamydia_IP10)) + geom_density() +
theme_bw(base_size = 14) ##Data are severely skewed, let's see if we can transform it into something usable
ggplot(PhenoGeno, aes(x = Chlamydia_IP10)) + geom_density() +
theme_bw(base_size = 14) + scale_x_continuous(trans = "log2") ##Data now fit a normal distribution, acceptable for association testing
PhenoGeno$log2_Chlamydia_IP10<-log2(PhenoGeno$Chlamydia_IP10) ##Create a column of the data frame with the log2transformed phenotype
#statistical test for normality
shapiro.test(PhenoGeno$Chlamydia_IP10)
Shapiro-Wilk normality test
data: PhenoGeno$Chlamydia_IP10
W = 0.67467, p-value < 2.2e-16
shapiro.test(PhenoGeno$log2_Chlamydia_IP10)
Shapiro-Wilk normality test
data: PhenoGeno$log2_Chlamydia_IP10
W = 0.98031, p-value = 1.506e-06
#QQ plots: QQ plots compare the distribution of our data against a normal distribution but looking at estimated quantiles.
ggqqplot(PhenoGeno$Chlamydia_IP10)
ggqqplot(PhenoGeno$log2_Chlamydia_IP10)
##Determine if the phenotype is confounded by other variables
ggplot(PhenoGeno, aes(x = Sex, y = log2_Chlamydia_IP10, fill = Sex)) + geom_boxplot(outlier.shape = NA) + geom_jitter(width = 0.2) +
stat_compare_means(method = "t.test") + theme_bw(base_size = 14) + theme(legend.position = "none") ##LCLs from female donors secrete more CXCL10 in response to Chlamydia infection
ggplot(PhenoGeno, aes(x = Population, y = log2_Chlamydia_IP10, fill = Population)) + geom_boxplot(outlier.shape = NA) + geom_jitter(width = 0.2) +
theme_bw(base_size = 14) + stat_compare_means(method = "anova") + theme(legend.position = "none") ##Data may also be confounded by Population effect
#would we interpret this differently if we hadn't normalized?
ggplot(data = PhenoGeno, aes(y = Chlamydia_IP10, x = Population, fill = Sex))+
geom_boxplot(outlier.shape = NA, position = position_dodge2(0.85, preserve = "single")) + geom_point(size = 1, position=position_jitterdodge(0.1)) +
#stat_compare_means(method = "t.test")+
theme_bw(base_size = 12)
#Break here to talk about how association testing works, linear regression, covariates, etc.
Results<-read.xlsx("Data_final.xlsx",2) ##Read the results of association testing into memory
ggplot(Results, aes(x = Pos, y = -log10(Pvalue))) + geom_point() +
theme_bw(base_size = 14) + geom_hline(yintercept = 7.3, color = "red") ## Can pause for discussion of why the threshold for significance is so high. multiple testing burden over 1M independent tests, etc.
##Instruct students to click the “Results” object and sort by “p-value” to find the top-associated SNP.
##Plot the top associated SNP
PhenoGeno$rs2869462<-as.factor(PhenoGeno$rs2869462) ##recode this as factor instead of numeric
ggplot(PhenoGeno, aes(x = rs2869462, y = log2_Chlamydia_IP10, fill = rs2869462)) + geom_boxplot(outlier.shape = NA) + geom_jitter(width = 0.2) +
theme_bw(base_size = 14) + theme(legend.position = "none") ##Individuals with the alternate/minor allele produce less CXCL10/IP10 than those with the reference/major allele
ggplot(PhenoGeno, aes(x = rs2869462, y = log2_Chlamydia_IP10, fill = rs2869462)) + geom_boxplot(outlier.shape = NA) + geom_jitter(width = 0.2) +
theme_bw(base_size = 14) + facet_grid(. ~ Sex) + theme(legend.position = "none") ## Regardless of Sex
ggplot(PhenoGeno, aes(x = rs2869462, y = log2_Chlamydia_IP10, fill = rs2869462)) + geom_boxplot(outlier.shape = NA) + geom_jitter(width = 0.2) +
theme_bw(base_size = 14) + facet_grid(. ~ Population) + theme(legend.position = "none") ## Regardless of Population
Do the same for the 2nd most significant SNPs
##Plot the top associated SNP
PhenoGeno$rs13130917<-as.factor(PhenoGeno$rs13130917) ##recode this as factor instead of numeric
ggplot(PhenoGeno, aes(x = rs13130917, y = log2_Chlamydia_IP10, fill = rs13130917)) + geom_boxplot(outlier.shape = NA) + geom_jitter(width = 0.2) +
theme_bw(base_size = 14) + theme(legend.position = "none") ##Individuals with the alternate/minor allele produce less CXCL10/IP10 than those with the reference/major allele
ggplot(PhenoGeno, aes(x = rs13130917, y = log2_Chlamydia_IP10, fill = rs13130917)) + geom_boxplot(outlier.shape = NA) + geom_jitter(width = 0.2) +
theme_bw(base_size = 14) + facet_grid(. ~ Sex) + theme(legend.position = "none") ## Regardless of Sex
ggplot(PhenoGeno, aes(x = rs13130917, y = log2_Chlamydia_IP10, fill = rs13130917)) + geom_boxplot(outlier.shape = NA) + geom_jitter(width = 0.2) +
theme_bw(base_size = 14) + facet_grid(. ~ Population) + theme(legend.position = "none") ## Regardless of Population
Those graphs look really similar. Do the two alleles primarily co-occur?
PhenoGeno$rs2869462 == PhenoGeno$rs13130917 #it looks like yes
[1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[19] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[37] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[55] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[73] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[91] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[109] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[127] TRUE TRUE TRUE TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[145] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[163] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[181] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[199] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[217] TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[235] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[253] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[271] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[289] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[307] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[325] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[343] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[361] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
[379] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[397] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[415] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[433] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[451] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[469] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[487] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[505] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[523] TRUE TRUE TRUE TRUE TRUE
Where are the 2 highly significant SNPs located in the genome?
Results %>%
filter(SNP == "rs13130917" | SNP == "rs2869462")