5 Association tests - Part II
These exercises continue the introduction to genetic association testing. Here, we introduce multiple-variant association tests, which are commonly used for testing rare variants in aggregate.
5.1 Sliding window tests
We can perform burden, SKAT, SKAT-O, fastSKAT, and SMMAT tests using the GENESIS function assocTestAggregate
. First, we need to load the null model and AnnotatedDataFrame
(sample annotation + phenotype data) that we created in the previous session, and we need to create our SeqVarData
object linking the GDS file to the AnnotatedDataFrame
.
<- "https://github.com/UW-GAC/SISG_2021/raw/master"
repo_path if (!dir.exists("data")) dir.create("data")
# load our null model
<- "data/null_model.RData"
nullmodfile if (!file.exists(nullmodfile)) download.file(file.path(repo_path, nullmodfile), nullmodfile)
<- get(load(nullmodfile))
nullmod
# load our sample annotation
<- "data/sample_phenotype_annotation.RData"
annotfile if (!file.exists(annotfile)) download.file(file.path(repo_path, annotfile), annotfile)
<- get(load(annotfile))
annot
# open the GDS file
library(SeqVarTools)
<- "data/1KG_phase3_subset_chr1.gds"
gdsfile if (!file.exists(gdsfile)) download.file(file.path(repo_path, gdsfile), gdsfile)
::showfile.gds(closeall=TRUE) # make sure file is not already open
gdsfmt<- seqOpen(gdsfile)
gds
# make the seqVarData object
<- SeqVarData(gds, sampleData=annot) seqData
5.1.1 Burden test
First, we perform a burden test. We restrict the test to variants with alternate allele frequency < 0.1. (For real data, this threshold would be lower, perhaps 0.05 or 0.01.) We use a flat weighting scheme – i.e. every variant gets the same weight. We define a sliding window across the genome using a SeqVarWindowIterator
object.
# make the window iterator object
<- SeqVarWindowIterator(seqData, windowSize=10000, windowShift=5000, verbose=FALSE)
iterator
# run the burden test
library(GENESIS)
<- assocTestAggregate(iterator,
assoc
nullmod, test="Burden",
AF.max=0.1,
weight.beta=c(1,1),
verbose = FALSE)
The function returns the primary results for each window in one table. It also returns a list of tables that contain the variant details for each window tested.
names(assoc)
## [1] "results" "variantInfo"
# results for each window
head(assoc$results)
## chr start end n.site n.alt n.sample.alt Score Score.SE Score.Stat Score.pval Est Est.SE PVE
## 1 1 965001 975000 1 9 9 -0.1191236 0.2577712 -0.4621292 0.643988693 -1.792788 3.879410 0.0001905115
## 2 1 980001 990000 1 111 107 -1.6707553 0.8841849 -1.8895996 0.058811535 -2.137109 1.130985 0.0031851797
## 3 1 1020001 1030000 1 1 1 -0.2795838 0.1007173 -2.7759261 0.005504472 -27.561563 9.928781 0.0068740102
## 4 1 1260001 1270000 1 2 2 -0.1105487 0.1085480 -1.0184319 0.308472744 -9.382319 9.212515 0.0009252485
## 5 1 1465001 1475000 1 16 16 0.3630992 0.3456555 1.0504657 0.293504065 3.039054 2.893054 0.0009843694
## 6 1 1730001 1740000 1 5 5 -0.1300405 0.1973175 -0.6590420 0.509868790 -3.340007 5.067973 0.0003874544
# how many variants in each window?
table(assoc$results$n.site)
##
## 0 1 2
## 143 963 16
# variant details for windows with > 1 variant
<- which(assoc$results$n.site > 1)
idx head(assoc$variantInfo[idx])
## [[1]]
## variant.id chr pos allele.index n.obs freq MAC weight
## 1 147 1 27443183 1 1126 0.0013321492 3 1
## 2 148 1 27448645 1 1126 0.0004440497 1 1
##
## [[2]]
## variant.id chr pos allele.index n.obs freq MAC weight
## 1 207 1 41346174 1 1126 0.0004440497 1 1
## 2 208 1 41352776 1 1126 0.0017761989 4 1
##
## [[3]]
## variant.id chr pos allele.index n.obs freq MAC weight
## 1 246 1 51376035 1 1126 0.0004440497 1 1
## 2 247 1 51381953 1 1126 0.0013321492 3 1
##
## [[4]]
## variant.id chr pos allele.index n.obs freq MAC weight
## 1 279 1 57342828 1 1126 0.0062166963 14 1
## 2 280 1 57344100 1 1126 0.0004440497 1 1
##
## [[5]]
## variant.id chr pos allele.index n.obs freq MAC weight
## 1 293 1 59912193 1 1126 0.001776199 4 1
## 2 294 1 59917976 1 1126 0.003108348 7 1
##
## [[6]]
## variant.id chr pos allele.index n.obs freq MAC weight
## 1 356 1 73050313 1 1126 0.0368561279 83 1
## 2 357 1 73050352 1 1126 0.0008880995 2 1
We can make a QQ plot of the burden p-values from the main results table.
library(ggplot2)
<- function(pval) {
qqPlot <- pval[!is.na(pval)]
pval <- length(pval)
n <- 1:n
x <- data.frame(obs=sort(pval),
dat exp=x/n,
upper=qbeta(0.025, x, rev(x)),
lower=qbeta(0.975, x, rev(x)))
ggplot(dat, aes(-log10(exp), -log10(obs))) +
geom_line(aes(-log10(exp), -log10(upper)), color="gray") +
geom_line(aes(-log10(exp), -log10(lower)), color="gray") +
geom_point() +
geom_abline(intercept=0, slope=1, color="red") +
xlab(expression(paste(-log[10], "(expected P)"))) +
ylab(expression(paste(-log[10], "(observed P)"))) +
theme_bw()
}
# make a QQ plot of the burden test p-values
qqPlot(assoc$results$Score.pval)
5.1.2 SKAT test
We can also perform a SKAT test. This time, we will use the Wu weights, which give larger weights to rarer variants.
# reset the iterator to the first window
resetIterator(iterator)
## # of selected variants: 1
# run the SKAT test
<- assocTestAggregate(iterator,
assoc
nullmod, test="SKAT",
AF.max=0.1,
weight.beta=c(1,25),
verbose = FALSE)
# results for each window
head(assoc$results)
## chr start end n.site n.alt n.sample.alt Q pval err pval.method
## 1 1 965001 975000 1 9 9 7.318095 0.643988693 0 integration
## 2 1 980001 990000 1 111 107 154.178289 0.058811535 0 integration
## 3 1 1020001 1030000 1 1 1 47.823918 0.005504472 0 integration
## 4 1 1260001 1270000 1 2 2 7.319239 0.308472744 0 integration
## 5 1 1465001 1475000 1 16 16 58.518665 0.293504065 0 integration
## 6 1 1730001 1740000 1 5 5 9.499539 0.509868790 0 integration
# variant details for windows with > 1 variant
<- which(assoc$results$n.site > 1)
idx head(assoc$variantInfo[idx])
## [[1]]
## variant.id chr pos allele.index n.obs freq MAC weight
## 1 147 1 27443183 1 1126 0.0013321492 3 24.21284
## 2 148 1 27448645 1 1126 0.0004440497 1 24.73493
##
## [[2]]
## variant.id chr pos allele.index n.obs freq MAC weight
## 1 207 1 41346174 1 1126 0.0004440497 1 24.73493
## 2 208 1 41352776 1 1126 0.0017761989 4 23.95577
##
## [[3]]
## variant.id chr pos allele.index n.obs freq MAC weight
## 1 246 1 51376035 1 1126 0.0004440497 1 24.73493
## 2 247 1 51381953 1 1126 0.0013321492 3 24.21284
##
## [[4]]
## variant.id chr pos allele.index n.obs freq MAC weight
## 1 279 1 57342828 1 1126 0.0062166963 14 21.52488
## 2 280 1 57344100 1 1126 0.0004440497 1 24.73493
##
## [[5]]
## variant.id chr pos allele.index n.obs freq MAC weight
## 1 293 1 59912193 1 1126 0.001776199 4 23.95577
## 2 294 1 59917976 1 1126 0.003108348 7 23.20016
##
## [[6]]
## variant.id chr pos allele.index n.obs freq MAC weight
## 1 356 1 73050313 1 1126 0.0368561279 83 10.15145
## 2 357 1 73050352 1 1126 0.0008880995 2 24.47255
# make a QQ plot of the SKAT test p-values
qqPlot(assoc$results$pval)
5.2 Exercise
- Perform a sliding window SKAT test for the outcome status. Adjust your model for the covariates sex and study. When performing your SKAT test, use all variants with alternate allele frequency < 20%, and use the Wu weights to give larger weights to rarer variants. Use the same
windowSize
andwindowShift
as in the examples. How many windows have >1 variant? Make a QQ plot of the SKAT p-values.
5.3 Solution
- Perform a sliding window SKAT test for the outcome status. Adjust your model for the covariates sex and study. When performing your SKAT test, use all variants with alternate allele frequency < 20%, and use the Wu weights to give larger weights to rarer variants. Use the same
windowSize
andwindowShift
as in the examples. How many windows have >1 variant? Make a QQ plot of the SKAT p-values.
The first step is to fit our null model – since our outcome, status, is a binary variable, we must fit a logistic regression null model using the family = binomial
argument. The second step is to create our SeqVarWindowIterator
object. The third step is to perform the SKAT test using assocTestAggregate
– we can set the maximum alternate allele frequency with the AF.max
argument, and we can set the variant weights with the weight.beta
argument.
<- fitNullModel(annot,
nullmod.status outcome="status",
covars=c("sex", "study"),
family=binomial,
verbose=FALSE)
seqResetFilter(seqData, verbose=FALSE)
<- SeqVarWindowIterator(seqData, windowSize=10000, windowShift=5000, verbose=FALSE)
iterator
<- assocTestAggregate(iterator,
assoc
nullmod, test="SKAT",
AF.max=0.2,
weight.beta=c(1,25),
verbose = FALSE)
# results for each window
head(assoc$results)
## chr start end n.site n.alt n.sample.alt Q pval err pval.method
## 1 1 965001 975000 1 9 9 7.318095 0.643988693 0 integration
## 2 1 980001 990000 1 111 107 154.178289 0.058811535 0 integration
## 3 1 1020001 1030000 1 1 1 47.823918 0.005504472 0 integration
## 4 1 1260001 1270000 1 2 2 7.319239 0.308472744 0 integration
## 5 1 1465001 1475000 1 16 16 58.518665 0.293504065 0 integration
## 6 1 1730001 1740000 1 5 5 9.499539 0.509868790 0 integration
# how many variants in each window?
table(assoc$results$n.site)
##
## 0 1 2
## 95 1007 20
# variant details for windows with > 1 variant
<- which(assoc$results$n.site > 1)
idx head(assoc$variantInfo[idx])
## [[1]]
## variant.id chr pos allele.index n.obs freq MAC weight
## 1 147 1 27443183 1 1126 0.0013321492 3 24.21284
## 2 148 1 27448645 1 1126 0.0004440497 1 24.73493
##
## [[2]]
## variant.id chr pos allele.index n.obs freq MAC weight
## 1 207 1 41346174 1 1126 0.0004440497 1 24.73493
## 2 208 1 41352776 1 1126 0.0017761989 4 23.95577
##
## [[3]]
## variant.id chr pos allele.index n.obs freq MAC weight
## 1 246 1 51376035 1 1126 0.0004440497 1 24.73493
## 2 247 1 51381953 1 1126 0.0013321492 3 24.21284
##
## [[4]]
## variant.id chr pos allele.index n.obs freq MAC weight
## 1 279 1 57342828 1 1126 0.0062166963 14 21.52488
## 2 280 1 57344100 1 1126 0.0004440497 1 24.73493
##
## [[5]]
## variant.id chr pos allele.index n.obs freq MAC weight
## 1 293 1 59912193 1 1126 0.001776199 4 23.95577
## 2 294 1 59917976 1 1126 0.003108348 7 23.20016
##
## [[6]]
## variant.id chr pos allele.index n.obs freq MAC weight
## 1 356 1 73050313 1 1126 0.0368561279 83 10.15145
## 2 357 1 73050352 1 1126 0.0008880995 2 24.47255
# make a QQ plot of the SKAT test p-values
qqPlot(assoc$results$pval)
seqClose(seqData)