7 Mixed models
These exercises extend what was previously introduced in the association tests from regression models to mixed models that account for genetic relatedness among samples.
7.1 Null model
Recall that the first step in our association testing procedure is to fit the null model. In addition to the AnnotatedDataFrame
with phenotype data that we used previously, we will also use the ancestry PCs and pairwise kinship estimates we created in the previous session. We will use the first 4 PCs to adjust for ancestry.
# sample annotation
<- "https://github.com/UW-GAC/SISG_2021/raw/master"
repo_path if (!dir.exists("data")) dir.create("data")
<- "data/sample_phenotype_annotation.RData"
sampfile if (!file.exists(sampfile)) download.file(file.path(repo_path, sampfile), sampfile)
<- get(load(sampfile))
annot library(Biobase)
head(pData(annot))
## sample.id subject.id Population Population.Description sex status age height study
## 1 HG00096 HG00096 GBR British in England and Scotland M 0 47 165.300 study_1
## 2 HG00097 HG00097 GBR British in England and Scotland F 1 47 144.780 study_3
## 3 HG00099 HG00099 GBR British in England and Scotland F 0 40 185.500 study_2
## 4 HG00100 HG00100 GBR British in England and Scotland F 1 45 150.622 study_3
## 5 HG00101 HG00101 GBR British in England and Scotland M 0 40 177.800 study_3
## 6 HG00102 HG00102 GBR British in England and Scotland F 0 49 169.100 study_1
# load the ancestry PCs
<- "data/pcs.RData"
pcfile if (!file.exists(pcfile)) download.file(file.path(repo_path, pcfile), pcfile)
<- get(load(pcfile))
pcs <- pcs[,c("sample.id", "PC1", "PC2", "PC3", "PC4")]
pcs head(pcs)
## sample.id PC1 PC2 PC3 PC4
## HG00096 HG00096 -0.02189121 -0.03637024 -0.01328390 0.005533490
## HG00097 HG00097 -0.02002924 -0.03285711 -0.01207786 0.005390183
## HG00099 HG00099 -0.02088688 -0.03330825 -0.01510526 0.004499776
## HG00100 HG00100 -0.02031270 -0.03975678 -0.01908062 0.004291604
## HG00101 HG00101 -0.02038482 -0.03437442 -0.01328870 0.002310498
## HG00102 HG00102 -0.02144725 -0.03970277 -0.01391896 -0.001245155
# merge PCs with the sample annotation
library(dplyr)
<- left_join(pData(annot), pcs, by="sample.id")
dat # update the AnnotatedDataFrame
pData(annot) <- dat
save(annot, file="data/sample_phenotype_pcs.RData")
We can create a kinship matrix from the output of pcrelate
. We multiply the kinship values by 2 to get values on the same scale as the standard GRM. This matrix is represented in R as a symmetric matrix object from the Matrix package.
<- "data/pcrelate_kinship.RData"
kinfile if (!file.exists(kinfile)) download.file(file.path(repo_path, kinfile), kinfile)
<- get(load(kinfile))
pcrel library(GENESIS)
<- pcrelateToMatrix(pcrel, scaleKin=2, verbose=FALSE)
kinship dim(kinship)
## [1] 1126 1126
1:5,1:5] kinship[
## 5 x 5 Matrix of class "dsyMatrix"
## HG00096 HG00097 HG00099 HG00100 HG00101
## HG00096 0.9906231740 0.036922756 0.010409075 0.0005780419 0.016522827
## HG00097 0.0369227560 1.001560563 -0.000653982 0.0035158297 0.007678616
## HG00099 0.0104090751 -0.000653982 0.937748649 0.0556213201 -0.003235074
## HG00100 0.0005780419 0.003515830 0.055621320 1.0111688406 0.017959707
## HG00101 0.0165228269 0.007678616 -0.003235074 0.0179597069 0.965028548
When running a mixed model analysis, we still fit the null model using the fitNullModel
function in GENESIS. Now, we include the kinship matrix in the model with the cov.mat
argument, which is used to specify the random effect(s) in the model with covariance structure(s) proportional to the supplied matrix(s). The inclusion of these random effects is what makes this a mixed model, rather than a simple regression model. We also add the ancestry PCs to the list of covariates and allow for heterogeneous residual variance by study
with the group.var
argument, as before.
<- fitNullModel(annot,
nullmod outcome="height",
covars=c("sex", "age", "study", paste0("PC", 1:4)),
cov.mat=kinship,
group.var="study",
verbose=FALSE)
save(nullmod, file="data/null_mixed_model.RData")
We can investigate the output from fitNullModel
.
# description of the model we fit
$model nullmod
## $hetResid
## [1] TRUE
##
## $family
##
## Family: gaussian
## Link function: identity
##
##
## $outcome
## [1] "height"
##
## $covars
## [1] "sex" "age" "study" "PC1" "PC2" "PC3" "PC4"
##
## $formula
## [1] "height ~ sex + age + study + PC1 + PC2 + PC3 + PC4 + (1|A) + var(study)"
# fixed effect regression estimates
$fixef nullmod
## Est SE Stat pval
## (Intercept) 163.74431361 3.15130038 2.699932e+03 0.000000e+00
## sexM 6.27072759 0.67911122 8.526171e+01 2.613754e-20
## age 0.07420444 0.06839188 1.177201e+00 2.779265e-01
## studystudy_2 10.54243352 0.81122848 1.688867e+02 1.295192e-38
## studystudy_3 -8.95546858 0.83932675 1.138451e+02 1.408933e-26
## PC1 0.05117574 11.25310400 2.068158e-05 9.963715e-01
## PC2 -61.24485338 11.22450534 2.977178e+01 4.860157e-08
## PC3 -8.81351399 11.09566802 6.309447e-01 4.270090e-01
## PC4 -5.61519972 11.19652394 2.515151e-01 6.160103e-01
# variance component estimates by group.var
$varComp nullmod
## V_A V_study_1 V_study_3 V_study_2
## 0.00000 93.61147 168.76021 152.37404
# model fit: fitted values, residuals
head(nullmod$fit)
## outcome workingY fitted.values resid.marginal resid.conditional linear.predictor resid.PY resid.cholesky sample.id
## HG00096 165.300 165.300 175.8150 -10.5150254 -10.5150254 175.8150 -0.112326247 -1.08679039 HG00096
## HG00097 144.780 144.780 160.3639 -15.5839388 -15.5839388 160.3639 -0.092343679 -1.19961587 HG00097
## HG00099 185.500 185.500 179.4017 6.0983220 6.0983220 179.4017 0.040022053 0.49403175 HG00099
## HG00100 150.622 150.622 160.7060 -10.0839725 -10.0839725 160.7060 -0.059753258 -0.77624108 HG00100
## HG00101 177.800 177.800 166.2361 11.5638906 11.5638906 166.2361 0.068522613 0.89016178 HG00101
## HG00102 169.100 169.100 169.9405 -0.8404905 -0.8404905 169.9405 -0.008978499 -0.08686969 HG00102
library(ggplot2)
ggplot(nullmod$fit, aes(x = fitted.values, y = resid.marginal)) +
geom_point(alpha = 0.5) +
geom_hline(yintercept = 0) +
geom_smooth(method = 'lm')
7.2 Single-variant tests
Now we can run a single-variant test, accounting for genetic ancestry and genetic relatedness among the subjects. We use the same assocTestSingle
function as before; the only difference is that we pass in our new null model.
library(SeqVarTools)
<- "data/1KG_phase3_subset_chr1.gds"
gdsfile if (!file.exists(gdsfile)) download.file(file.path(repo_path, gdsfile), gdsfile)
::showfile.gds(closeall=TRUE) # make sure file is not already open
gdsfmt<- seqOpen(gdsfile)
gds
# make the seqVarData object
<- SeqVarData(gds, sampleData=annot)
seqData
# make the iterator object
<- SeqVarBlockIterator(seqData, verbose=FALSE)
iterator
# run the single-variant association test
<- assocTestSingle(iterator, nullmod) assoc
## # of selected samples: 1,126
dim(assoc)
## [1] 1129 14
head(assoc)
## variant.id chr pos allele.index n.obs freq MAC Score Score.SE Score.Stat Score.pval Est Est.SE PVE
## 1 1 1 970546 1 1126 0.0039964476 9 -0.17384025 0.2592275 -0.6706088 0.502469798 -2.586950 3.857615 0.0004026103
## 2 2 1 985900 1 1126 0.0492895204 111 -1.55598468 0.8183596 -1.9013460 0.057256708 -2.323363 1.221957 0.0032364488
## 3 3 1 1025045 1 1126 0.0004440497 1 -0.29433530 0.1029724 -2.8583911 0.004257953 -27.758817 9.711343 0.0073145856
## 4 4 1 1265550 1 1126 0.0008880995 2 -0.09940909 0.1078116 -0.9220633 0.356495581 -8.552545 9.275442 0.0007611458
## 5 5 1 1472676 1 1126 0.0071047957 16 0.14878210 0.3468438 0.4289599 0.667952399 1.236752 2.883142 0.0001647327
## 6 6 1 1735725 1 1126 0.0022202487 5 -0.13567249 0.1982518 -0.6843443 0.493757749 -3.451895 5.044091 0.0004192719
We make the usual QQ plot, filtering to variants with minor allele count (MAC) >= 5.
library(ggplot2)
<- function(pval) {
qqPlot <- pval[!is.na(pval)]
pval <- length(pval)
n <- 1:n
x <- data.frame(obs=sort(pval),
dat exp=x/n,
upper=qbeta(0.025, x, rev(x)),
lower=qbeta(0.975, x, rev(x)))
ggplot(dat, aes(-log10(exp), -log10(obs))) +
geom_line(aes(-log10(exp), -log10(upper)), color="gray") +
geom_line(aes(-log10(exp), -log10(lower)), color="gray") +
geom_point() +
geom_abline(intercept=0, slope=1, color="red") +
xlab(expression(paste(-log[10], "(expected P)"))) +
ylab(expression(paste(-log[10], "(observed P)"))) +
theme_bw()
}
qqPlot(assoc$Score.pval[assoc$MAC >= 5])
Notice that we observe much less inflation than before, when we did not adjust for ancestry and relatedness.
7.3 Exercise
- Perform a single-variant association test for
status
. Adjust for sex, age, study, ancestry, and kinship in the model. Don’t forget to consider thefamily
parameter. Make a QQ plot of the p-values for all variants with MAC >= 5.
7.4 Solution
- Perform a single-variant association test for
status
. Adjust for sex, age, study, ancestry, and kinship in the model. Don’t forget to consider thefamily
parameter. Make a QQ plot of the p-values for all variants with MAC >= 5.
The first step is to fit the null model. We include the first 4 PCs as covariates in our model to adjust for ancestry, and we include a random effect proportional to the kinship matrix to adjust for genetic relatedness. Recall that with a binary outcome, we set family = binomial
– because we have a random effect, this will fit an approximate logistic mixed model using the GMMAT method.
<- fitNullModel(annot,
nullmod.status outcome="status",
covars=c("sex", "age", "study", paste0("PC", 1:4)),
cov.mat=kinship,
family = binomial,
verbose=FALSE)
# description of the model we fit
$model nullmod.status
## $hetResid
## [1] FALSE
##
## $family
##
## Family: binomial
## Link function: logit
##
##
## $outcome
## [1] "status"
##
## $covars
## [1] "sex" "age" "study" "PC1" "PC2" "PC3" "PC4"
##
## $formula
## [1] "status ~ sex + age + study + PC1 + PC2 + PC3 + PC4 + (1|A)"
# fixed effect regression estimates
$fixef nullmod.status
## Est SE Stat pval
## (Intercept) -3.182182913 0.96998026 1.076278e+01 0.001035617
## sexM 0.124463598 0.20598679 3.650951e-01 0.545690084
## age 0.019985093 0.02075004 9.276289e-01 0.335479807
## studystudy_2 0.003063956 0.24852285 1.519961e-04 0.990163394
## studystudy_3 -0.145204949 0.25448616 3.255626e-01 0.568283747
## PC1 -3.074068322 3.46894981 7.852917e-01 0.375527625
## PC2 5.076675586 3.21042684 2.500537e+00 0.113807493
## PC3 -0.236916365 3.45512128 4.701793e-03 0.945332199
## PC4 1.946715491 3.41350477 3.252400e-01 0.568475488
# variance component estimates by group.var
$varComp nullmod.status
## V_A
## 0.2923299
Now that we have the null model, we perform the single-variant association tests and make the QQ plot the same way as before.
resetIterator(iterator, verbose=FALSE)
# run the single-variant association test
<- assocTestSingle(iterator, nullmod.status, test="Score") assoc.status
## # of selected samples: 1,126
dim(assoc.status)
## [1] 1129 14
head(assoc.status)
## variant.id chr pos allele.index n.obs freq MAC Score Score.SE Score.Stat Score.pval Est Est.SE PVE
## 1 1 1 970546 1 1126 0.0039964476 9 0.1975748 0.8349756 0.2366235 0.81294892 0.2833897 1.1976398 6.097484e-05
## 2 2 1 985900 1 1126 0.0492895204 111 -2.5741449 2.6188824 -0.9829173 0.32564813 -0.3753194 0.3818423 1.052129e-03
## 3 3 1 1025045 1 1126 0.0004440497 1 -0.0721219 0.2555958 -0.2821717 0.77781189 -1.1039760 3.9124270 8.670854e-05
## 4 4 1 1265550 1 1126 0.0008880995 2 0.7862901 0.4237118 1.8557191 0.06349361 4.3796725 2.3600946 3.750244e-03
## 5 5 1 1472676 1 1126 0.0071047957 16 0.6300877 1.0837470 0.5813974 0.56097266 0.5364696 0.9227246 3.681131e-04
## 6 6 1 1735725 1 1126 0.0022202487 5 -0.4029747 0.5941659 -0.6782191 0.49763280 -1.1414641 1.6830315 5.009278e-04
qqPlot(assoc.status$Score.pval[assoc.status$MAC >= 5])
seqClose(seqData)