9 Mixed models

These exercises introduce relatedness to association testing with mixed models.

9.1 Null model

The first step in an association test is to fit the null model. In addition to the AnnotatedDataFrame with phenotypes we used previously, we will need the principal components and kinship. We will use the first five PCs to adjust for ancestry.

# sample annotation
repo_path <- "https://github.com/UW-GAC/SISG_2020/raw/master"
if (!dir.exists("data")) dir.create("data")
sampfile <- "data/sample_phenotype_annotation.RData"
if (!file.exists(sampfile)) download.file(file.path(repo_path, sampfile), sampfile)
annot <- TopmedPipeline::getobj(sampfile)
library(Biobase)
head(pData(annot))

##   sample.id subject.id Population          Population.Description sex status
## 1   HG00096    HG00096        GBR British in England and Scotland   M      0
## 2   HG00097    HG00097        GBR British in England and Scotland   F      1
## 3   HG00099    HG00099        GBR British in England and Scotland   F      0
## 4   HG00100    HG00100        GBR British in England and Scotland   F      1
## 5   HG00101    HG00101        GBR British in England and Scotland   M      0
## 6   HG00102    HG00102        GBR British in England and Scotland   F      0
##   age  height   study
## 1  47 165.300 study_1
## 2  47 144.780 study_3
## 3  40 185.500 study_2
## 4  45 150.622 study_3
## 5  40 177.800 study_3
## 6  49 169.100 study_1

# load the PCs
pcfile <- "data/pcs.RData"
if (!file.exists(pcfile)) download.file(file.path(repo_path, pcfile), pcfile)
pcs <- TopmedPipeline::getobj(pcfile)
pcs <- pcs[,c("sample.id", "PC1", "PC2", "PC3", "PC4", "PC5")]
head(pcs)

##   sample.id         PC1         PC2          PC3          PC4         PC5
## 1   HG00096 -0.02098435 -0.03716014 -0.007539234 -0.004984352 -0.03920777
## 2   HG00097 -0.01929295 -0.03289496 -0.009176117 -0.005328914 -0.03297778
## 3   HG00099 -0.02042444 -0.03371227 -0.010983795 -0.004856350 -0.03208595
## 4   HG00100 -0.01970348 -0.03978044 -0.013302258 -0.004340841 -0.04208343
## 5   HG00101 -0.01959563 -0.03431033 -0.008571074 -0.002220712 -0.03260015
## 6   HG00102 -0.02041573 -0.03941142 -0.010696762  0.001506639 -0.02913023

# add PCs to the sample annotation
dat <- left_join(pData(annot), pcs, by="sample.id")
pData(annot) <- dat
save(annot, file="data/sample_phenotype_pcs.RData")

We create a kinship matrix from the output of pcrelate. We multiply the kinship values by 2 to get values equivalent to a GRM. This matrix is represented in R as a symmetric matrix object from the Matrix package.

kinfile <- "data/pcrelate_kinship.RData"
if (!file.exists(kinfile)) download.file(file.path(repo_path, kinfile), kinfile)
pcrel <- TopmedPipeline::getobj(kinfile)
library(GENESIS)
kinship <- pcrelateToMatrix(pcrel, scaleKin=2, verbose=FALSE)
dim(kinship)

## [1] 100 100

kinship[1:5,1:5]

## 5 x 5 Matrix of class "dsyMatrix"
##              HG00110     HG00116     HG00120      HG00128      HG00136
## HG00110  1.046618916  0.01896886 -0.01635266 -0.006769183 -0.022907033
## HG00116  0.018968859  0.92512545  0.19178400  0.013529261 -0.018774460
## HG00120 -0.016352656  0.19178400  0.96012109 -0.014849711 -0.046482113
## HG00128 -0.006769183  0.01352926 -0.01484971  0.914741469 -0.003540016
## HG00136 -0.022907033 -0.01877446 -0.04648211 -0.003540016  1.007541773

We fit the null model, adding the PCs to the list of covariates, and specifying the kinship as the covariance matrix with the cov.mat argument. As before, we use study as a grouping variable.

nullmod <- fitNullModel(annot, outcome="height", 
                        covars=c("sex", "age", "study", paste0("PC", 1:5)), 
                        cov.mat=kinship, group.var="study", verbose=FALSE)
save(nullmod, file="data/null_mixed_model.RData")

9.2 Single-variant tests

Now we can run a single-variant test, accounting for relatedness between the subjects.

library(SeqVarTools)
gdsfile <- "data/1KG_phase3_subset_chr1.gds"
if (!file.exists(gdsfile)) download.file(file.path(repo_path, gdsfile), gdsfile)
gdsfmt::showfile.gds(closeall=TRUE) # make sure file is not already open
gds <- seqOpen(gdsfile)
seqData <- SeqVarData(gds, sampleData=annot)
iterator <- SeqVarBlockIterator(seqData, verbose=FALSE)
assoc <- assocTestSingle(iterator, nullmod)

## # of selected samples: 100

head(assoc)

##    variant.id chr     pos allele.index n.obs  freq MAC       Score   Score.SE
## 1           1   1  970546            1   100 0.015   3 -0.08539608 0.14196049
## 2           2   1  985900            1   100 0.045   9 -0.22241847 0.23151042
## 5           5   1 1472676            1   100 0.005   1 -0.08038066 0.08682407
## 7           7   1 2185887            1   100 0.005   1  0.02490820 0.07345671
## 9           9   1 2629401            1   100 0.025   5  0.16640336 0.18372179
## 10         10   1 2710895            1   100 0.060  12  0.09145582 0.27222044
##    Score.Stat Score.pval        Est    Est.SE         PVE
## 1  -0.6015483  0.5474749  -4.237435  7.044214 0.004020670
## 2  -0.9607277  0.3366891  -4.149825  4.319460 0.010255530
## 5  -0.9257877  0.3545563 -10.662800 11.517543 0.009523144
## 7   0.3390868  0.7345443   4.616145 13.613460 0.001277554
## 9   0.9057356  0.3650758   4.929930  5.443013 0.009115077
## 10  0.3359624  0.7368992   1.234156  3.673493 0.001254119

qqPlot(assoc$Score.pval)

9.3 Exercise

Run a sliding window test using the mixed model and make a QQ plot.