8 PC-Relate

To disentangle ancestry from recent familial relatedness, we use the PC-Relate method.

8.1 KING

Step 1 is to get initial estimates of kinship using KING, which is robust to population structure but not admixture. The KING algorithm is available in SNPRelate. We select a subset of variants for this calculation with LD pruning.

# use a GDS file with all chromosomes
library(SeqArray)
gdsfile <- "data/1KG_phase3_subset.gds"
if (!file.exists(gdsfile)) download.file(file.path(repo_path, gdsfile), gdsfile)
gdsfmt::showfile.gds(closeall=TRUE) # make sure file is not already open
gds <- seqOpen(gdsfile)

# use a subset of 100 samples to make things run faster
sampfile <- "data/samples_subset100.RData"
if (!file.exists(sampfile)) download.file(file.path(repo_path, sampfile), sampfile)
sample.id <- TopmedPipeline::getobj(sampfile)

# LD pruning to get variant set
library(SNPRelate)
set.seed(100) # LD pruning has a random element; so make this reproducible
snpset <- snpgdsLDpruning(gds, sample.id=sample.id, method="corr", 
                          slide.max.bp=10e6, ld.threshold=sqrt(0.1))

## SNV pruning based on LD:
## Excluding 1,120 SNVs on non-autosomes
## Calculating allele counts/frequencies ...
## 
[..................................................]  0%, ETC: ---    
[==================================================] 100%, completed, 0s
## Excluding 13,673 SNVs (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## Working space: 100 samples, 10,967 SNVs
##     using 1 (CPU) core
##     sliding window: 10,000,000 basepairs, Inf SNPs
##     |LD| threshold: 0.316228
##     method: correlation
## Chromosome 1: 31.34%, 351/1,120
## Chromosome 2: 31.43%, 352/1,120
## Chromosome 3: 30.98%, 347/1,120
## Chromosome 4: 31.16%, 349/1,120
## Chromosome 5: 29.64%, 332/1,120
## Chromosome 6: 31.43%, 352/1,120
## Chromosome 7: 28.66%, 321/1,120
## Chromosome 8: 25.62%, 287/1,120
## Chromosome 9: 27.32%, 306/1,120
## Chromosome 10: 28.57%, 320/1,120
## Chromosome 11: 26.79%, 300/1,120
## Chromosome 12: 28.66%, 321/1,120
## Chromosome 13: 25.54%, 286/1,120
## Chromosome 14: 24.29%, 272/1,120
## Chromosome 15: 22.59%, 253/1,120
## Chromosome 16: 21.88%, 245/1,120
## Chromosome 17: 21.79%, 244/1,120
## Chromosome 18: 23.57%, 264/1,120
## Chromosome 19: 21.25%, 238/1,120
## Chromosome 20: 20.00%, 224/1,120
## Chromosome 21: 17.50%, 196/1,120
## Chromosome 22: 17.50%, 196/1,120
## 6,356 markers are selected in total.

sapply(snpset, length)

##  chr1  chr2  chr3  chr4  chr5  chr6  chr7  chr8  chr9 chr10 chr11 chr12 chr13 
##   351   352   347   349   332   352   321   287   306   320   300   321   286 
## chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 
##   272   253   245   244   264   238   224   196   196

pruned <- unlist(snpset, use.names=FALSE)

# KING
king <- snpgdsIBDKING(gds, sample.id=sample.id, snp.id=pruned)

## IBD analysis (KING method of moment) on genotypes:
## Calculating allele counts/frequencies ...
## 
[..................................................]  0%, ETC: ---    
[==================================================] 100%, completed, 0s
## Working space: 100 samples, 6,356 SNVs
##     using 1 (CPU) core
## No family is specified, and all individuals are treated as singletons.
## Relationship inference in the presence of population stratification.
## CPU capabilities: Double-Precision SSE2
## Mon Jul 27 15:04:47 2020    (internal increment: 65536)
## 
[..................................................]  0%, ETC: ---        
[==================================================] 100%, completed, 0s
## Mon Jul 27 15:04:47 2020    Done.

names(king)

## [1] "sample.id" "snp.id"    "afreq"     "IBS0"      "kinship"

dim(king$kinship)

## [1] 100 100

kingMat <- king$kinship
colnames(kingMat) <- rownames(kingMat) <- king$sample.id

We extract pairwise kinship estimates and IBS0 to plot.

kinship <- snpgdsIBDSelection(king)
head(kinship)

##       ID1     ID2       IBS0     kinship
## 1 HG00110 HG00116 0.02564506 -0.01271186
## 2 HG00110 HG00120 0.02721838 -0.02814770
## 3 HG00110 HG00128 0.02454374 -0.01150121
## 4 HG00110 HG00136 0.02926369 -0.04388620
## 5 HG00110 HG00137 0.02737571 -0.03510896
## 6 HG00110 HG00141 0.02769037 -0.03843826

library(ggplot2)
ggplot(kinship, aes(IBS0, kinship)) +
    geom_hline(yintercept=2^(-seq(3,9,2)/2), linetype="dashed", color="grey") +
    geom_point(alpha=0.5) +
    ylab("kinship estimate") +
    theme_bw()

8.2 PC-AiR

The next step is PC-AiR, in which we select a set of unrelated samples that is maximally informative about all ancestries in the sample. We use this unrelated set for Principal Component Analysis (PCA), then project the relatives onto the PCs.

First, we partition the samples into a related and unrelated set. We use a kinship threshold of degree 3, which corresponds to first cousins. This defines anyone less related than first cousins as “unrelated”. We load the GENESIS package. In the first iteration, we use the KING estimates for both kinship (kinMat) and ancestry divergence (divMat). KING kinship estimates are negative for samples with different ancestry.

library(GENESIS)
sampset <- pcairPartition(kinobj=kingMat, kin.thresh=2^(-9/2),
                          divobj=kingMat, div.thresh=-2^(-9/2))
names(sampset)

## [1] "rels"   "unrels"

sapply(sampset, length)

##   rels unrels 
##     15     85

Using the SNPRelate package, we run PCA on the unrelated set and project values for the related set. We use the same LD pruned set of variants again.

# run PCA on unrelated set
pca.unrel <- snpgdsPCA(gds, sample.id=sampset$unrels, snp.id=pruned)

## Principal Component Analysis (PCA) on genotypes:
## Calculating allele counts/frequencies ...
## 
[..................................................]  0%, ETC: ---    
[==================================================] 100%, completed, 0s
## Excluding 223 SNVs (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## Working space: 85 samples, 6,133 SNVs
##     using 1 (CPU) core
## CPU capabilities: Double-Precision SSE2
## Mon Jul 27 15:04:48 2020    (internal increment: 8816)
## 
[..................................................]  0%, ETC: ---        
[==================================================] 100%, completed, 0s
## Mon Jul 27 15:04:48 2020    Begin (eigenvalues and eigenvectors)
## Mon Jul 27 15:04:48 2020    Done.

# project values for relatives
snp.load <- snpgdsPCASNPLoading(pca.unrel, gdsobj=gds)

## SNP loading:
## Working space: 85 samples, 6133 SNPs
##     using 1 (CPU) core
##     using the top 32 eigenvectors
## Mon Jul 27 15:04:48 2020    (internal increment: 65536)
## 
[..................................................]  0%, ETC: ---        
[==================================================] 100%, completed, 0s
## Mon Jul 27 15:04:48 2020    Done.

samp.load <- snpgdsPCASampLoading(snp.load, gdsobj=gds, sample.id=sampset$rels)

## Sample loading:
## Working space: 15 samples, 6133 SNPs
##     using 1 (CPU) core
##     using the top 32 eigenvectors
## Mon Jul 27 15:04:48 2020    (internal increment: 65536)
## 
[..................................................]  0%, ETC: ---        
[==================================================] 100%, completed, 1s
## Mon Jul 27 15:04:49 2020    Done.

# combine unrelated and related PCs and order as in GDS file
pcs <- rbind(pca.unrel$eigenvect, samp.load$eigenvect)
rownames(pcs) <- c(pca.unrel$sample.id, samp.load$sample.id)
samp.ord <- match(sample.id, rownames(pcs))
pcs <- pcs[samp.ord,]

We need to determine which PCs are ancestry informative. To do this we need population information for the 1000 Genomes samples. This information is stored in an AnnotatedDataFrame, which is a data.frame with optional metadata describing the columns. The class is defined in the Biobase package. We load the stored object using the getobj function from the TopmedPipeline package.

library(Biobase)
sampfile <- "data/sample_annotation.RData"
if (!file.exists(sampfile)) download.file(file.path(repo_path, sampfile), sampfile)
annot <- TopmedPipeline::getobj(sampfile)
annot

## An object of class 'AnnotatedDataFrame'
##   rowNames: 1 2 ... 2504 (1126 total)
##   varLabels: sample.id subject.id ... status (6 total)
##   varMetadata: labelDescription

head(pData(annot))

##   sample.id subject.id Population          Population.Description sex status
## 1   HG00096    HG00096        GBR British in England and Scotland   M      0
## 2   HG00097    HG00097        GBR British in England and Scotland   F      1
## 3   HG00099    HG00099        GBR British in England and Scotland   F      0
## 4   HG00100    HG00100        GBR British in England and Scotland   F      1
## 5   HG00101    HG00101        GBR British in England and Scotland   M      0
## 6   HG00102    HG00102        GBR British in England and Scotland   F      0

varMetadata(annot)

##                                     labelDescription
## sample.id                          sample identifier
## subject.id                        subject identifier
## Population                   population abbreviation
## Population.Description        population description
## sex                                              sex
## status                 simulated case/control status

We make a parallel coordinates plot, color-coding by 1000 Genomes population. We load the dplyr package for data.frame manipulation.

pc.df <- as.data.frame(pcs)
names(pc.df) <- 1:ncol(pcs)
pc.df$sample.id <- row.names(pcs)

library(dplyr)
annot <- pData(annot) %>%
        dplyr::select(sample.id, Population)
pc.df <- left_join(pc.df, annot, by="sample.id")

library(GGally)
library(RColorBrewer)
pop.cols <- setNames(brewer.pal(12, "Paired"),
                 c("ACB", "ASW", "CEU", "GBR", "CHB", "JPT", "CLM", "MXL", "LWK", "YRI", "GIH", "PUR"))
ggparcoord(pc.df, columns=1:12, groupColumn="Population", scale="uniminmax") +
    scale_color_manual(values=pop.cols) +
    xlab("PC") + ylab("")

8.3 PC-Relate

The first 2 PCs separate populations, so we use them to compute kinship estimates adjusting for ancestry. The PC-Relate function expects a SeqVarIterator object.

seqResetFilter(gds, verbose=FALSE)
library(SeqVarTools)
seqData <- SeqVarData(gds)
seqSetFilter(seqData, variant.id=pruned)

## # of selected variants: 6,356

iterator <- SeqVarBlockIterator(seqData, verbose=FALSE)

pcrel <- pcrelate(iterator, pcs=pcs[,1:2], training.set=sampset$unrels, 
                  sample.include=sample.id)
names(pcrel)

## [1] "kinBtwn" "kinSelf"

PC-Relate is an iterative method. Now that we have ancestry-adjusted kinship estimates, we can use them to better adjust for ancestry in the PCs. This time we use the pcair function, which combines partitioning the sample set and running PCA in one step. First we need to make a kinship matrix from the PC-Relate results. The KING matrix is still used for ancestry divergence.

pcrelMat <- pcrelateToMatrix(pcrel, scaleKin=1, verbose=FALSE)

seqResetFilter(seqData, verbose=FALSE)
pca <- pcair(seqData,
             kinobj=pcrelMat, kin.thresh=2^(-9/2),
             divobj=kingMat, div.thresh=-2^(-9/2),
             sample.include=sample.id, snp.include=pruned,
             verbose=FALSE)
names(pca)

##  [1] "vectors"    "values"     "rels"       "unrels"     "kin.thresh"
##  [6] "div.thresh" "sample.id"  "nsamp"      "nsnps"      "varprop"   
## [11] "call"       "method"

pcs <- pca$vectors
pc.df <- as.data.frame(pcs)
names(pc.df) <- paste0("PC", 1:ncol(pcs))
pc.df$sample.id <- row.names(pcs)
pc.df <- left_join(pc.df, annot, by="sample.id")

ggplot(pc.df, aes(PC1, PC2, color=Population)) + geom_point() +
    scale_color_manual(values=pop.cols)

Now we use the revised PCs to compute new kinship estimates. One can run the iteration multiple times and check for conversion, but usually two rounds are sufficient.

seqSetFilter(seqData, variant.id=pruned)

## # of selected variants: 6,356

iterator <- SeqVarBlockIterator(seqData, verbose=FALSE)
pcrel <- pcrelate(iterator, pcs=pcs[,1:2], training.set=pca$unrels, 
                  sample.include=sample.id)
save(pcrel, file="data/pcrelate_kinship.RData")

We plot the kinship estimates from PC-Relate, and notice that the values for less related pairs are much better behaved.

kinship <- pcrel$kinBtwn

ggplot(kinship, aes(k0, kin)) +
    geom_hline(yintercept=2^(-seq(3,9,2)/2), linetype="dashed", color="grey") +
    geom_point(alpha=0.5) +
    ylab("kinship estimate") +
    theme_bw()

seqClose(gds)

8.4 Comparison with pedigree

We can detect pedigree errors and sample identity problems by comparing the pedigree with empirical kinship estimates. We use a function from the GWASTools package, pedigreePairwiseRelatedness, to get expected pairwise relationships based on the pedigree.

pedfile <- "data/pedigree.RData"
if (!file.exists(pedfile)) download.file(file.path(repo_path, pedfile), pedfile)
ped <- TopmedPipeline::getobj(pedfile)
head(ped)

##   family individ  father  mother sex
## 1   BB01 HG01879       0       0   M
## 2   BB01 HG01880       0       0   F
## 3   BB01 HG01881 HG01879 HG01880   F
## 4   BB02 HG01882       0       0   M
## 5   BB02 HG01883       0       0   F
## 6   BB02 HG01888 HG01882 HG01883   M

pw <- GWASTools::pedigreePairwiseRelatedness(ped)
names(pw)

## [1] "inbred.fam"  "inbred.KC"   "relativeprs"

rel <- pw$relativeprs
head(rel)

##   Individ1 Individ2 relation kinship family
## 1  HG01879  HG01880        U    0.00   BB01
## 2  HG01879  HG01881       PO    0.25   BB01
## 3  HG01880  HG01881       PO    0.25   BB01
## 4  HG01882  HG01883        U    0.00   BB02
## 5  HG01882  HG01888       PO    0.25   BB02
## 6  HG01883  HG01888       PO    0.25   BB02

table(rel$relation)

## 
##   Av   FS GpGc  HAv   HS   PO    U 
##    2    6   16    1    3  616  330

distinct(rel, relation, kinship) %>%
    arrange(-kinship)

##   relation kinship
## 1       PO  0.2500
## 2       FS  0.2500
## 3       HS  0.1250
## 4     GpGc  0.1250
## 5       Av  0.1250
## 6      HAv  0.0625
## 7        U  0.0000

## assign degrees to expected relationship pairs
rel <- rel %>%
    mutate(exp.rel=ifelse(kinship == 0.125, "Deg2", ifelse(kinship == 0.0625, "Deg3", relation)),
           pair=GWASTools::pasteSorted(Individ1, Individ2)) %>%
    select(pair, family, relation, exp.rel)

## assign degrees to observed relationship pairs
cut.dup <- 1/(2^(3/2))
cut.deg1 <- 1/(2^(5/2))
cut.deg2 <- 1/(2^(7/2))
cut.deg3 <- 1/(2^(9/2))
cut.k0 <- 0.1
kinship <- kinship %>%
    mutate(obs.rel=ifelse(kin > cut.dup, "Dup",
                   ifelse(kin > cut.deg1 & k0 < cut.k0, "PO",
                   ifelse(kin > cut.deg1, "FS",
                   ifelse(kin > cut.deg2, "Deg2",
                   ifelse(kin > cut.deg3, "Deg3", "U"))))))
table(kinship$obs.rel)

## 
## Deg2 Deg3   FS   PO    U 
##    4    6    2    7 4931

# merge observed and expected relationships
kin.obs <- kinship %>%
    select(ID1, ID2, kin, k0, obs.rel) %>%
    mutate(pair=GWASTools::pasteSorted(ID1, ID2)) %>%
    left_join(rel, by="pair") %>%
    select(-pair) %>%
    mutate(exp.rel=ifelse(is.na(exp.rel), "U", exp.rel)) %>%
    filter(!(exp.rel == "U" & obs.rel == "U"))
table(kin.obs$exp.rel, kin.obs$obs.rel)

##    
##     Deg2 Deg3 FS PO
##   U    4    6  2  7

ggplot(kin.obs, aes(k0, kin, color=obs.rel)) + geom_point()

All the observed relationships were unexpected. These samples are from 1000 Genomes sequencing, and known relatives were excluded from the released data. Here we have detected some cryptic relatives that were not annotated in the pedigree.

8.5 Exercise

Complete one round of iteration using all samples from the test dataset and plot the results. Be sure to examine the parallel coordinates plot to determine the appropriate number of PCs to give as an argument to pcrelate.