7 Association tests

Since TOPMed has many studies with related participants, we focus on linear mixed models. Logistic mixed models are also possible using GENESIS, see the GMMAT paper.

7.1 Null model

The first step in an association test is to fit the null model. We will need an AnnotatedDataFrame with phenotypes, and a GRM. We have a sample annotation with a sample.id column matched to the GDS file, and a phenotype file with subject_id. (In this example, we use the 1000 Genomes IDs for both sample and subject ID.) For TOPMed data, it is also important to match by study, as subject IDs are not unique across studies.

# sample annotation
workshop.path <- "https://github.com/UW-GAC/topmed_workshop_2017/raw/master"
sampfile <- "sample_annotation.RData"
if (!file.exists(sampfile)) download.file(file.path(workshop.path, sampfile), sampfile)
annot <- TopmedPipeline::getobj(sampfile)
library(Biobase)
head(pData(annot))

##   sample.id subject.id Population          Population.Description sex
## 1   HG00096    HG00096        GBR British in England and Scotland   M
## 2   HG00097    HG00097        GBR British in England and Scotland   F
## 3   HG00099    HG00099        GBR British in England and Scotland   F
## 4   HG00100    HG00100        GBR British in England and Scotland   F
## 5   HG00101    HG00101        GBR British in England and Scotland   M
## 6   HG00102    HG00102        GBR British in England and Scotland   F
##   status
## 1      0
## 2      1
## 3      0
## 4      1
## 5      0
## 6      0

# phenotypes by subject ID
phenfile <- "phenotype_annotation.RData"
if (!file.exists(phenfile)) download.file(file.path(workshop.path, phenfile), phenfile)
phen <- TopmedPipeline::getobj(phenfile)
head(pData(phen))

##   subject_id sex age height   study
## 1    HG00096   M  47  165.3 study_1
## 2    HG00102   F  49  169.1 study_1
## 3    HG00112   M  46  167.9 study_1
## 4    HG00114   M  49  169.5 study_1
## 5    HG00115   M  35  161.1 study_1
## 6    HG00116   M  37  182.2 study_1

varMetadata(phen)

##                        labelDescription
## subject_id           subject identifier
## sex                       subject's sex
## age        age at measurement of height
## height           subject's height in cm
## study                  study identifier

# merge sample annotation with phenotypes
library(dplyr)
dat <- pData(annot) %>%
    left_join(pData(phen), by=c("subject.id"="subject_id", "sex"="sex"))
meta <- bind_rows(varMetadata(annot), varMetadata(phen)[3:5,,drop=FALSE])
annot <- AnnotatedDataFrame(dat, meta)

# load the GRM
data.path <- "https://github.com/UW-GAC/analysis_pipeline/raw/master/testdata"
grmfile <- "grm.RData"
if (!file.exists(grmfile)) download.file(file.path(data.path, grmfile), grmfile)
grm <- TopmedPipeline::getobj(grmfile)
# the row and column names of the covariance matrix must be set to sample.id
rownames(grm$grm) <- colnames(grm$grm) <- grm$sample.id

We will test for an association between genotype and height, adjusting for sex, age, and study as covariates. If the sample set involves multiple distinct groups with different variances for the phenotype, we recommend allowing the model to use heterogeneous variance among groups with the parameter group.var. We saw in a previous exercise that the variance differs by study.

library(GENESIS)
nullmod <- fitNullMM(annot, outcome="height", covars=c("sex", "age", "study"), 
                     covMatList=grm$grm, group.var="study", verbose=FALSE)

We also recommend taking an inverse normal transform of the residuals and refitting the model. This is done separately for each group, and the transformed residuals are rescaled. See the full procedure in the
pipeline documenation.

7.2 Single-variant tests

Single-variant tests are the same as in GWAS. We use the assocTestMM function in GENESIS. We have to create a SeqVarData object including both the GDS file and the sample annotation containing phenotypes.

library(SeqVarTools)
gdsfile <- "1KG_phase3_subset_chr1.gds"
if (!file.exists(gdsfile)) download.file(file.path(data.path, gdsfile), gdsfile)
gds <- seqOpen(gdsfile)
seqData <- SeqVarData(gds, sampleData=annot)
assoc <- assocTestMM(seqData, nullmod)
head(assoc)

##   snpID chr    n          MAF minor.allele        Est        SE Wald.Stat
## 1     1   1 1126 0.0039964476          alt  -1.541280  3.733069 0.1704634
## 2     2   1 1126 0.0492895204          alt  -2.252667  1.347029 2.7966615
## 3     3   1 1126 0.0004440497          alt -25.572126 11.039358 5.3659428
## 4     4   1 1126 0.0008880995          alt -10.378157 10.233569 1.0284573
## 5     5   1 1126 0.0071047957          alt   1.754369  2.635165 0.4432267
## 6     6   1 1126 0.0022202487          alt  -3.400914  5.669262 0.3598639
##    Wald.pval
## 1 0.67970027
## 2 0.09446081
## 3 0.02053369
## 4 0.31052127
## 5 0.50556915
## 6 0.54858185

We make a QQ plot to examine the results.

library(ggplot2)
qqPlot <- function(pval) {
    pval <- pval[!is.na(pval)]
    n <- length(pval)
    x <- 1:n
    dat <- data.frame(obs=sort(pval),
                      exp=x/n,
                      upper=qbeta(0.025, x, rev(x)),
                      lower=qbeta(0.975, x, rev(x)))
    
    ggplot(dat, aes(-log10(exp), -log10(obs))) +
        geom_line(aes(-log10(exp), -log10(upper)), color="gray") +
        geom_line(aes(-log10(exp), -log10(lower)), color="gray") +
        geom_point() +
        geom_abline(intercept=0, slope=1, color="red") +
        xlab(expression(paste(-log[10], "(expected P)"))) +
        ylab(expression(paste(-log[10], "(observed P)"))) +
        theme_bw()
}    

qqPlot(assoc$Wald.pval)

7.3 Exercises

Logistic regression: fitNullMM can use a binary phenotype as the outcome variable by specifying the argument family=binomial. Use the status column in the sample annotation to fit a null model for simulated case/control status, with sex and Population as covariates. Refer to the documentation for fitNulMM to see what other parameters need to be changed for a binary outcome. Then run a single-variant test using this model.

nullmod.status <- fitNullMM(annot, outcome="status", covars=c("sex", "Population"), 
                            covMatList=grm$grm, family=binomial, verbose=FALSE)
assoc <- assocTestMM(seqData, nullmod.status, test="Score")
head(assoc)

##   snpID chr    n          MAF minor.allele       Score        Var
## 1     1   1 1126 0.0039964476          alt  0.20297260 0.69451766
## 2     2   1 1126 0.0492895204          alt -2.63528220 6.97323978
## 3     3   1 1126 0.0004440497          alt -0.09788686 0.08671779
## 4     4   1 1126 0.0008880995          alt  0.81055791 0.16638869
## 5     5   1 1126 0.0071047957          alt  0.63907685 1.16113127
## 6     6   1 1126 0.0022202487          alt -0.45986601 0.40357460
##   Score.Stat Score.pval
## 1 0.05931869 0.80757604
## 2 0.99590900 0.31830244
## 3 0.11049448 0.73958188
## 4 3.94861043 0.04691009
## 5 0.35174250 0.55312840
## 6 0.52400908 0.46913652

Inverse normal transform: use the TopmedPipeline function addInvNorm to perform an inverse normal transform on the height variable. (You will need to load the TopmedPipeline library.) Inspect the code for this function by typing addInvNorm at the R prompt, so you understand what it is doing. Then for each study separately, compute a null model and do the inverse normal transform using just the values for that study. Compare these residuals with the initial residuals you obtained for that study by transforming all studies together.

library(TopmedPipeline)
annot.norm <- addInvNorm(annot, nullmod, outcome="height", covars=c("sex", "age", "study"))
head(pData(annot.norm))

##   sample.id subject.id Population          Population.Description sex
## 1   HG00096    HG00096        GBR British in England and Scotland   M
## 2   HG00097    HG00097        GBR British in England and Scotland   F
## 3   HG00099    HG00099        GBR British in England and Scotland   F
## 4   HG00100    HG00100        GBR British in England and Scotland   F
## 5   HG00101    HG00101        GBR British in England and Scotland   M
## 6   HG00102    HG00102        GBR British in England and Scotland   F
##   status age height   study resid.norm
## 1      0  47  165.3 study_1 -0.7141520
## 2      1  47  144.9 study_3 -1.0969882
## 3      0  40  185.5 study_2  0.6885301
## 4      1  45  150.5 study_3 -0.6661291
## 5      0  40  177.9 study_3  1.1300932
## 6      0  49  169.1 study_1  0.1485829

addInvNorm

## function (annot, nullmod, outcome, covars) 
## {
##     resid.str <- if (is(nullmod, "GENESIS.nullMixedModel")) 
##         "resid.marginal"
##     else "resid.response"
##     resid.norm <- rankNorm(nullmod[[resid.str]])
##     annot$resid.norm <- resid.norm[match(annot$sample.id, nullmod$scanID)]
##     if (is(nullmod, "GENESIS.nullMixedModel")) {
##         message(paste0("resid.norm = rankNorm(resid.marginal(", 
##             outcome, " ~ ", paste(c(covars, "(1|kinship)"), collapse = " + "), 
##             "))"))
##         message("Model: resid.norm ~ (1|kinship)")
##     }
##     else {
##         message(paste0("resid.norm = rankNorm(resid.response(", 
##             outcome, " ~ ", paste(covars, collapse = " + "), 
##             "))"))
##     }
##     annot
## }
## <environment: namespace:TopmedPipeline>

studies <- sort(unique(annot$study))
resid.studies <- bind_rows(lapply(studies, function(x) {
    print(x)
    annot.study <- annot[annot$study == x,]
    nullmod.study <- fitNullMM(annot.study, outcome="height", covars=c("sex", "age"), 
                               covMatList=grm$grm, verbose=FALSE)
    addInvNorm(annot.study, nullmod.study, outcome="height", covars=c("sex", "age")) %>%
        pData() %>%
        select(resid.norm) %>%
        mutate(study=x, run="separate")
}))

## [1] "study_1"
## [1] "study_2"
## [1] "study_3"

head(resid.studies)

##   resid.norm   study      run
## 1 -0.9153651 study_1 separate
## 2  0.1475896 study_1 separate
## 3 -0.6107980 study_1 separate
## 4 -0.4491363 study_1 separate
## 5 -1.2372346 study_1 separate
## 6  0.9052493 study_1 separate

library(tidyr)
dat <- pData(annot.norm) %>%
    select(resid.norm, study) %>% 
    mutate(run="combined") %>%
    bind_rows(resid.studies)
ggplot(dat, aes(study, resid.norm, fill=run)) + geom_boxplot()

dat %>% 
    group_by(study, run) %>%
    summarise(mean=mean(resid.norm), var=var(resid.norm))

## # A tibble: 6 x 4
## # Groups:   study [?]
##     study      run          mean       var
##     <chr>    <chr>         <dbl>     <dbl>
## 1 study_1 combined -6.479525e-04 0.7001043
## 2 study_1 separate -2.984253e-07 0.9992233
## 3 study_2 combined  2.711496e-03 1.1028754
## 4 study_2 separate  1.250456e-17 0.9992297
## 5 study_3 combined -2.083547e-03 1.2015368
## 6 study_3 separate  2.038445e-07 0.9992228

7.4 Sliding window tests

For rare variants, we can do burden tests or SKAT on sliding windows using the GENESIS function assocTestSeqWindow. We restrict the test to variants with alternate allele frequency < 0.1. (For real data, this threshold would be lower.) We use a flat weighting scheme.

assoc <- assocTestSeqWindow(seqData, nullmod, test="Burden", AF.range=c(0,0.1),
                            weight.beta=c(1,1), window.size=5, window.shift=2)
names(assoc)

## [1] "param"       "window"      "nsample"     "results"     "variantInfo"

head(assoc$results)

##   chr window.start window.stop n.site dup burden.skew      Score
## 1   1       966001      971000      1   0   11.036036 -0.1106805
## 2   1       968001      973000      1   1   11.036036 -0.1106805
## 3   1       970001      975000      1   1   11.036036 -0.1106805
## 4   1       982001      987000      1   0    3.041979 -1.2395028
## 5   1       984001      989000      1   1    3.041979 -1.2395028
## 6   1      1022001     1027000      1   0   33.466573 -0.2090215
##           Var Score.stat Score.pval
## 1 0.071810741  0.1705897 0.67958828
## 2 0.071810741  0.1705897 0.67958828
## 3 0.071810741  0.1705897 0.67958828
## 4 0.550238011  2.7921864 0.09472490
## 5 0.550238011  2.7921864 0.09472490
## 6 0.008173804  5.3451252 0.02078029

head(assoc$variantInfo)

##   variantID allele chr     pos n.obs         freq weight
## 1         1      1   1  970546  1126 0.0039964476      1
## 2         2      1   1  985900  1126 0.0492895204      1
## 3         3      1   1 1025045  1126 0.0004440497      1
## 4         4      1   1 1265550  1126 0.0008880995      1
## 5         5      1   1 1472676  1126 0.0071047957      1
## 6         6      1   1 1735725  1126 0.0022202487      1

qqPlot(assoc$results$Score.pval)

For SKAT, we use the Wu weights.

assoc <- assocTestSeqWindow(seqData, nullmod, test="SKAT", AF.range=c(0,0.1),
                            weight.beta=c(1,25), window.size=5, window.shift=2)
head(assoc$results)

##   chr window.start window.stop n.site dup       Q_0     pval_0 err_0
## 1   1       966001      971000      1   0  6.317493 0.67958828     0
## 2   1       968001      973000      1   1  6.317493 0.67958828     0
## 3   1       970001      975000      1   1  6.317493 0.67958828     0
## 4   1       982001      987000      1   0 84.857944 0.09472490     0
## 5   1       984001      989000      1   1 84.857944 0.09472490     0
## 6   1      1022001     1027000      1   0 26.730269 0.02078029     0

head(assoc$variantInfo)

##   variantID allele chr     pos n.obs         freq    weight
## 1         1      1   1  970546  1126 0.0039964476 22.709172
## 2         2      1   1  985900  1126 0.0492895204  7.431881
## 3         3      1   1 1025045  1126 0.0004440497 24.734926
## 4         4      1   1 1265550  1126 0.0008880995 24.472547
## 5         5      1   1 1472676  1126 0.0071047957 21.067933
## 6         6      1   1 1735725  1126 0.0022202487 23.701317

qqPlot(assoc$results$pval_0)

7.4.1 Exercise

Repeat the previous exercise on logistic regression, this time running a sliding-window test.

nullmod.status <- fitNullMM(annot, outcome="status", covars=c("sex", "Population"), 
                            covMatList=grm$grm, family=binomial, verbose=FALSE)
assoc <- assocTestSeqWindow(seqData, nullmod.status, test="Burden", AF.range=c(0,0.1),
                            weight.beta=c(1,1), window.size=5, window.shift=2)
head(assoc$results)

##   chr window.start window.stop n.site dup burden.skew       Score
## 1   1       966001      971000      1   0   11.036036  0.20297260
## 2   1       968001      973000      1   1   11.036036  0.20297260
## 3   1       970001      975000      1   1   11.036036  0.20297260
## 4   1       982001      987000      1   0    3.041979 -2.63528220
## 5   1       984001      989000      1   1    3.041979 -2.63528220
## 6   1      1022001     1027000      1   0   33.466573 -0.09788686
##          Var Score.stat Score.pval
## 1 0.69451766 0.05931869  0.8075760
## 2 0.69451766 0.05931869  0.8075760
## 3 0.69451766 0.05931869  0.8075760
## 4 6.97323978 0.99590900  0.3183024
## 5 6.97323978 0.99590900  0.3183024
## 6 0.08671779 0.11049448  0.7395819

7.5 Annotation-based aggregate tests

Note: the code and libraries in this section are under active development, and are not production-level. It is provided to give workshop participants an example of some of the kinds of analysis tasks that might be performed with TOPMed annotation data. Use the code at your own risk, and be warned that it may break in unexpected ways. Github issues and contributions are welcome!

Analysts generally aggregate rare variants for association testing to decrease multiple testing burden and increase statistical power. They can group variants that fall within arbitrary ranges (such as sliding-windows), or they can group variants with intent. For example, an analyst could aggregate variants that that fall between transcription start sites and stop sites, within coding regions, within regulatory regions, or other genomic features selected from sources like published gene models or position- or transcript-based variant annotation. An analyst could also choose to filter the variants prior or subsequent to aggregation using annotation-based criteria such as functional impact or quality scores.

To demonstrate, we will aggregate a subset of TOPMed variants from chromosome 22. The subset is a portion of TOPMed SNP and indel variants that are also in the 1000 Genomes Project. We will parse an example variant annotation file to select fields of interest, parse a GENCODE .gtf file to define our genic units, and then aggregate the selected variants into the defined genic units.

7.5.1 Working with variant annotation

Variants called from the TOPMed data set are annotated using the Whole Genome Sequence Annotator (WGSA). WGSA output files include 359 annotation fields, some of which are themselves lists of annotation values. Thus, individual variants may be annotated with more than 1000 individual fields. WGSA produces different output files for different featues. TOPMed variant annotation includes separate files for SNPs and for indels. The subsetted variant annotation files we will use for this example are available via github:

ben.workshop.path <- "https://github.com/bheavner/topmed_workshop_2017_bh/raw/master"
snpfile <- "snp.tsv.gz"
if (!file.exists(snpfile)) download.file(file.path(ben.workshop.path, snpfile), snpfile)

indelfile <- "indel.tsv.gz"
if (!file.exists(indelfile)) download.file(file.path(ben.workshop.path, indelfile), indelfile)

The WGSA output files are tab-separated text files, with one line per annotated variant. Since there are many annotation fields, these files can be unwieldy to work with directly. As an example, the first two lines of the SNP variant annotation file can be previewed within R:

readLines("snp.tsv.gz", n=2)

## [1] "#chr\tpos\tref\talt\tANNOVAR_ensembl_Effect\tANNOVAR_ensembl_Transcript_ID\tANNOVAR_ensembl_Gene_ID\tANNOVAR_ensembl_Closest_gene(intergenic_only)\tANNOVAR_ensembl_HGVSc\tANNOVAR_ensembl_HGVSp\tANNOVAR_ensembl_Exon_Rank\tANNOVAR_ensembl_summary\tSnpEff_ensembl_Effect\tSnpEff_ensembl_Effect_impact\tSnpEff_ensembl_Sequence_feature\tSnpEff_ensembl_Sequence_feature_impact\tSnpEff_ensembl_Transcript_ID\tSnpEff_ensembl_Transcript_biotype\tSnpEff_ensembl_Gene_name\tSnpEff_ensembl_Gene_ID\tSnpEff_ensembl_HGVSc\tSnpEff_ensembl_HGVSp\tSnpEff_ensembl_Protein_position/Protein_len\tSnpEff_ensembl_CDS_position/CDS_len\tSnpEff_ensembl_cDNA_position/cDNA_len\tSnpEff_ensembl_Exon_or_intron_rank/total\tSnpEff_ensembl_Distance_to_feature\tSnpEff_ensembl_Warnings\tSnpEff_ensembl_LOF/NMD\tSnpEff_ensembl_LOF/NMD_gene_name\tSnpEff_ensembl_LOF/NMD_gene_ID\tSnpEff_ensembl_LOF/NMD_num_transcripts_affected\tSnpEff_ensembl_LOF/NMD_percent_transcripts_affected\tSnpEff_ensembl_TF_binding_effect\tSnpEff_ensembl_TF_name\tSnpEff_ensembl_TF_ID\tSnpEff_ensembl_summary\tVEP_ensembl_Consequence\tVEP_ensembl_Transcript_ID\tVEP_ensembl_Gene_Name\tVEP_ensembl_Gene_ID\tVEP_ensembl_Protein_ID\tVEP_ensembl_CCDS\tVEP_ensembl_SWISSPROT\tVEP_ensembl_Codon_Change_or_Distance\tVEP_ensembl_Amino_Acid_Change\tVEP_ensembl_HGVSc\tVEP_ensembl_HGVSp\tVEP_ensembl_cDNA_position\tVEP_ensembl_CDS_position\tVEP_ensembl_Protein_position\tVEP_ensembl_Exon_or_Intron_Rank\tVEP_ensembl_STRAND\tVEP_ensembl_CANONICAL\tVEP_ensembl_LoF\tVEP_ensembl_LoF_filter\tVEP_ensembl_LoF_flags\tVEP_ensembl_LoF_info\tVEP_ensembl_summary\tANNOVAR_refseq_Effect\tANNOVAR_refseq_Transcript_ID\tANNOVAR_refseq_Gene_ID\tANNOVAR_refseq_Closest_gene(intergenic_only)\tANNOVAR_refseq_HGVSc\tANNOVAR_refseq_HGVSp\tANNOVAR_refseq_Exon_Rank\tANNOVAR_refseq_summary\tSnpEff_refseq_Effect\tSnpEff_refseq_Effect_impact\tSnpEff_refseq_Sequence_feature\tSnpEff_refseq_Sequence_feature_impact\tSnpEff_refseq_Transcript_ID\tSnpEff_refseq_Transcript_biotype\tSnpEff_refseq_Gene_name\tSnpEff_refseq_Gene_ID\tSnpEff_refseq_HGVSc\tSnpEff_refseq_HGVSp\tSnpEff_refseq_Protein_position/Protein_len\tSnpEff_refseq_CDS_position/CDS_len\tSnpEff_refseq_cDNA_position/cDNA_len\tSnpEff_refseq_Exon_or_intron_rank/total\tSnpEff_refseq_Distance_to_feature\tSnpEff_refseq_Warnings\tSnpEff_refseq_LOF/NMD\tSnpEff_refseq_LOF/NMD_gene_name\tSnpEff_refseq_LOF/NMD_gene_ID\tSnpEff_refseq_LOF/NMD_num_transcripts_affected\tSnpEff_refseq_LOF/NMD_percent_transcripts_affected\tSnpEff_refseq_TF_binding_effect\tSnpEff_refseq_TF_name\tSnpEff_refseq_TF_ID\tSnpEff_refseq_summary\tVEP_refseq_Consequence\tVEP_refseq_Transcript_ID\tVEP_refseq_Gene_Name\tVEP_refseq_Gene_ID\tVEP_refseq_Protein_ID\tVEP_refseq_Codon_Change_or_Distance\tVEP_refseq_Amino_Acid_Change\tVEP_refseq_HGVSc\tVEP_refseq_HGVSp\tVEP_refseq_cDNA_position\tVEP_refseq_CDS_position\tVEP_refseq_Protein_position\tVEP_refseq_Exon_or_Intron_Rank\tVEP_refseq_STRAND\tVEP_refseq_CANONICAL\tVEP_refseq_LoF\tVEP_refseq_LoF_filter\tVEP_refseq_LoF_flags\tVEP_refseq_LoF_info\tVEP_refseq_summary\tANNOVAR_ucsc_Effect\tANNOVAR_ucsc_Transcript_ID\tANNOVAR_ucsc_Gene_ID\tANNOVAR_ucsc_Closest_gene(intergenic_only)\tANNOVAR_ucsc_HGVSc\tANNOVAR_ucsc_HGVSp\tANNOVAR_ucsc_Exon_Rank\tANNOVAR_ucsc_summary\trs_dbSNP147\tsno_miRNA_name\tsno_miRNA_type\tUTR3_miRNA_target\tTargetScan_context++_score_percentile\tsplicing_consensus_ada_score\tsplicing_consensus_rf_score\tGWAS_catalog_rs\tGWAS_catalog_trait\tGWAS_catalog_pubmedid\tGRASP_rs\tGRASP_PMID\tGRASP_p-value\tGRASP_phenotype\tGRASP_ancestry\tGRASP_platform\tclinvar_rs\tclinvar_clnsig\tclinvar_trait\tclinvar_golden_stars\tGTEx_V6_gene\tGTEx_V6_tissue\tMAP20\tMAP35\tMAP20(+-149bp)\tMAP35(+-149bp)\tGMS_single-end\tGMS_paired-end\t1000G_strict_masked\tRepeatMasker_masked\tAncestral_allele\tAltaiNeandertal\tDenisova\tphyloP46way_primate\tphyloP46way_primate_rankscore\tphyloP46way_placental\tphyloP46way_placental_rankscore\tphyloP100way_vertebrate\tphyloP100way_vertebrate_rankscore\tphastCons46way_primate\tphastCons46way_primate_rankscore\tphastCons46way_placental\tphastCons46way_placental_rankscore\tphastCons100way_vertebrate\tphastCons100way_vertebrate_rankscore\tGERP_NR\tGERP_RS\tGERP_RS_rankscore\tSiPhy_29way_logOdds\tSiPhy_29way_logOdds_rankscore\tintegrated_fitCons_score\tintegrated_fitCons_rankscore\tintegrated_confidence_value\tGM12878_fitCons_score\tGM12878_fitCons_rankscore\tGM12878_confidence_value\tH1-hESC_fitCons_score\tH1-hESC_fitCons_rankscore\tH1-hESC_confidence_value\tHUVEC_fitCons_score\tHUVEC_fitCons_rankscore\tHUVEC_confidence_value\tGenoCanyon_score\tGenoCanyon_rankscore\t1000Gp3_AC\t1000Gp3_AF\t1000Gp3_AFR_AC\t1000Gp3_AFR_AF\t1000Gp3_EUR_AC\t1000Gp3_EUR_AF\t1000Gp3_AMR_AC\t1000Gp3_AMR_AF\t1000Gp3_EAS_AC\t1000Gp3_EAS_AF\t1000Gp3_SAS_AC\t1000Gp3_SAS_AF\tTWINSUK_AC\tTWINSUK_AF\tALSPAC_AC\tALSPAC_AF\tESP6500_AA_AC\tESP6500_AA_AF\tESP6500_EA_AC\tESP6500_EA_AF\tExAC_AC\tExAC_AF\tExAC_Adj_AC\tExAC_Adj_AF\tExAC_AFR_AC\tExAC_AFR_AF\tExAC_AMR_AC\tExAC_AMR_AF\tExAC_EAS_AC\tExAC_EAS_AF\tExAC_FIN_AC\tExAC_FIN_AF\tExAC_NFE_AC\tExAC_NFE_AF\tExAC_SAS_AC\tExAC_SAS_AF\tExAC_nonTCGA_AC\tExAC_nonTCGA_AF\tExAC_nonTCGA_Adj_AC\tExAC_nonTCGA_Adj_AF\tExAC_nonTCGA_AFR_AC\tExAC_nonTCGA_AFR_AF\tExAC_nonTCGA_AMR_AC\tExAC_nonTCGA_AMR_AF\tExAC_nonTCGA_EAS_AC\tExAC_nonTCGA_EAS_AF\tExAC_nonTCGA_FIN_AC\tExAC_nonTCGA_FIN_AF\tExAC_nonTCGA_NFE_AC\tExAC_nonTCGA_NFE_AF\tExAC_nonTCGA_SAS_AC\tExAC_nonTCGA_SAS_AF\tExAC_nonpsych_AC\tExAC_nonpsych_AF\tExAC_nonpsych_Adj_AC\tExAC_nonpsych_Adj_AF\tExAC_nonpsych_AFR_AC\tExAC_nonpsych_AFR_AF\tExAC_nonpsych_AMR_AC\tExAC_nonpsych_AMR_AF\tExAC_nonpsych_EAS_AC\tExAC_nonpsych_EAS_AF\tExAC_nonpsych_FIN_AC\tExAC_nonpsych_FIN_AF\tExAC_nonpsych_NFE_AC\tExAC_nonpsych_NFE_AF\tExAC_nonpsych_SAS_AC\tExAC_nonpsych_SAS_AF\tRegulomeDB_motif\tRegulomeDB_score\tMotif_breaking\tnetwork_hub\tENCODE_annotated\tsensitive\tultra_sensitive\ttarget_gene\tfunseq_noncoding_score\tfunseq2_noncoding_score\tfunseq2_noncoding_rankscore\tCADD_raw\tCADD_phred\tCADD_raw_rankscore\tDANN_score\tDANN_rank_score\tfathmm-MKL_non-coding_score\tfathmm-MKL_non-coding_rankscore\tfathmm-MKL_non-coding_pred\tfathmm-MKL_non-coding_group\tfathmm-MKL_coding_score\tfathmm-MKL_coding_rankscore\tfathmm-MKL_coding_pred\tfathmm-MKL_coding_group\tEigen-raw\tEigen-phred\tEigen_coding_or_noncoding\tEigen-raw_rankscore\tEigen-PC-raw\tEigen-PC-raw_rankscore\tORegAnno_type\tORegAnno_PMID\tENCODE_TFBS\tENCODE_TFBS_score\tENCODE_TFBS_cells\tENCODE_Dnase_score\tENCODE_Dnase_cells\tEnhancerFinder_general_developmental_enhancer\tEnhancerFinder_brain_enhancer\tEnhancerFinder_heart_enhancer\tEnhancerFinder_limb_enhancer\tSuperEnhancer_tissue_cell\tSuperEnhancer_RefSeq_id\tSuperEnhancer_Gene_symbol\tFANTOM5_enhancer_permissive\tFANTOM5_enhancer_robust\tFANTOM5_enhancer_target\tFANTOM5_enhancer_expressed_tissue_cell\tFANTOM5_enhancer_differentially_expressed_tissue_cell\tFANTOM5_CAGE_peak_permissive\tFANTOM5_CAGE_peak_robust\tEnsembl_Regulatory_Build_Overviews\tEnsembl_Regulatory_Build_TFBS\tEnsembl_Regulatory_Build_TFBS_prob\taaref\taaalt\tgenename\tUniprot_acc\tUniprot_id\tUniprot_aapos\tInterpro_domain\tcds_strand\trefcodon\tSLR_test_statistic \tcodonpos\tfold-degenerate\tEnsembl_geneid\tEnsembl_transcriptid\taapos\taapos_SIFT\taapos_FATHMM\tSIFT_score\tSIFT_converted_rankscore\tSIFT_pred\tPolyphen2_HDIV_score\tPolyphen2_HDIV_rankscore\tPolyphen2_HDIV_pred\tPolyphen2_HVAR_score\tPolyphen2_HVAR_rankscore\tPolyphen2_HVAR_pred\tLRT_score\tLRT_converted_rankscore\tLRT_pred\tMutationTaster_score\tMutationTaster_converted_rankscore\tMutationTaster_pred\tMutationAssessor_score\tMutationAssessor_rankscore\tMutationAssessor_pred\tFATHMM_score\tFATHMM_rankscore\tFATHMM_pred\tMetaSVM_score\tMetaSVM_rankscore\tMetaSVM_pred\tMetaLR_score\tMetaLR_rankscore\tMetaLR_pred\tReliability_index\tVEST3_score\tVEST3_rankscore\tPROVEAN_score\tPROVEAN_converted_rankscore\tPROVEAN_pred\tANNOVAR_ensembl_precedent_consequence\tANNOVAR_ensembl_precedent_gene\tunique_variant"
## [2] "22\t19055814\tC\tT\tintronic|intronic|intronic|intronic\tENST00000263196|ENST00000389262|ENST00000537045|ENST00000545799\tENSG00000070413|ENSG00000070413|ENSG00000070413|ENSG00000070413\t.|.|.|.\t.|.|.|.\t.|.|.|.\t.|.|.|.\tENSG00000070413(4):intronic(4)\tintron_variant|intron_variant|upstream_gene_variant|intron_variant|intron_variant\tMODIFIER|MODIFIER|MODIFIER|MODIFIER|MODIFIER\tdomain:LDL-receptor_class_A|.|.|.|.\tLOW|.|.|.|.\tENST00000263196|ENST00000389262|ENST00000473832|ENST00000537045|ENST00000545799\tprotein_coding|nonsense_mediated_decay|processed_transcript|protein_coding|protein_coding\tDGCR2|DGCR2|DGCR2|DGCR2|DGCR2\tENSG00000070413|ENSG00000070413|ENSG00000070413|ENSG00000070413|ENSG00000070413\tc.203-76G>A|c.-851-76G>A|n.-2397G>A|c.80-76G>A|c.203-76G>A\t.|.|.|.|.\t.|.|.|.|.\t.|.|.|.|.\t.|.|.|.|.\t2/9|2/10|.|1/8|2/10\t.|.|2397|.|.\t.|.|.|.|.\t.\t.\t.\t.\t.\t.\t.\t.\tDGCR2(7):intron_variant(4)upstream_gene_variant(1)\tintron_variant|upstream_gene_variant|intron_variant|intron_variant,NMD_transcript_variant|intron_variant\tENST00000537045|ENST00000473832|ENST00000263196|ENST00000389262|ENST00000545799\tDGCR2|DGCR2|DGCR2|DGCR2|DGCR2\tENSG00000070413|ENSG00000070413|ENSG00000070413|ENSG00000070413|ENSG00000070413\tENSP00000440062|.|ENSP00000263196|ENSP00000373914|ENSP00000445069\tCCDS54496.1|.|CCDS33598.1|.|.\tIDD_HUMAN|.|IDD_HUMAN|.|.\t.|2397|.|.|.\t.|.|.|.|.\tENST00000537045.1:c.80-76G>A|.|ENST00000263196.7:c.203-76G>A|ENST00000389262.4:c.-851-76G>A|ENST00000545799.1:c.203-76G>A\t.|.|.|.|.\t.|.|.|.|.\t.|.|.|.|.\t.|.|.|.|.\t1/8|.|2/9|2/10|2/10\t-|-|-|-|-\t.|.|YES|.|.\t.|.|.|.|.\t.|.|.|.|.\t.|.|.|.|.\t.|.|.|.|.\tDGCR2(7):intron_variant(3)intron_variant,NMD_transcript_variant(1)upstream_gene_variant(1)\tintronic|intronic|intronic|intronic\tNM_001173533|NM_001173534|NM_001184781|NM_005137\tDGCR2|DGCR2|DGCR2|DGCR2\t.|.|.|.\t.|.|.|.\t.|.|.|.\t.|.|.|.\tDGCR2(4):intronic(4)\tintron_variant|intron_variant|intron_variant|intron_variant|intron_variant\tMODIFIER|MODIFIER|MODIFIER|MODIFIER|MODIFIER\t.|.|.|.|.\t.|.|.|.|.\tNM_001173533.1|NM_001173534.1|NM_001184781.1|NM_005137.2|NR_033674.1\tprotein_coding|protein_coding|protein_coding|protein_coding|pseudogene\tDGCR2|DGCR2|DGCR2|DGCR2|DGCR2\tDGCR2|DGCR2|DGCR2|DGCR2|DGCR2\tc.80-76G>A|c.80-76G>A|c.203-76G>A|c.203-76G>A|n.328-76G>A\t.|.|.|.|.\t.|.|.|.|.\t.|.|.|.|.\t.|.|.|.|.\t1/8|1/8|2/9|2/9|1/8\t.|.|.|.|.\t.|.|.|.|.\t.\t.\t.\t.\t.\t.\t.\t.\tDGCR2(5):intron_variant(5)\tintron_variant|intron_variant|intron_variant|intron_variant|intron_variant,non_coding_transcript_variant\tNM_001173534.1|NM_001184781.1|NM_001173533.1|NM_005137.2|NR_033674.1\tDGCR2|DGCR2|DGCR2|DGCR2|DGCR2\t9993|9993|9993|9993|9993\tNP_001167005.1|NP_001171710.1|NP_001167004.1|NP_005128.1|.\t.|.|.|.|.\t.|.|.|.|.\tNM_001173534.1:c.80-76G>A|NM_001184781.1:c.203-76G>A|NM_001173533.1:c.80-76G>A|NM_005137.2:c.203-76G>A|NR_033674.1:n.328-76G>A\t.|.|.|.|.\t.|.|.|.|.\t.|.|.|.|.\t.|.|.|.|.\t1/8|2/9|1/8|2/9|1/8\t-|-|-|-|-\t.|.|.|YES|.\t.|.|.|.|.\t.|.|.|.|.\t.|.|.|.|.\t.|.|.|.|.\tDGCR2(5):intron_variant(4)intron_variant,non_coding_transcript_variant(1)\tintronic|intronic|intronic|intronic|intronic|intronic\tuc002zoq.1|uc002zor.1|uc011agr.1|uc021wkx.1|uc021wky.1|uc021wkz.1\tDGCR2|DGCR2|DGCR2|DGCR2|DGCR2|DGCR2\t.|.|.|.|.|.\t.|.|.|.|.|.\t.|.|.|.|.|.\t.|.|.|.|.|.\tDGCR2(6):intronic(6)\trs546724885\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t1.0\t1.0\t0.99285716\t1.0\t98.9773\t98.9998\tY\tN\tC\tC/C\tC/C\t-1.193\t0.05127\t-0.915\t0.09469\t-0.133\t0.32678\t0.001\t0.05886\t0.000\t0.14292\t0.000\t0.16763\t3.71\t-3.62\t0.06162\t7.0802\t0.86688\t0.084033\t0.81986\t0\t0.047999\t0.00538\t0\t0.081037\t0.85223\t0\t0.089874\t0.86623\t0\t0.998263009421969\t0.82144\t1\t1.9968051118210862E-4\t1\t7.564296520423601E-4\t0\t0.0\t0\t0.0\t0\t0.0\t0\t0.0\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tChromatin_Structure|DNase-seq\t5\t.\t.\tDHS(MCV-5|chr22:19055665-19055815)\t.\t.\t.\t1\t1.47702766604517\t0.97241\t-0.086269\t1.834\t0.34517\t0.47717823829120337\t0.25900\t0.06740\t0.15147\tN\tADB\t0.00030\t0.00848\tN\tAEFDBI\t-0.118960630637635\t3.27754\tn\t0.52812\t0.302794242594935\t0.92771\t.\t.\t.\t.\t.\t.\t.\tN\tN\tN\tN\t.\t.\t.\tN\tN\t.\t.\t.\tN\tN\topen\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tintronic\tDGCR2\tY"

The DCC has begun an R package, wgsaparsr, to begin working with WGSA output files. This package is under development, and is available on github at https://github.com/UW-GAC/wgsaparsr. For now, the package can be installed from github using the devtools package:

#library(devtools)
#devtools::install_github("UW-GAC/wgsaparsr@1.0.0.9003")
library(wgsaparsr)
#library(tidyverse) # just in case it's not loaded yet
library(tibble)
library(dplyr)
library(tidyr)
library(readr)

wgsaparsr includes a get_fields() function to list the annotation fields available in a WGSA output file:

# list all fields in an annotation file: 
get_fields("snp.tsv.gz")

##   [1] "#chr"                                                 
##   [2] "pos"                                                  
##   [3] "ref"                                                  
##   [4] "alt"                                                  
##   [5] "ANNOVAR_ensembl_Effect"                               
##   [6] "ANNOVAR_ensembl_Transcript_ID"                        
##   [7] "ANNOVAR_ensembl_Gene_ID"                              
##   [8] "ANNOVAR_ensembl_Closest_gene(intergenic_only)"        
##   [9] "ANNOVAR_ensembl_HGVSc"                                
##  [10] "ANNOVAR_ensembl_HGVSp"                                
##  [11] "ANNOVAR_ensembl_Exon_Rank"                            
##  [12] "ANNOVAR_ensembl_summary"                              
##  [13] "SnpEff_ensembl_Effect"                                
##  [14] "SnpEff_ensembl_Effect_impact"                         
##  [15] "SnpEff_ensembl_Sequence_feature"                      
##  [16] "SnpEff_ensembl_Sequence_feature_impact"               
##  [17] "SnpEff_ensembl_Transcript_ID"                         
##  [18] "SnpEff_ensembl_Transcript_biotype"                    
##  [19] "SnpEff_ensembl_Gene_name"                             
##  [20] "SnpEff_ensembl_Gene_ID"                               
##  [21] "SnpEff_ensembl_HGVSc"                                 
##  [22] "SnpEff_ensembl_HGVSp"                                 
##  [23] "SnpEff_ensembl_Protein_position/Protein_len"          
##  [24] "SnpEff_ensembl_CDS_position/CDS_len"                  
##  [25] "SnpEff_ensembl_cDNA_position/cDNA_len"                
##  [26] "SnpEff_ensembl_Exon_or_intron_rank/total"             
##  [27] "SnpEff_ensembl_Distance_to_feature"                   
##  [28] "SnpEff_ensembl_Warnings"                              
##  [29] "SnpEff_ensembl_LOF/NMD"                               
##  [30] "SnpEff_ensembl_LOF/NMD_gene_name"                     
##  [31] "SnpEff_ensembl_LOF/NMD_gene_ID"                       
##  [32] "SnpEff_ensembl_LOF/NMD_num_transcripts_affected"      
##  [33] "SnpEff_ensembl_LOF/NMD_percent_transcripts_affected"  
##  [34] "SnpEff_ensembl_TF_binding_effect"                     
##  [35] "SnpEff_ensembl_TF_name"                               
##  [36] "SnpEff_ensembl_TF_ID"                                 
##  [37] "SnpEff_ensembl_summary"                               
##  [38] "VEP_ensembl_Consequence"                              
##  [39] "VEP_ensembl_Transcript_ID"                            
##  [40] "VEP_ensembl_Gene_Name"                                
##  [41] "VEP_ensembl_Gene_ID"                                  
##  [42] "VEP_ensembl_Protein_ID"                               
##  [43] "VEP_ensembl_CCDS"                                     
##  [44] "VEP_ensembl_SWISSPROT"                                
##  [45] "VEP_ensembl_Codon_Change_or_Distance"                 
##  [46] "VEP_ensembl_Amino_Acid_Change"                        
##  [47] "VEP_ensembl_HGVSc"                                    
##  [48] "VEP_ensembl_HGVSp"                                    
##  [49] "VEP_ensembl_cDNA_position"                            
##  [50] "VEP_ensembl_CDS_position"                             
##  [51] "VEP_ensembl_Protein_position"                         
##  [52] "VEP_ensembl_Exon_or_Intron_Rank"                      
##  [53] "VEP_ensembl_STRAND"                                   
##  [54] "VEP_ensembl_CANONICAL"                                
##  [55] "VEP_ensembl_LoF"                                      
##  [56] "VEP_ensembl_LoF_filter"                               
##  [57] "VEP_ensembl_LoF_flags"                                
##  [58] "VEP_ensembl_LoF_info"                                 
##  [59] "VEP_ensembl_summary"                                  
##  [60] "ANNOVAR_refseq_Effect"                                
##  [61] "ANNOVAR_refseq_Transcript_ID"                         
##  [62] "ANNOVAR_refseq_Gene_ID"                               
##  [63] "ANNOVAR_refseq_Closest_gene(intergenic_only)"         
##  [64] "ANNOVAR_refseq_HGVSc"                                 
##  [65] "ANNOVAR_refseq_HGVSp"                                 
##  [66] "ANNOVAR_refseq_Exon_Rank"                             
##  [67] "ANNOVAR_refseq_summary"                               
##  [68] "SnpEff_refseq_Effect"                                 
##  [69] "SnpEff_refseq_Effect_impact"                          
##  [70] "SnpEff_refseq_Sequence_feature"                       
##  [71] "SnpEff_refseq_Sequence_feature_impact"                
##  [72] "SnpEff_refseq_Transcript_ID"                          
##  [73] "SnpEff_refseq_Transcript_biotype"                     
##  [74] "SnpEff_refseq_Gene_name"                              
##  [75] "SnpEff_refseq_Gene_ID"                                
##  [76] "SnpEff_refseq_HGVSc"                                  
##  [77] "SnpEff_refseq_HGVSp"                                  
##  [78] "SnpEff_refseq_Protein_position/Protein_len"           
##  [79] "SnpEff_refseq_CDS_position/CDS_len"                   
##  [80] "SnpEff_refseq_cDNA_position/cDNA_len"                 
##  [81] "SnpEff_refseq_Exon_or_intron_rank/total"              
##  [82] "SnpEff_refseq_Distance_to_feature"                    
##  [83] "SnpEff_refseq_Warnings"                               
##  [84] "SnpEff_refseq_LOF/NMD"                                
##  [85] "SnpEff_refseq_LOF/NMD_gene_name"                      
##  [86] "SnpEff_refseq_LOF/NMD_gene_ID"                        
##  [87] "SnpEff_refseq_LOF/NMD_num_transcripts_affected"       
##  [88] "SnpEff_refseq_LOF/NMD_percent_transcripts_affected"   
##  [89] "SnpEff_refseq_TF_binding_effect"                      
##  [90] "SnpEff_refseq_TF_name"                                
##  [91] "SnpEff_refseq_TF_ID"                                  
##  [92] "SnpEff_refseq_summary"                                
##  [93] "VEP_refseq_Consequence"                               
##  [94] "VEP_refseq_Transcript_ID"                             
##  [95] "VEP_refseq_Gene_Name"                                 
##  [96] "VEP_refseq_Gene_ID"                                   
##  [97] "VEP_refseq_Protein_ID"                                
##  [98] "VEP_refseq_Codon_Change_or_Distance"                  
##  [99] "VEP_refseq_Amino_Acid_Change"                         
## [100] "VEP_refseq_HGVSc"                                     
## [101] "VEP_refseq_HGVSp"                                     
## [102] "VEP_refseq_cDNA_position"                             
## [103] "VEP_refseq_CDS_position"                              
## [104] "VEP_refseq_Protein_position"                          
## [105] "VEP_refseq_Exon_or_Intron_Rank"                       
## [106] "VEP_refseq_STRAND"                                    
## [107] "VEP_refseq_CANONICAL"                                 
## [108] "VEP_refseq_LoF"                                       
## [109] "VEP_refseq_LoF_filter"                                
## [110] "VEP_refseq_LoF_flags"                                 
## [111] "VEP_refseq_LoF_info"                                  
## [112] "VEP_refseq_summary"                                   
## [113] "ANNOVAR_ucsc_Effect"                                  
## [114] "ANNOVAR_ucsc_Transcript_ID"                           
## [115] "ANNOVAR_ucsc_Gene_ID"                                 
## [116] "ANNOVAR_ucsc_Closest_gene(intergenic_only)"           
## [117] "ANNOVAR_ucsc_HGVSc"                                   
## [118] "ANNOVAR_ucsc_HGVSp"                                   
## [119] "ANNOVAR_ucsc_Exon_Rank"                               
## [120] "ANNOVAR_ucsc_summary"                                 
## [121] "rs_dbSNP147"                                          
## [122] "sno_miRNA_name"                                       
## [123] "sno_miRNA_type"                                       
## [124] "UTR3_miRNA_target"                                    
## [125] "TargetScan_context++_score_percentile"                
## [126] "splicing_consensus_ada_score"                         
## [127] "splicing_consensus_rf_score"                          
## [128] "GWAS_catalog_rs"                                      
## [129] "GWAS_catalog_trait"                                   
## [130] "GWAS_catalog_pubmedid"                                
## [131] "GRASP_rs"                                             
## [132] "GRASP_PMID"                                           
## [133] "GRASP_p-value"                                        
## [134] "GRASP_phenotype"                                      
## [135] "GRASP_ancestry"                                       
## [136] "GRASP_platform"                                       
## [137] "clinvar_rs"                                           
## [138] "clinvar_clnsig"                                       
## [139] "clinvar_trait"                                        
## [140] "clinvar_golden_stars"                                 
## [141] "GTEx_V6_gene"                                         
## [142] "GTEx_V6_tissue"                                       
## [143] "MAP20"                                                
## [144] "MAP35"                                                
## [145] "MAP20(+-149bp)"                                       
## [146] "MAP35(+-149bp)"                                       
## [147] "GMS_single-end"                                       
## [148] "GMS_paired-end"                                       
## [149] "1000G_strict_masked"                                  
## [150] "RepeatMasker_masked"                                  
## [151] "Ancestral_allele"                                     
## [152] "AltaiNeandertal"                                      
## [153] "Denisova"                                             
## [154] "phyloP46way_primate"                                  
## [155] "phyloP46way_primate_rankscore"                        
## [156] "phyloP46way_placental"                                
## [157] "phyloP46way_placental_rankscore"                      
## [158] "phyloP100way_vertebrate"                              
## [159] "phyloP100way_vertebrate_rankscore"                    
## [160] "phastCons46way_primate"                               
## [161] "phastCons46way_primate_rankscore"                     
## [162] "phastCons46way_placental"                             
## [163] "phastCons46way_placental_rankscore"                   
## [164] "phastCons100way_vertebrate"                           
## [165] "phastCons100way_vertebrate_rankscore"                 
## [166] "GERP_NR"                                              
## [167] "GERP_RS"                                              
## [168] "GERP_RS_rankscore"                                    
## [169] "SiPhy_29way_logOdds"                                  
## [170] "SiPhy_29way_logOdds_rankscore"                        
## [171] "integrated_fitCons_score"                             
## [172] "integrated_fitCons_rankscore"                         
## [173] "integrated_confidence_value"                          
## [174] "GM12878_fitCons_score"                                
## [175] "GM12878_fitCons_rankscore"                            
## [176] "GM12878_confidence_value"                             
## [177] "H1-hESC_fitCons_score"                                
## [178] "H1-hESC_fitCons_rankscore"                            
## [179] "H1-hESC_confidence_value"                             
## [180] "HUVEC_fitCons_score"                                  
## [181] "HUVEC_fitCons_rankscore"                              
## [182] "HUVEC_confidence_value"                               
## [183] "GenoCanyon_score"                                     
## [184] "GenoCanyon_rankscore"                                 
## [185] "1000Gp3_AC"                                           
## [186] "1000Gp3_AF"                                           
## [187] "1000Gp3_AFR_AC"                                       
## [188] "1000Gp3_AFR_AF"                                       
## [189] "1000Gp3_EUR_AC"                                       
## [190] "1000Gp3_EUR_AF"                                       
## [191] "1000Gp3_AMR_AC"                                       
## [192] "1000Gp3_AMR_AF"                                       
## [193] "1000Gp3_EAS_AC"                                       
## [194] "1000Gp3_EAS_AF"                                       
## [195] "1000Gp3_SAS_AC"                                       
## [196] "1000Gp3_SAS_AF"                                       
## [197] "TWINSUK_AC"                                           
## [198] "TWINSUK_AF"                                           
## [199] "ALSPAC_AC"                                            
## [200] "ALSPAC_AF"                                            
## [201] "ESP6500_AA_AC"                                        
## [202] "ESP6500_AA_AF"                                        
## [203] "ESP6500_EA_AC"                                        
## [204] "ESP6500_EA_AF"                                        
## [205] "ExAC_AC"                                              
## [206] "ExAC_AF"                                              
## [207] "ExAC_Adj_AC"                                          
## [208] "ExAC_Adj_AF"                                          
## [209] "ExAC_AFR_AC"                                          
## [210] "ExAC_AFR_AF"                                          
## [211] "ExAC_AMR_AC"                                          
## [212] "ExAC_AMR_AF"                                          
## [213] "ExAC_EAS_AC"                                          
## [214] "ExAC_EAS_AF"                                          
## [215] "ExAC_FIN_AC"                                          
## [216] "ExAC_FIN_AF"                                          
## [217] "ExAC_NFE_AC"                                          
## [218] "ExAC_NFE_AF"                                          
## [219] "ExAC_SAS_AC"                                          
## [220] "ExAC_SAS_AF"                                          
## [221] "ExAC_nonTCGA_AC"                                      
## [222] "ExAC_nonTCGA_AF"                                      
## [223] "ExAC_nonTCGA_Adj_AC"                                  
## [224] "ExAC_nonTCGA_Adj_AF"                                  
## [225] "ExAC_nonTCGA_AFR_AC"                                  
## [226] "ExAC_nonTCGA_AFR_AF"                                  
## [227] "ExAC_nonTCGA_AMR_AC"                                  
## [228] "ExAC_nonTCGA_AMR_AF"                                  
## [229] "ExAC_nonTCGA_EAS_AC"                                  
## [230] "ExAC_nonTCGA_EAS_AF"                                  
## [231] "ExAC_nonTCGA_FIN_AC"                                  
## [232] "ExAC_nonTCGA_FIN_AF"                                  
## [233] "ExAC_nonTCGA_NFE_AC"                                  
## [234] "ExAC_nonTCGA_NFE_AF"                                  
## [235] "ExAC_nonTCGA_SAS_AC"                                  
## [236] "ExAC_nonTCGA_SAS_AF"                                  
## [237] "ExAC_nonpsych_AC"                                     
## [238] "ExAC_nonpsych_AF"                                     
## [239] "ExAC_nonpsych_Adj_AC"                                 
## [240] "ExAC_nonpsych_Adj_AF"                                 
## [241] "ExAC_nonpsych_AFR_AC"                                 
## [242] "ExAC_nonpsych_AFR_AF"                                 
## [243] "ExAC_nonpsych_AMR_AC"                                 
## [244] "ExAC_nonpsych_AMR_AF"                                 
## [245] "ExAC_nonpsych_EAS_AC"                                 
## [246] "ExAC_nonpsych_EAS_AF"                                 
## [247] "ExAC_nonpsych_FIN_AC"                                 
## [248] "ExAC_nonpsych_FIN_AF"                                 
## [249] "ExAC_nonpsych_NFE_AC"                                 
## [250] "ExAC_nonpsych_NFE_AF"                                 
## [251] "ExAC_nonpsych_SAS_AC"                                 
## [252] "ExAC_nonpsych_SAS_AF"                                 
## [253] "RegulomeDB_motif"                                     
## [254] "RegulomeDB_score"                                     
## [255] "Motif_breaking"                                       
## [256] "network_hub"                                          
## [257] "ENCODE_annotated"                                     
## [258] "sensitive"                                            
## [259] "ultra_sensitive"                                      
## [260] "target_gene"                                          
## [261] "funseq_noncoding_score"                               
## [262] "funseq2_noncoding_score"                              
## [263] "funseq2_noncoding_rankscore"                          
## [264] "CADD_raw"                                             
## [265] "CADD_phred"                                           
## [266] "CADD_raw_rankscore"                                   
## [267] "DANN_score"                                           
## [268] "DANN_rank_score"                                      
## [269] "fathmm-MKL_non-coding_score"                          
## [270] "fathmm-MKL_non-coding_rankscore"                      
## [271] "fathmm-MKL_non-coding_pred"                           
## [272] "fathmm-MKL_non-coding_group"                          
## [273] "fathmm-MKL_coding_score"                              
## [274] "fathmm-MKL_coding_rankscore"                          
## [275] "fathmm-MKL_coding_pred"                               
## [276] "fathmm-MKL_coding_group"                              
## [277] "Eigen-raw"                                            
## [278] "Eigen-phred"                                          
## [279] "Eigen_coding_or_noncoding"                            
## [280] "Eigen-raw_rankscore"                                  
## [281] "Eigen-PC-raw"                                         
## [282] "Eigen-PC-raw_rankscore"                               
## [283] "ORegAnno_type"                                        
## [284] "ORegAnno_PMID"                                        
## [285] "ENCODE_TFBS"                                          
## [286] "ENCODE_TFBS_score"                                    
## [287] "ENCODE_TFBS_cells"                                    
## [288] "ENCODE_Dnase_score"                                   
## [289] "ENCODE_Dnase_cells"                                   
## [290] "EnhancerFinder_general_developmental_enhancer"        
## [291] "EnhancerFinder_brain_enhancer"                        
## [292] "EnhancerFinder_heart_enhancer"                        
## [293] "EnhancerFinder_limb_enhancer"                         
## [294] "SuperEnhancer_tissue_cell"                            
## [295] "SuperEnhancer_RefSeq_id"                              
## [296] "SuperEnhancer_Gene_symbol"                            
## [297] "FANTOM5_enhancer_permissive"                          
## [298] "FANTOM5_enhancer_robust"                              
## [299] "FANTOM5_enhancer_target"                              
## [300] "FANTOM5_enhancer_expressed_tissue_cell"               
## [301] "FANTOM5_enhancer_differentially_expressed_tissue_cell"
## [302] "FANTOM5_CAGE_peak_permissive"                         
## [303] "FANTOM5_CAGE_peak_robust"                             
## [304] "Ensembl_Regulatory_Build_Overviews"                   
## [305] "Ensembl_Regulatory_Build_TFBS"                        
## [306] "Ensembl_Regulatory_Build_TFBS_prob"                   
## [307] "aaref"                                                
## [308] "aaalt"                                                
## [309] "genename"                                             
## [310] "Uniprot_acc"                                          
## [311] "Uniprot_id"                                           
## [312] "Uniprot_aapos"                                        
## [313] "Interpro_domain"                                      
## [314] "cds_strand"                                           
## [315] "refcodon"                                             
## [316] "SLR_test_statistic "                                  
## [317] "codonpos"                                             
## [318] "fold-degenerate"                                      
## [319] "Ensembl_geneid"                                       
## [320] "Ensembl_transcriptid"                                 
## [321] "aapos"                                                
## [322] "aapos_SIFT"                                           
## [323] "aapos_FATHMM"                                         
## [324] "SIFT_score"                                           
## [325] "SIFT_converted_rankscore"                             
## [326] "SIFT_pred"                                            
## [327] "Polyphen2_HDIV_score"                                 
## [328] "Polyphen2_HDIV_rankscore"                             
## [329] "Polyphen2_HDIV_pred"                                  
## [330] "Polyphen2_HVAR_score"                                 
## [331] "Polyphen2_HVAR_rankscore"                             
## [332] "Polyphen2_HVAR_pred"                                  
## [333] "LRT_score"                                            
## [334] "LRT_converted_rankscore"                              
## [335] "LRT_pred"                                             
## [336] "MutationTaster_score"                                 
## [337] "MutationTaster_converted_rankscore"                   
## [338] "MutationTaster_pred"                                  
## [339] "MutationAssessor_score"                               
## [340] "MutationAssessor_rankscore"                           
## [341] "MutationAssessor_pred"                                
## [342] "FATHMM_score"                                         
## [343] "FATHMM_rankscore"                                     
## [344] "FATHMM_pred"                                          
## [345] "MetaSVM_score"                                        
## [346] "MetaSVM_rankscore"                                    
## [347] "MetaSVM_pred"                                         
## [348] "MetaLR_score"                                         
## [349] "MetaLR_rankscore"                                     
## [350] "MetaLR_pred"                                          
## [351] "Reliability_index"                                    
## [352] "VEST3_score"                                          
## [353] "VEST3_rankscore"                                      
## [354] "PROVEAN_score"                                        
## [355] "PROVEAN_converted_rankscore"                          
## [356] "PROVEAN_pred"                                         
## [357] "ANNOVAR_ensembl_precedent_consequence"                
## [358] "ANNOVAR_ensembl_precedent_gene"                       
## [359] "unique_variant"

Only a subset of these annotations may be necessary for a particular association test, and it is unweildy to work with all of them, so it is useful to process the WGSA output file to select fields of interest. The wgsaparsr function parse_to_file() allows field selection by name.

An additional complication in working with the WGSA output files is that some of the annotation fields are transcript-based, rather than position-based. Thus, if a variant locus is within multiple transcripts, those fields will have multiple entries (often separated by a | character). For example, annotation fields such as VEP_ensembl_Transcript_ID may have many values within a single tab-separated field.

wgsaparsr::parse_to_file() addresses this by splitting such list-fields into multiple rows. Other annotation fields for that variant are duplicated, and associated columns are filled with the same value for each transcript that a particular variant falls within. A consequence of this approach is that the processed annotation file has more lines than the WGSA output file. In freeze 4, processing expanded the annotation by a factor of about 5 - the 220 million annotations result in a 1-billion row database for subsequent aggregation.

wgsaparsr::parse_to_file() reads a snp annotation file, selects specified fields, and expands user-defined transcript-level annotation fields. It produces a tab-separated output file for subsequent analysis.

desired_columns <- 
  c(
    "`#chr`", #NOTE: backtics on #chr because it starts with special character!
    "pos",
    "ref",
    "alt",
    "rs_dbSNP147",
    # "CADDphred",
    "CADD_phred", #NOTE: different than the indel annotation file.
    "VEP_ensembl_Transcript_ID",
    "VEP_ensembl_Gene_Name",
    "VEP_ensembl_Gene_ID",
    "VEP_ensembl_Consequence",
    "VEP_ensembl_Amino_Acid_Change",
    "VEP_ensembl_LoF",
    "VEP_ensembl_LoF_filter",
    "VEP_ensembl_LoF_flags",
    "VEP_ensembl_LoF_info"
    # "1000Gp3_AF" #skipped for the workshop because code doesn't work with this variable name
    )
    
to_split <-
    c(
    "VEP_ensembl_Consequence",
    "VEP_ensembl_Transcript_ID",
    "VEP_ensembl_Gene_Name",
    "VEP_ensembl_Gene_ID",
    "VEP_ensembl_Amino_Acid_Change",
    "VEP_ensembl_LoF",
    "VEP_ensembl_LoF_filter",
    "VEP_ensembl_LoF_flags",
    "VEP_ensembl_LoF_info"
    )
  
parse_to_file("snp.tsv.gz", "parsed_snp.tsv", desired_columns, to_split, verbose = TRUE)

## [1] "Chunks: 1 Lines: <= 10000 Records in current import: 4134"

Although the output file has fewer columns than the the raw WGSA output file, this .tsv file is not particularly nice to work with in R:

readLines("parsed_snp.tsv", n=2)

## [1] "chr\tpos\tref\talt\trs_dbSNP147\tCADD_phred\tVEP_ensembl_Consequence\tVEP_ensembl_Transcript_ID\tVEP_ensembl_Gene_Name\tVEP_ensembl_Gene_ID\tVEP_ensembl_Amino_Acid_Change\tVEP_ensembl_LoF\tVEP_ensembl_LoF_filter\tVEP_ensembl_LoF_flags\tVEP_ensembl_LoF_info"
## [2] "22\t19055814\tC\tT\trs546724885\t1.834\tintron_variant\tENST00000537045\tDGCR2\tENSG00000070413\t.\t.\t.\t.\t."

However, get_fields() does work on the parsed file to view available fields:

# list all fields in an annotation file: 
get_fields("parsed_snp.tsv")

##  [1] "chr"                           "pos"                          
##  [3] "ref"                           "alt"                          
##  [5] "rs_dbSNP147"                   "CADD_phred"                   
##  [7] "VEP_ensembl_Consequence"       "VEP_ensembl_Transcript_ID"    
##  [9] "VEP_ensembl_Gene_Name"         "VEP_ensembl_Gene_ID"          
## [11] "VEP_ensembl_Amino_Acid_Change" "VEP_ensembl_LoF"              
## [13] "VEP_ensembl_LoF_filter"        "VEP_ensembl_LoF_flags"        
## [15] "VEP_ensembl_LoF_info"

The WGSA output files for indel variants differs from the output for SNPs. Some of the field names differ slightly (e.g. “CADDphred” instead of “CADD_phred”), and there are some fields of interest that include feature counts in brackets (e.g. ENCODE_Dnase_cells includes fields like 125{23}). Thus, (for now) wgsaparsr includes parse_indel_to_file(). parse_indel_to_file() is very similar to parse_to_file(), and will likely be incorporated to that function in the near future. The syntax for parse_indel_to_file() is the same as parse_to_file():

desired_columns_indel <- 
  c(
    "`#chr`", #NOTE: backtics on #chr because it starts with special character!
    "pos",
    "ref",
    "alt",
    "rs_dbSNP147",
    "CADDphred",
    #  "CADD_phred", #NOTE: different than the general annotation file.
    "VEP_ensembl_Transcript_ID",
    "VEP_ensembl_Gene_Name",
    "VEP_ensembl_Gene_ID",
    "VEP_ensembl_Consequence",
    "VEP_ensembl_Amino_Acid_Change",
    "VEP_ensembl_LoF",
    "VEP_ensembl_LoF_filter",
    "VEP_ensembl_LoF_flags",
    "VEP_ensembl_LoF_info"
    # "1000Gp3_AF"#skipped for the workshop because code doesn't work with this variable name
    )
  
parse_indel_to_file("indel.tsv.gz", "parsed_indel.tsv", desired_columns_indel, to_split, verbose = TRUE)

## [1] "Chunks: 1 Lines: <= 10000 Records in current import: 1024"

Inspection shows that the output format is the same for this function:

readLines("parsed_indel.tsv", n=2)

## [1] "chr\tpos\tref\talt\trs_dbSNP147\tCADDphred\tVEP_ensembl_Consequence\tVEP_ensembl_Transcript_ID\tVEP_ensembl_Gene_Name\tVEP_ensembl_Gene_ID\tVEP_ensembl_Amino_Acid_Change\tVEP_ensembl_LoF\tVEP_ensembl_LoF_filter\tVEP_ensembl_LoF_flags\tVEP_ensembl_LoF_info"
## [2] "22\t24215519\tC\tCGCGTGTGT\t.\t0.977\tintron_variant,non_coding_transcript_variant\tENST00000405286\tSLC2A11\tENSG00000133460\t.\t.\t.\t.\t."

Or as a list,

# list all fields in an annotation file: 
get_fields("parsed_indel.tsv")

##  [1] "chr"                           "pos"                          
##  [3] "ref"                           "alt"                          
##  [5] "rs_dbSNP147"                   "CADDphred"                    
##  [7] "VEP_ensembl_Consequence"       "VEP_ensembl_Transcript_ID"    
##  [9] "VEP_ensembl_Gene_Name"         "VEP_ensembl_Gene_ID"          
## [11] "VEP_ensembl_Amino_Acid_Change" "VEP_ensembl_LoF"              
## [13] "VEP_ensembl_LoF_filter"        "VEP_ensembl_LoF_flags"        
## [15] "VEP_ensembl_LoF_info"

If an analyst wished to filter the list of variants prior to aggregation, the processing code could be modified to apply filters during parsing, or the annotation file could be reprocessed to apply filters at this point. Alternatively, filters can also be applied subsequent to aggregation.

As insurance for this exercise, the parsed files are also available on github:

ben.workshop.path <- "https://github.com/bheavner/topmed_workshop_2017_bh/raw/master"
parsedsnpfile <- "parsed_snp.tsv"
if (!file.exists(parsedsnpfile)) download.file(file.path(ben.workshop.path, parsedsnpfile), parsedsnpfile)

parsedindelfile <- "parsed_indel.tsv"
if (!file.exists(parsedindelfile)) download.file(file.path(ben.workshop.path, parsedindelfile), parsedindelfile)

7.5.2 Defining “gene” ranges for aggregation

Aggregation requires definition of the desired aggregation units. As a relatively simple example, we will build a list of genomic ranges corresponding to genes as defined by the GENCODE Project.

The GENCODE Project’s Genomic ENCylopedia Of DNA Elements is available in the well-documented .gtf file format. Generally, .gtf files consist of 9 tab-separated fields, some of which may consist of various numbers of key:value pairs.

The DCC has begun an R package, genetable, to parse and work with .gtf files. This package is under development, and is available on github at https://github.com/UW-GAC/genetable. For now, the package can be installed from github using the devtools package:

#library(devtools)
#devtools::install_github("UW-GAC/genetable")

library(genetable)

I’ll be working with the gencode release 19 because it’s the last one on GRCh37. The file can be downloaded via https://www.gencodegenes.org/releases/19.html.

In this case, I’ve trimmed the gencode file to include only chromosome 22 feature definitions (since that’s the variant annotation set I’m using for the demo).

gtffile <- "chr22.gtf.gz"
if (!file.exists(gtffile)) download.file(file.path(ben.workshop.path, gtffile), gtffile)

gtf_source <- "chr22.gtf.gz"

The details of the genetable package are of less interest for this workshop, so we’ll just use it - we can import and tidy the .gtf file:

# import the gtf file to a tidy data frame (a tibble)
gtf <- import_gencode(gtf_source)

# look at the tibble
glimpse(gtf)

## Observations: 119,455
## Variables: 30
## $ seqname                          <chr> "chr22", "chr22", "chr22", "c...
## $ source                           <chr> "HAVANA", "HAVANA", "HAVANA",...
## $ feature                          <chr> "gene", "transcript", "exon",...
## $ start                            <int> 16062157, 16062157, 16062157,...
## $ end                              <int> 16063236, 16063236, 16062316,...
## $ score                            <chr> ".", ".", ".", ".", ".", ".",...
## $ strand                           <chr> "+", "+", "+", "+", "-", "-",...
## $ frame                            <chr> ".", ".", ".", ".", ".", ".",...
## $ gene_id                          <chr> "ENSG00000233866.1", "ENSG000...
## $ transcript_id                    <chr> "ENSG00000233866.1", "ENST000...
## $ gene_type                        <chr> "lincRNA", "lincRNA", "lincRN...
## $ gene_name                        <chr> "LA16c-4G1.3", "LA16c-4G1.3",...
## $ transcript_type                  <chr> "lincRNA", "lincRNA", "lincRN...
## $ transcript_name                  <chr> "LA16c-4G1.3", "LA16c-4G1.3-0...
## $ exon_number                      <chr> NA, NA, "1", "2", NA, NA, "1"...
## $ exon_id                          <chr> NA, NA, "ENSE00001660730.1", ...
## $ level                            <chr> "2", "2", "2", "2", "2", "2",...
## $ remap_substituted_missing_target <chr> NA, NA, NA, NA, NA, NA, NA, N...
## $ remap_target_status              <chr> NA, NA, NA, NA, NA, NA, NA, N...
## $ remap_num_mappings               <chr> NA, NA, NA, NA, NA, NA, NA, N...
## $ remap_original_location          <chr> NA, NA, NA, NA, NA, NA, NA, N...
## $ remap_original_id                <chr> NA, NA, NA, NA, NA, NA, NA, N...
## $ remap_status                     <chr> NA, NA, NA, NA, NA, NA, NA, N...
## $ transcript_support_level         <chr> NA, NA, NA, NA, NA, NA, NA, N...
## $ protein_id                       <chr> NA, NA, NA, NA, NA, NA, NA, N...
## $ havana_transcript                <chr> NA, "OTTHUMT00000276574.1", "...
## $ havana_gene                      <chr> "OTTHUMG00000140195.1", "OTTH...
## $ ccdsid                           <chr> NA, NA, NA, NA, NA, NA, NA, N...
## $ ont                              <chr> NA, NA, NA, NA, NA, "PGO:0000...
## $ tag                              <chr> NA, "basic", "basic", "basic"...

We can see that genomic features are tagged by feature type:

# summarize the number of features by tag.
summarize_tag(gtf, tag = "basic")

##                 
##                  FALSE  TRUE
##   CDS            23995 12441
##   exon           28836 16135
##   gene             184     0
##   Selenocysteine    19     6
##   start_codon     2881  1319
##   stop_codon      2235  1300
##   transcript      3901  2159
##   UTR             8512  3672

And we can use these feature type tags to filter the .gtf annotation to extract the starting and ending genomic positions for features of interest, such as features tagged “gene”:

# filter gtf file to return transcript features tagged basic
basic_transcripts <- filter_gencode(gtf, featurearg = "transcript", tagarg = "basic")

# or filter for features == "gene"
genes <- filter_gencode(gtf, featurearg = "gene")

# define the boundaries of the feature of interest
# this can be slow for complicated features
#gene_bounds <- define_boundaries(basic_transcripts, "gene_id")
gene_bounds <- define_boundaries(genes, "gene_id")

# can check the resulting tibble for sanity
glimpse(gene_bounds)

## Observations: 1,263
## Variables: 10
## $ chr             <chr> "chr22", "chr22", "chr22", "chr22", "chr22", "...
## $ strand          <chr> "+", "-", "-", "-", "+", "-", "-", "-", "+", "...
## $ gene_id         <chr> "ENSG00000233866.1", "ENSG00000229286.1", "ENS...
## $ gene_name       <chr> "LA16c-4G1.3", "LA16c-4G1.4", "LA16c-4G1.5", "...
## $ agg_start       <dbl> 16062157, 16076052, 16084249, 16100517, 161227...
## $ agg_end         <dbl> 16063236, 16076172, 16084826, 16124973, 161237...
## $ source          <chr> "HAVANA", "HAVANA", "HAVANA", "HAVANA", "HAVAN...
## $ transcript_type <chr> "lincRNA", "pseudogene", "pseudogene", "pseudo...
## $ merge_count     <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ agg_size        <dbl> 1079, 120, 577, 24456, 1048, 45025, 621, 693, ...

Finally, genetable includes a function to save a self-documented tab separated file containing the filtered .gtf results:

# save to file
note <- 'This file includes starting and ending ranges for feature = "gene" in the gtf file.'
save_to_file(gene_bounds, notes = note) # will automatically make file called feature_bounds_DATE.tsv

As insurance for this exercise, the genic range definitions that I made last week are also available on github:

ben.workshop.path <- "https://github.com/bheavner/topmed_workshop_2017_bh/raw/master"
boundsfile <- "feature_bounds_20170804.tsv"
if (!file.exists(boundsfile)) download.file(file.path(ben.workshop.path, boundsfile), boundsfile)

7.5.3 Aggregating TOPMed variants into genic units

Now we’ve generated a set of variants with a manaageable number of annotation fields, and defined the desired aggregation units as sets of genomic ranges. The set of variants may be filtered using the annotation fields we’ve chosen (our list is unfiltered in this example).

We’re ready to aggregate the variants by genic units. As insurance, we can start with the same set of inputs by downloading what I generated last week:

ben.workshop.path <- "https://github.com/bheavner/topmed_workshop_2017_bh/raw/master"

parsed_snp_file <- "parsed_snp.tsv"
parsed_indel_file <- "parsed_indel.tsv"
unit_defs_file <- "feature_bounds_20170804.tsv"

if (!file.exists(parsed_snp_file)) download.file(file.path(ben.workshop.path, parsed_snp_file), parsed_snp_file)

if (!file.exists(parsed_indel_file)) download.file(file.path(ben.workshop.path, parsed_indel_file), parsed_indel_file)

if (!file.exists(unit_defs_file)) download.file(file.path(ben.workshop.path, unit_defs_file), unit_defs_file)

Load the tab-separated files to tibbles (data frames) to work with:

snps <- read_tsv(parsed_snp_file, comment = "#")
indels <- read_tsv(parsed_indel_file, comment = "#")
unit_defs <- read_tsv(unit_defs_file, comment = "#", skip = 1)
unit_defs <- select(unit_defs, c(gene_id, agg_start, agg_end))

There’s probably a nice, fast, vectorized to accomplish this task, but for demonstration purposes, we’ll just loop over the unit_defs and select indels and snps within the genomic ranges of interest:

# make an empty tibble
foo <- tibble(group_id="", chromosome="", position="", ref="", alt="") %>%
  filter(length(group_id)>1)

# loop over unit defs
for (rowIndex in 1:nrow(unit_defs)) {

  # select snps and insert to foo ## SNPs could be filtered here
  snpsToAdd <- select(snps, c(chr, pos, ref, alt)) %>%
    dplyr::filter(between(pos, unit_defs[rowIndex,]$agg_start, unit_defs[rowIndex,]$agg_end)) %>% # This is the line to vectorize
    distinct() %>%
    mutate(group_id = unit_defs[rowIndex,]$gene_id)
  
  if (nrow(snpsToAdd) > 0) {
    foo <- add_row(
      foo,
      group_id = snpsToAdd$group_id,
      chromosome = snpsToAdd$chr,
      position = snpsToAdd$pos,
      ref = snpsToAdd$ref,
      alt = snpsToAdd$alt
    )
  }
  
  # select indels and insert to foo ## Indels could be filtered here, too
  toAdd <- select(indels, c(chr, pos, ref, alt)) %>%
    dplyr::filter(between(pos, unit_defs[rowIndex, ]$agg_start, unit_defs[rowIndex, ]$agg_end)) %>% # to vectorize
    distinct() %>%
    mutate(group_id = unit_defs[rowIndex, ]$gene_id)
  
  if (rowIndex %% 10 == 0){
    message(
      paste0("row: ", rowIndex, 
             " snps to add: ", nrow(snpsToAdd), 
             " indels to add: ", nrow(toAdd)))
  }
  
  if (nrow(toAdd) > 0) {
    foo <- add_row(
      foo,
      group_id = toAdd$group_id,
      chromosome = toAdd$chr,
      position = toAdd$pos,
      ref = toAdd$ref,
      alt = toAdd$alt
    )
  }
}

aggregated_variants <- distinct(foo)

That may not be fast or pretty, but we’ve now got a set of variants aggregated into genic units using the GENCODE gene model! This set can be saved and used with the analysis pipeline for association testing.

We can inspect the tibble with glimpse:

glimpse(aggregated_variants)

## Observations: 794
## Variables: 5
## $ group_id   <chr> "ENSG00000231565.1", "ENSG00000237299.1", "ENSG0000...
## $ chromosome <chr> "22", "22", "22", "22", "22", "22", "22", "22", "22...
## $ position   <chr> "16365024", "16425814", "16425814", "16926604", "16...
## $ ref        <chr> "A", "C", "C", "C", "C", "G", "G", "C", "G", "A", "...
## $ alt        <chr> "C", "T", "T", "T", "T", "A", "A", "T", "A", "G", "...

We can do things like counting how many genic units we’re using:

distinct(as.tibble(aggregated_variants$group_id))

## # A tibble: 270 x 1
##                 value
##                 <chr>
##  1  ENSG00000231565.1
##  2  ENSG00000237299.1
##  3  ENSG00000235759.1
##  4  ENSG00000227367.1
##  5 ENSG00000100181.17
##  6  ENSG00000172967.7
##  7  ENSG00000215568.3
##  8  ENSG00000237438.2
##  9 ENSG00000093072.11
## 10  ENSG00000241832.1
## # ... with 260 more rows

We can look at number of variants per aggregation unit:

counts <- aggregated_variants %>% group_by(group_id) %>% summarize(n())

Feel free to look at other summary statistics and do other exploratory data analysis as you’d like, but don’t forget to save it if you’d like to use it for the analysis pipeline!

save(aggregated_variants, file = "chr22_gene_aggregates.RDA")

7.5.4 Aggregate unit for association testing exercise

We will be using a slightly different gene-based aggregation unit for the assocation testing exercise. As before, this analysis uses a subset of the TOPMed SNP variants that are present in the 1000 Genomes Project. However, in this exercise, the genic units include TOPMed SNP variants from all chromosomes (no indels, and not just chromosome 22 as before). Further, each genic unit is expanded to include the set of TOPMed SNP variants falling within a GENCODE-defined gene along with 20 kb flanking regions upstream and downstream of that range.

In a larger-scale analysis of TOPMed data, aggregation units could include both TOPMed SNP and indel variants falling within defined aggregation units, and would not be restricted to the variants also present in this chosen subset of the 1000 Genomes Project. An analyst might also choose to filter variants within each unit based on various annotations (examples include loss of function, conservation, deleteriousness scores, etc.).

As before, the aggregation units are defined in an R dataframe. Each row of the dataframe specifies a variant (chromosome, position, ref, alt) and the group identifier (group_id) assigned to it. Mutiple rows with different group identifiers can be specified to assign a variant to different groups (for example a variant can be assigned to mutiple genes).

aggfile <- "variants_by_gene.RData"
if (!file.exists(aggfile)) download.file(file.path(workshop.path, aggfile), aggfile)
aggunit <- TopmedPipeline::getobj(aggfile)
names(aggunit)

## [1] "group_id"   "chromosome" "position"   "ref"        "alt"

head(aggunit)

## # A tibble: 6 x 5
##             group_id chromosome position   ref   alt
##                <chr>      <chr>    <int> <chr> <chr>
## 1  ENSG00000000005.5          X 99850725     A     G
## 2  ENSG00000000938.8          1 27960254     A     G
## 3  ENSG00000001084.6          6 53357691     C     T
## 4  ENSG00000001084.6          6 53413986     T     C
## 5  ENSG00000001084.6          6 53466979     C     T
## 6 ENSG00000001167.10          6 41064020     A     G

# an example of variant that is present in mutiple groups
library(dplyr)
mult <- aggunit %>%
    group_by(chromosome, position) %>%
    summarise(n=n()) %>%
    filter(n > 1)
inner_join(aggunit, mult[2,1:2])

## # A tibble: 4 x 5
##            group_id chromosome position   ref   alt
##               <chr>      <chr>    <int> <chr> <chr>
## 1 ENSG00000188157.9          1   985900     C     T
## 2 ENSG00000217801.5          1   985900     C     T
## 3 ENSG00000242590.1          1   985900     C     T
## 4 ENSG00000273443.1          1   985900     C     T

7.5.5 Association testing with aggregate units

We can run a burden test or SKAT on each of these units using the GENESIS function assocTestSeq. This function expects a list, where each element of the list is a dataframe representing a single aggregation unit and containing the unique variant.id assigned to each variant in a GDS file. We use the TopmedPipeline function aggregateListByAllele to quickly convert our single dataframe to the required format. This function can account for multiallelic variants (the same chromosome, position, and ref, but different alt alleles). The first argument is the GDS object returned by seqOpen (see above).

library(TopmedPipeline)
aggVarList <- aggregateListByAllele(gds, aggunit)
length(aggVarList)

## [1] 932

head(names(aggVarList))

## [1] "ENSG00000188157.9"  "ENSG00000242590.1"  "ENSG00000217801.5" 
## [4] "ENSG00000273443.1"  "ENSG00000131591.13" "ENSG00000237330.2"

aggVarList[[1]]

##   variant.id chromosome position ref nAlleles allele allele.index
## 1          1          1   970546   C        2      G            1
## 2          2          1   985900   C        2      T            1

As in the previous section, we must fit the null model before running the association test.

assoc <- assocTestSeq(seqData, nullmod, test="Burden", aggVarList=aggVarList, 
                      AF.range=c(0,0.1), weight.beta=c(1,1))
names(assoc)

## [1] "param"       "nsample"     "results"     "variantInfo"

head(assoc$results)

##                    n.site n.sample.alt burden.skew      Score         Var
## ENSG00000188157.9       2          116    2.867095 -1.3501832 0.616143773
## ENSG00000242590.1       2          116    2.867095 -1.3501832 0.616143773
## ENSG00000217801.5       1          107    3.041979 -1.2395028 0.550238011
## ENSG00000273443.1       1          107    3.041979 -1.2395028 0.550238011
## ENSG00000131591.13      1            1   33.466573 -0.2090215 0.008173804
## ENSG00000237330.2       1            1   33.466573 -0.2090215 0.008173804
##                    Score.stat Score.pval
## ENSG00000188157.9    2.958717 0.08541571
## ENSG00000242590.1    2.958717 0.08541571
## ENSG00000217801.5    2.792186 0.09472490
## ENSG00000273443.1    2.792186 0.09472490
## ENSG00000131591.13   5.345125 0.02078029
## ENSG00000237330.2    5.345125 0.02078029

head(names(assoc$variantInfo))

## [1] "ENSG00000188157.9"  "ENSG00000242590.1"  "ENSG00000217801.5" 
## [4] "ENSG00000273443.1"  "ENSG00000131591.13" "ENSG00000237330.2"

head(assoc$variantInfo[[1]])

##   variantID allele chr    pos n.obs        freq weight
## 1         1      1   1 970546  1126 0.003996448      1
## 2         2      1   1 985900  1126 0.049289520      1

qqPlot(assoc$results$Score.pval)

7.5.6 Exercise

Since we are working with a subset of the data, many of the genes listed in group_id have a very small number of variants. Create a new set of units based on position rather than gene name, using the TopmedPipeline function aggregateListByPosition. Then run SKAT using those units.

agg2 <- aggunit %>%
    mutate(chromosome=factor(chromosome, levels=c(1:22, "X"))) %>%
    select(chromosome, position) %>%
    distinct() %>%
    group_by(chromosome) %>%
    summarise(min=min(position), max=max(position))
head(agg2)

## # A tibble: 6 x 3
##   chromosome    min       max
##       <fctr>  <dbl>     <dbl>
## 1          1 970546 249149558
## 2          2 431127 242736205
## 3          3 252789 197791102
## 4          4 281442 190794470
## 5          5  49219 180633864
## 6          6 425411 170664388

aggByPos <- bind_rows(lapply(1:nrow(agg2), function(i) {
    data.frame(chromosome=agg2$chromosome[i],
               start=seq(agg2$min[i], agg2$max[i]-1e6, length.out=10),
               end=seq(agg2$min[i]+1e6, agg2$max[i], length.out=10))
})) %>%
    mutate(group_id=1:n())
head(aggByPos)

##   chromosome     start       end group_id
## 1          1    970546   1970546        1
## 2          1  28434881  29434881        2
## 3          1  55899215  56899215        3
## 4          1  83363550  84363550        4
## 5          1 110827885 111827885        5
## 6          1 138292219 139292219        6

aggVarList <- aggregateListByPosition(gds, aggByPos)
aggVarList[1:2]

## $`1`
##   variant.id chromosome position ref nAlleles allele allele.index
## 1          1          1   970546   C        2      G            1
## 2          2          1   985900   C        2      T            1
## 3          3          1  1025045   C        2      T            1
## 4          4          1  1265550   C        2      T            1
## 5          5          1  1472676   T        2      C            1
## 6          6          1  1735725   G        2      A            1
## 
## $`2`
##   variant.id chromosome position  ref nAlleles allele allele.index
## 1        152          1 28527425    C        2      T            1
## 2        153          1 28761663    C        2      G            1
## 3        154          1 28844055 CTTG        2      C            1
## 4        155          1 29149880    T        2      C            1

assoc <- assocTestSeq(seqData, nullmod, test="SKAT", aggVarList=aggVarList,
                      AF.range=c(0,0.1), weight.beta=c(1,25))
head(assoc$results)

##   n.site n.sample.alt        Q_0    pval_0 err_0
## 1      6          138 158.442480 0.3946523     0
## 2      4          137  41.551621 0.6595948     0
## 3      2           32  18.326551 0.6300152     0
## 4      5           84 250.498985 0.2346494     0
## 5      1            1   2.317198 0.4888162     0
## 7      4           81 123.961234 0.3469854     0

seqClose(gds)