11 Variant annotation

11.1 Using Bioconductor annotation resources

In this example, we illustrate defining aggregate units based on known genes.

First, we load the null mixed model and open the GDS file.

modfile <- "data/null_mixed_model.RData"
nullmod <- TopmedPipeline::getobj(modfile)

sampfile <- "data/sample_phenotype_annotation.RData"
annot <- TopmedPipeline::getobj(sampfile)

gdsfile <- "data/1KG_phase3_subset_chr1.gds"
library(SeqVarTools)
gds <- seqOpen(gdsfile)
seqData <- SeqVarData(gds, sampleData=annot)

We use the human genome annotation from Bioconductor to identify genes.

library(GenomicRanges)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)

# return the variants in seqData as a GRanges object
gr <- granges(gds)
gr
## GRanges object with 1120 ranges and 0 metadata columns:
##        seqnames              ranges strand
##           <Rle>           <IRanges>  <Rle>
##      1        1              970546      *
##      2        1              985900      *
##      3        1             1025045      *
##      4        1             1265550      *
##      5        1             1472676      *
##    ...      ...                 ...    ...
##   1116        1           248715186      *
##   1117        1 248715606-248715610      *
##   1118        1           248761613      *
##   1119        1           248894546      *
##   1120        1           249149558      *
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths
# find variants that overlap with each gene
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
gr <- renameSeqlevels(gr, paste0("chr", seqlevels(gr)))
ts <- transcriptsByOverlaps(txdb, gr, columns="GENEID")
# simplistic example - define genes as overlapping transcripts
genes <- reduce(ts)
genes <- renameSeqlevels(genes, sub("chr", "", seqlevels(genes)))
genes
## GRanges object with 384 ranges and 0 metadata columns:
##         seqnames              ranges strand
##            <Rle>           <IRanges>  <Rle>
##     [1]        1       955503-991499      +
##     [2]        1     2160134-2241652      +
##     [3]        1     2985742-3355185      +
##     [4]        1     6484848-6521004      +
##     [5]        1     6845384-7829766      +
##     ...      ...                 ...    ...
##   [380]        1 245912642-246670644      -
##   [381]        1 246703863-246729565      -
##   [382]        1 247108849-247242115      -
##   [383]        1 247463622-247495045      -
##   [384]        1 249144203-249153315      -
##   -------
##   seqinfo: 93 sequences (1 circular) from hg19 genome

We run a burden test, setting a maximum alternate allele frequency to exclude common variants.

# create an iterator where each successive unit is a different gene
iterator <- SeqVarRangeIterator(seqData, variantRanges=genes, verbose=FALSE)

# do a burden test on the rare variants in each gene
assoc <- assocTestAggregate(iterator, nullmod, AF.max=0.05, test="Burden")
## # of selected samples: 100
head(assoc$results)
##   n.site n.alt n.sample.alt        Score   Score.SE Score.Stat Score.pval
## 1      2    12           12 -0.307814558 0.26357660 -1.1678372 0.24287243
## 2      1     1            1  0.024908204 0.07345671  0.3390868 0.73454432
## 3      1     2            2  0.200506079 0.11801693  1.6989604 0.08932665
## 4      2     4            4 -0.005743665 0.16583076 -0.0346357 0.97237023
## 5      1     8            8  0.029783393 0.21038637  0.1415652 0.88742344
## 6      0     0            0           NA         NA         NA         NA
##          Est    Est.SE          PVE
## 1 -4.4307319  3.793964 1.515382e-02
## 2  4.6161448 13.613460 1.277554e-03
## 3 14.3959044  8.473361 3.207185e-02
## 4 -0.2088617  6.030244 1.332924e-05
## 5  0.6728821  4.753160 2.226746e-04
## 6         NA        NA           NA
head(assoc$variantInfo)
## [[1]]
##   variant.id chr    pos allele.index n.obs  freq MAC weight
## 1          1   1 970546            1   100 0.015   3      1
## 2          2   1 985900            1   100 0.045   9      1
## 
## [[2]]
##   variant.id chr     pos allele.index n.obs  freq MAC weight
## 1          7   1 2185887            1   100 0.005   1      1
## 
## [[3]]
##   variant.id chr     pos allele.index n.obs freq MAC weight
## 2         15   1 3293503            1   100 0.01   2      1
## 
## [[4]]
##   variant.id chr     pos allele.index n.obs freq MAC weight
## 1         36   1 6489967            1   100 0.01   2      1
## 2         37   1 6503405            1   100 0.01   2      1
## 
## [[5]]
##   variant.id chr     pos allele.index n.obs freq MAC weight
## 2         39   1 7056029            1   100 0.04   8      1
## 
## [[6]]
## [1] variant.id   chr          pos          allele.index n.obs       
## [6] freq         MAC          weight      
## <0 rows> (or 0-length row.names)

11.2 Aggregating and filtering variants using annotation

Alternatively, we may want to import annotation from other software, such as ANNOVAR or WGSA. The output formats of variant annotation software can be quite complex, but for this exercise we use fairly simple tab-separated text files.

library(dplyr)
snv_annotation <- read.table("data/snv_parsed.tsv", sep="\t", na.strings=".", header=TRUE, as.is=TRUE)
indel_annotation <- read.table("data/indel_parsed.tsv", sep="\t", na.strings=".", header=TRUE, as.is=TRUE)
combined_annotation <- bind_rows(snv_annotation, indel_annotation)

Here we remove variants that are not associated with a gene, group the variants by gene, and filter the variants for intron_variants with a CADD_phred score greater than 3 in just a few lines of code:

combined_annotation %>% 
  filter(VEP_ensembl_Gene_ID != ".") %>% # remove variants not annotated with a Gene_ID
  group_by(VEP_ensembl_Gene_ID) %>% # aggregate by gene
  filter(CADD_phred > 3) %>% # filter variants to keep only CADD_phred greater than 3
  filter(stringr::str_detect(VEP_ensembl_Consequence, "intron_variant")) %>% # keep intron variants
  glimpse() # view the result - 592 variants
## Rows: 592
## Columns: 7
## Groups: VEP_ensembl_Gene_ID [170]
## $ CHROM                   <int> 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 2…
## $ POS                     <int> 15699830, 15699830, 16437047, 16445862, 16813…
## $ REF                     <chr> "G", "G", "G", "C", "C", "G", "G", "G", "G", …
## $ ALT                     <chr> "A", "A", "A", "A", "T", "A", "A", "A", "A", …
## $ VEP_ensembl_Gene_ID     <chr> "ENSG00000198062", "ENSG00000198062", "ENSG00…
## $ VEP_ensembl_Consequence <chr> "intron_variant,NMD_transcript_variant", "int…
## $ CADD_phred              <dbl> 3.612, 3.612, 9.729, 3.895, 7.530, 5.332, 5.3…

Now that you’ve got a set of variants that you can aggregate into genic units, the data needs to be reformatted for input to the GENESIS analysis pipeline. The input to the GENESIS pipeline is a data frame with variables called group_id, chr, pos, ref, and alt. Prepare this data frame and save it for testing (You do not need to filter the variants for this exercise):

aggregates <-
  combined_annotation %>%
  filter(VEP_ensembl_Gene_ID != ".") %>% # remove variants not annotated with a Gene_ID
  group_by(VEP_ensembl_Gene_ID) %>% # aggregate by gene
  dplyr::select(group_id = VEP_ensembl_Gene_ID,
         chr = CHROM,
         pos = POS,
         ref = REF,
         alt = ALT) %>%
  glimpse # inspect the tibble
## Rows: 2,603
## Columns: 5
## Groups: group_id [598]
## $ group_id <chr> "ENSG00000230643", "ENSG00000226474", "ENSG00000231565", "EN…
## $ chr      <int> 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, …
## $ pos      <int> 15589963, 15613723, 15613723, 15628559, 15699830, 15699830, …
## $ ref      <chr> "G", "A", "A", "C", "G", "G", "G", "G", "G", "G", "G", "G", …
## $ alt      <chr> "T", "G", "G", "T", "A", "A", "A", "A", "T", "T", "T", "T", …

You can also compute some summary information about these aggregates, such as counting how many genic units we’re using:

length(unique(aggregates$group_id))
## [1] 598

We can look at the distribution of the number of variants per aggregation unit:

counts <- aggregates %>% group_by(group_id) %>% summarize(n = n())
ggplot(counts, aes(x = n)) + geom_bar()