13 Annotation - Solutions

Since we are working with a subset of the data, many of the genes listed in group_id have a very small number of variants. Create a new set of units based on position rather than gene name, using the TopmedPipeline function aggregateGRanges. Then run SKAT using those units and a SeqVarRangeIterator.

minp <- min(aggunit1$pos)
maxp <- max(aggunit1$pos)
aggByPos <- data.frame(chr=1,
               start=seq(minp, maxp-1e6, length.out=10),
               end=seq(minp+1e6, maxp, length.out=10))
aggByPos$group_id <- 1:nrow(aggByPos)

head(aggByPos)

##   chr     start       end group_id
## 1   1   1025045   2025045        1
## 2   1  28440219  29440219        2
## 3   1  55855393  56855393        3
## 4   1  83270568  84270568        4
## 5   1 110685742 111685742        5
## 6   1 138100916 139100916        6

aggVarList <- aggregateGRanges(aggByPos)
aggVarList

## GRanges object with 10 ranges and 0 metadata columns:
##      seqnames              ranges strand
##         <Rle>           <IRanges>  <Rle>
##    1        1     1025045-2025045      *
##    2        1   28440219-29440219      *
##    3        1   55855393-56855393      *
##    4        1   83270567-84270567      *
##    5        1 110685741-111685741      *
##    6        1 138100916-139100916      *
##    7        1 165516090-166516090      *
##    8        1 192931264-193931264      *
##    9        1 220346438-221346438      *
##   10        1 247761613-248761613      *
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths

seqResetFilter(seqData, verbose=FALSE)
iterator <- SeqVarRangeIterator(seqData, variantRanges=aggVarList, verbose=FALSE)
assoc <- assocTestAggregate(iterator, nullmod, test="SKAT", AF.max=0.1, weight.beta=c(1,25))

## # of selected samples: 100

head(assoc$results)

##   n.site n.alt n.sample.alt         Q      pval err pval.method
## 1      1     1            1 3.1746150 0.3545563   0 integration
## 2      3    11           10 4.0727682 0.8022399   0 saddlepoint
## 3      1     5            5 1.6067904 0.5884993   0 integration
## 4      2     3            3 0.8777963 0.8930198   0 saddlepoint
## 5      0     0            0        NA        NA  NA        <NA>
## 6      0     0            0        NA        NA  NA        <NA>

head(assoc$variantInfo)

## $`1`
##   variant.id chr     pos allele.index n.obs  freq MAC   weight
## 3          5   1 1472676            1   100 0.005   1 22.16634
## 
## $`2`
##   variant.id chr      pos allele.index n.obs  freq MAC    weight
## 1        152   1 28527425            1   100 0.010   2 19.641954
## 3        154   1 28844055            1   100 0.005   1 22.166338
## 4        155   1 29149880            1   100 0.040   8  9.385331
## 
## $`3`
##   variant.id chr      pos allele.index n.obs  freq MAC   weight
## 1        273   1 56084960            1   100 0.025   5 13.61604
## 
## $`4`
##   variant.id chr      pos allele.index n.obs  freq MAC   weight
## 3        424   1 83792600            1   100 0.010   2 19.64195
## 5        426   1 84152287            1   100 0.005   1 22.16634
## 
## $`5`
## [1] variant.id chr        pos        n.obs      freq       MAC        weight    
## <0 rows> (or 0-length row.names)
## 
## $`6`
## [1] variant.id chr        pos        n.obs      freq       MAC        weight    
## <0 rows> (or 0-length row.names)

seqClose(gds)