13 Annotation - Solutions
Since we are working with a subset of the data, many of the genes listed in group_id
have a very small number of variants. Create a new set of units based on position rather than gene name, using the TopmedPipeline function aggregateGRanges
. Then run SKAT using those units and a SeqVarRangeIterator
.
minp <- min(aggunit1$pos)
maxp <- max(aggunit1$pos)
aggByPos <- data.frame(chr=1,
start=seq(minp, maxp-1e6, length.out=10),
end=seq(minp+1e6, maxp, length.out=10))
aggByPos$group_id <- 1:nrow(aggByPos)
head(aggByPos)
## chr start end group_id
## 1 1 1025045 2025045 1
## 2 1 28440219 29440219 2
## 3 1 55855393 56855393 3
## 4 1 83270568 84270568 4
## 5 1 110685742 111685742 5
## 6 1 138100916 139100916 6
aggVarList <- aggregateGRanges(aggByPos)
aggVarList
## GRanges object with 10 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## 1 1 1025045-2025045 *
## 2 1 28440219-29440219 *
## 3 1 55855393-56855393 *
## 4 1 83270567-84270567 *
## 5 1 110685741-111685741 *
## 6 1 138100916-139100916 *
## 7 1 165516090-166516090 *
## 8 1 192931264-193931264 *
## 9 1 220346438-221346438 *
## 10 1 247761613-248761613 *
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
seqResetFilter(seqData, verbose=FALSE)
iterator <- SeqVarRangeIterator(seqData, variantRanges=aggVarList, verbose=FALSE)
assoc <- assocTestAggregate(iterator, nullmod, test="SKAT", AF.max=0.1, weight.beta=c(1,25))
## # of selected samples: 100
head(assoc$results)
## n.site n.alt n.sample.alt Q_0 pval_0 err_0
## 1 1 1 1 3.1746130 0.3545554 0
## 2 3 11 10 4.0726476 0.8026375 0
## 3 1 5 5 1.6067817 0.5885020 0
## 4 2 3 3 0.8778105 0.8930184 0
## 5 0 0 0 NA NA NA
## 6 0 0 0 NA NA NA
head(assoc$variantInfo)
## $`1`
## variant.id chr pos allele.index n.obs freq weight
## 3 5 1 1472676 1 100 0.005 22.16634
##
## $`2`
## variant.id chr pos allele.index n.obs freq weight
## 1 152 1 28527425 1 100 0.010 19.641954
## 3 154 1 28844055 1 100 0.005 22.166338
## 4 155 1 29149880 1 100 0.040 9.385331
##
## $`3`
## variant.id chr pos allele.index n.obs freq weight
## 1 273 1 56084960 1 100 0.025 13.61604
##
## $`4`
## variant.id chr pos allele.index n.obs freq weight
## 3 424 1 83792600 1 100 0.010 19.64195
## 5 426 1 84152287 1 100 0.005 22.16634
##
## $`5`
## [1] variant.id chr pos n.obs freq weight
## <0 rows> (or 0-length row.names)
##
## $`6`
## [1] variant.id chr pos n.obs freq weight
## <0 rows> (or 0-length row.names)
seqClose(gds)