We have uploaded GIANT Genomic Summary Results (GSR) to this workspace. In this notebook, we prepare metadata describing the analysis for import into the AnVIL workspace data tables.
# get the latest version of AnvilDataModels from github
#remotes::install_github("UW-GAC/AnvilDataModels")
library(AnVIL)
library(AnvilDataModels)
library(tidyverse)
Metadata describing the analysis is stored in the association_analysis table. We save this as a set of “field” and “value” pairs for input to the workflow that assigns a unique identifier for each analysis.
fields <- list(
gsr_source = "https://portals.broadinstitute.org/collaboration/giant/index.php/GIANT_consortium_data_files",
pubmed_id = "29273807",
first_author = "V Turcot",
gsr_source_url = "https://doi.org/10.1038%2Fs41588-017-0011-x",
consent_code = "NRES",
upload_date = "2023-01-03",
contributor_contact = "sdmorris@uw.edu",
trait = "BMI",
trait_type = "quantitative",
trait_unit = "kg / m^2",
trait_transformation = "inverse normal",
trait_definition = "weight / height",
covariates = "age | age^2 | ancestry PCs",
concept_id = "3038553",
reference_assembly = "GRCh37",
n_variants = "246328",
genotyping_technology = "exome array",
genotyping_platform = "Illumina",
is_imputed = "FALSE",
n_samp = "526508",
n_effective = "526508",
age_min = "18",
cohorts = "GIANT",
is_meta_analysis = "TRUE",
population_descriptor = "reported ancestry",
population_labels = "European | South Asian | African | East Asian | Hispanic",
population_proportions = "85 | 6 | 5 | 2 | 2",
countries_of_recruitment = "Australia | Bangladesh | China | Denmark | Estonia | Finland | Germany | Greece | Iceland | Ireland | Italy | Netherlands | Norway | Philippines | Sweden | Taiwan | UK | USA",
analysis_method = "score-statistics-based association analysis",
analysis_software = "RAREMETALWORKER | RVTEST"
)
analysis <- tibble(field=names(fields),
value=unlist(fields))
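As an optional sanity check, we can preview the resulting field/value table before moving on:
head(analysis)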
The GIANT data is distributed as a single file per analysis. We read in the data, update the column names so they conform to the PRIMED data model, and write each chromosome to a separate file.
bucket <- avbucket()
bmi_file <- "BMI_All_ancestry.txt"
gsutil_cp(file.path(bucket, bmi_file), ".")
Copying gs://fc-19bbd8d9-a925-4192-b60e-f5966e7d397c/BMI_All_ancestry.txt...
[1/1 files][ 14.9 MiB/ 14.9 MiB] 100% Done
Operation completed over 1 objects/14.9 MiB.
dat <- read_tsv(bmi_file)
Rows: 246328 Columns: 10
── Column specification ──────────────────────────────────────────────────
Delimiter: "\t"
chr (6): CHR, REF, ALT, SNPNAME, GMAF, ExAC_MAF
dbl (4): POS, beta, se, Pvalue
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(dat)
dat2 <- dat %>%
rename(chromosome = CHR,
position = POS,
rsID = SNPNAME,
ref_allele = REF,
alt_allele = ALT,
p_value = Pvalue) %>%
mutate(strand = "+",
effect_allele = ref_allele,
other_allele = alt_allele,
effect_allele_freq = NA,
n_samp = NA,
is_imputed = FALSE,
direction_of_effect = NA,
heterogeneity_p_value = NA,
heterogeneity_I2 = NA)
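Before writing any files, it can be useful to confirm that the renamed columns include the fields we expect. The names below are an illustrative subset, not the full set of columns required by the PRIMED GSR data model:
# optional sanity check; `expected_cols` is an illustrative subset of the data model columns
expected_cols <- c("chromosome", "position", "ref_allele", "alt_allele",
                   "effect_allele", "other_allele", "p_value")
stopifnot(all(expected_cols %in% names(dat2)))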
Files are linked to analyses. The md5 hash of each file is used to generate the primary key for the ‘file’ table. The md5 should be computed before uploading to the workspace and checked after upload to make sure the upload was successful.
chrs <- unique(dat2$chromosome)
file_table <- lapply(chrs, function(c) {
# select only this chromosome
dat_chr <- filter(dat2, chromosome == c)
chr_file <- paste0("BMI_All_ancestry_chr", c, ".txt")
# write tsv file
write_tsv(dat_chr, file=chr_file, na="")
# get md5sum for file table
md5 <- tools::md5sum(chr_file)
# copy file to google bucket
gsutil_cp(chr_file, bucket)
## return row for file table
tibble(
md5sum = md5,
n_variants = nrow(dat_chr),
file_path = file.path(bucket, chr_file),
file_type = "data",
chromosome = c
)
}) %>%
    # combine the per-chromosome rows into a single file table
    bind_rows()
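As a lightweight check, we can confirm that the per-chromosome files together cover all variants, and that at least one upload round-trips with the same md5 (full md5 verification is handled by the validation workflow described below). The local copy name used for the re-download is illustrative:
# per-chromosome variant counts should sum to the original total
stopifnot(sum(file_table$n_variants) == nrow(dat2))
# re-download one file and confirm its md5 matches the value recorded in file_table
check_chr <- chrs[1]
check_file <- paste0("BMI_All_ancestry_chr", check_chr, ".txt")
gsutil_cp(file.path(bucket, check_file), "md5_check.txt")  # local copy name is illustrative
stopifnot(tools::md5sum("md5_check.txt") == file_table$md5sum[file_table$chromosome == check_chr])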
To check the tables using a workflow, we must write them as files to the workspace bucket.
outfile <- "GIANT_BMI_file_table.tsv"
write_tsv(file_table, outfile)
gsutil_cp(outfile, bucket)
outfile <- "GIANT_BMI_analysis_table.tsv"
write_tsv(analysis, outfile)
gsutil_cp(outfile, bucket)
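Optionally, we can list the bucket contents to confirm that the table files (and the per-chromosome data files) are in place:
gsutil_ls(bucket)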
Once all tables have been created, we can check that they conform to the data model. This is most easily accomplished by providing the paths to the tables in TSV format as input to the validate_gsr_model workflow. This workflow will also check that md5sums in the association_file table match the files in the Google bucket, and that all data files match the data dictionary.