This notebook demonstrates how to prepare a phenotype file for use
with the primed_genesis_gwas
workflow.
Load R libraries:
library(AnVIL)
library(dplyr)
library(readr)
library(tidyr)
library(stringr)
Phenotype file paths are stored in the “phenotype_harmonized” data
table in a PRIMED workspace. We use the avtable
function to
read this table, and find the file paths for the phenotype files we need
for this analysis. In this example, we are using a simulated LDL
phenotype found in the “pilot” phenotype file.
We also need the population_descriptor phenotype file, as we will use
“population” as a covariate in the analysis. For each table, we copy the
file from the Google bucket to our compute instance and read it into
R.
# Workspace data table holding the harmonized phenotype file paths; the
# function below reads its `domain` and `file_path` columns.
pheno_tables <- avtable("phenotype_harmonized")

# Copy the phenotype file registered for one domain into the working
# directory and read it as TSV.
#
# table_name: value matched against the `domain` column of `pheno_tables`.
# Returns the data frame produced by read_tsv().
# Stops with an explicit error when no row matches `table_name`.
read_phen_table <- function(table_name) {
  phen_file <- pheno_tables %>%
    # `!!` forces the function argument, so a column that happens to be
    # called `table_name` cannot mask it.
    filter(domain == !!table_name) %>%
    pull(file_path)
  if (length(phen_file) == 0) {
    stop(
      "no phenotype file with domain '", table_name,
      "' found in the phenotype_harmonized table",
      call. = FALSE
    )
  }
  gsutil_cp(phen_file, ".")
  # The copy lands in the working directory under the file's base name.
  read_tsv(basename(phen_file))
}
# Subject table from the workspace; supplies reported_sex for the GWAS.
subj <- avtable("subject")
# Phenotype tables used further down: the "pilot" file carries the simulated
# LDL outcome, and population_descriptor supplies the "population" covariate.
phen <- read_phen_table("pilot")
pop <- read_phen_table("population_descriptor")
The population_descriptor data
model allows multiple descriptors for each subject, stored as
“|”-delimited strings. Below is a function that extracts a single descriptor and
its corresponding labels from the table.
# Extract a single population descriptor and its labels.
#
# pop_desc_table: data frame with subject_id, population_descriptor and
#   population_label columns; the "population*" columns may hold several
#   "|"-delimited entries per row.
# descriptor: name of the descriptor to keep; after pivoting it must be one
#   of the resulting column names, otherwise all_of() raises an error.
# Returns a data frame with subject_id and one column named after
#   `descriptor` that holds the matching labels.
select_pop_desc <- function(pop_desc_table, descriptor) {
  pop_desc_table %>%
    # One row per descriptor/label pair.
    separate_longer_delim(starts_with("population"), delim = "|") %>%
    # Strip whitespace left around the delimiter.
    mutate(across(starts_with("population"), str_trim)) %>%
    # One column per descriptor, filled with its label.
    pivot_wider(
      names_from = population_descriptor,
      values_from = population_label
    ) %>%
    select(subject_id, all_of(descriptor))
}
We select the outcome and covariates we plan to use in the GWAS. The
GENESIS workflow expects a column called ‘sex’ with values ‘M’ and ‘F’,
so these values must be mapped from the PRIMED data model.
# Assemble the outcome (ldl) and covariates (sex, age_at_observation,
# population) for the GWAS, one row per subject.
phen_gwas <- subj %>%
  # Map reported_sex onto the 'M'/'F' coding; any other value becomes NA.
  # as.character() keeps the lookup name-based even if the column were a
  # factor, and unname() stops the lookup names being carried into the column.
  mutate(
    sex = unname(c("Male" = "M", "Female" = "F")[as.character(reported_sex)])
  ) %>%
  # NOTE(review): natural join on every column shared by subj and phen;
  # presumably just subject_id -- confirm before pinning `by` here.
  inner_join(phen) %>%
  select(subject_id, sex, age_at_observation, ldl) %>%
  # subject_id is the only column the two sides share at this point.
  inner_join(select_pop_desc(pop, "population"), by = "subject_id")
In the PRIMED data model, the sample table links subject identifiers
(subject_id) to the sample identifiers in the genotype files
(sample_id). The GENESIS workflow requires sample_id to match phenotypes
to genotypes.
# Sample table linking sample_id to subject_id.
sample <- avtable("sample")

# Attach sample_id to the phenotype rows. The key is explicit: subject_id is
# the only column shared by the two inputs. Each sample row with a matching
# subject yields one output row.
phen_gwas_samples <- sample %>%
  select(sample_id, subject_id) %>%
  inner_join(phen_gwas, by = "subject_id")
The GENESIS workflow expects a phenotype file in CSV format. We save
the phenotype data table as CSV and copy it to the workspace bucket.
# Name of the CSV written locally and reused as the object name in the bucket.
outfile <- "phenotypes_ldl_gwas.csv"

# Destination: a "GWAS" folder under the workspace bucket.
bucket_path <- avbucket() %>%
  file.path("GWAS", outfile)

# Write the phenotype table to disk, then copy that file to the bucket.
write_csv(phen_gwas_samples, outfile)
gsutil_cp(outfile, bucket_path)
LS0tCnRpdGxlOiAiUHJlcGFyZSBwaGVub3R5cGUgZmlsZSBmb3IgR1dBUyIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKVGhpcyBub3RlYm9vayBkZW1vbnN0cmF0ZXMgaG93IHRvIHByZXBhcmUgYSBwaGVub3R5cGUgZmlsZSBmb3IgdXNlIHdpdGggdGhlCltwcmltZWRfZ2VuZXNpc19nd2FzXShodHRwczovL2dpdGh1Yi5jb20vVVctR0FDL3ByaW1lZC1nZW5lc2lzLWd3YXMpIHdvcmtmbG93LiAKCkxvYWQgUiBsaWJyYXJpZXM6CgpgYGB7cn0KbGlicmFyeShBblZJTCkKbGlicmFyeShkcGx5cikKbGlicmFyeShyZWFkcikKbGlicmFyeSh0aWR5cikKbGlicmFyeShzdHJpbmdyKQpgYGAKClBoZW5vdHlwZSBmaWxlIHBhdGhzIGFyZSBzdG9yZWQgaW4gdGhlICJwaGVub3R5cGVfaGFybW9uaXplZCIgZGF0YSB0YWJsZSBpbiBhIApQUklNRUQgd29ya3NwYWNlLiBXZSB1c2UgdGhlIGBhdnRhYmxlYCBmdW5jdGlvbiB0byByZWFkIHRoaXMgdGFibGUsIGFuZCBmaW5kIAp0aGUgZmlsZSBwYXRocyBmb3IgdGhlIHBoZW5vdHlwZSBmaWxlcyB3ZSBuZWVkIGZvciB0aGlzIGFuYWx5c2lzLiBJbiB0aGlzIGV4YW1wbGUsCndlIGFyZSB1c2luZyBhIHNpbXVsYXRlZCBMREwgcGhlbm90eXBlIGZvdW5kIGluIHRoZSAicGlsb3QiIHBoZW5vdHlwZSBmaWxlLgoKV2UgYWxzbyBuZWVkIHRoZSBwb3B1bGF0aW9uX2Rlc2NyaXB0b3IgcGhlbm90eXBlIGZpbGUsIGFzIHdlIHdpbGwgdXNlICJwb3B1bGF0aW9uIgphcyBhIGNvdmFyaWF0ZSBpbiB0aGUgYW5hbHlzaXMuIEZvciBlYWNoIHRhYmxlLCB3ZSBjb3B5IHRoZSBmaWxlIGZyb20gdGhlIGdvb2dsZQpidWNrZXQgdG8gb3VyIGNvbXB1dGUgaW5zdGFuY2UgYW5kIHJlYWQgaXQgaW50byBSLgoKYGBge3J9CnBoZW5vX3RhYmxlcyA8LSBhdnRhYmxlKCJwaGVub3R5cGVfaGFybW9uaXplZCIpCnJlYWRfcGhlbl90YWJsZSA8LSBmdW5jdGlvbih0YWJsZV9uYW1lKSB7CiAgcGhlbl9maWxlIDwtIHBoZW5vX3RhYmxlcyAlPiUKICAgIGZpbHRlcihkb21haW4gPT0gdGFibGVfbmFtZSkgJT4lCiAgICBzZWxlY3QoZmlsZV9wYXRoKSAlPiUKICAgIHVubGlzdCgpCiAgZ3N1dGlsX2NwKHBoZW5fZmlsZSwgIi4iKQogIHJlYWRfdHN2KGJhc2VuYW1lKHBoZW5fZmlsZSkpCn0KYGBgCgpgYGB7cn0Kc3ViaiA8LSBhdnRhYmxlKCJzdWJqZWN0IikKcGhlbiA8LSByZWFkX3BoZW5fdGFibGUoInBpbG90IikKcG9wIDwtIHJlYWRfcGhlbl90YWJsZSgicG9wdWxhdGlvbl9kZXNjcmlwdG9yIikKYGBgCgpUaGUgcG9wdWxhdGlvbl9kZXNjcmlwdG9yIFtkYXRhIG1vZGVsXShodHRwczovL2dpdGh1Yi5jb20vVVctR0FDL3ByaW1lZF9kYXRhX21vZGVscy9ibG9iL21haW4vUFJJTUVEX3BoZW5vdHlwZV9kYXRhX21vZGVsLmpzb24jTDU4KQphbGxvd3MgbXVsdGlwbGUgZGVzY3JpcHRvcnMgZm9yIGVhY2ggc3ViamVjdCwgd2l0aCBhIGRlbGltaXRlci4gQmVsb3cgaXMgYSBmdW5jdGlvbgp0aGF0IGV4dHJhY3RzIGEgc2luZ2xl
IGRlc2NyaXB0b3IgYW5kIGNvcnJlc3BvbmRpbmcgbGFiZWxzIGZyb20gdGhlIHRhYmxlLgoKYGBge3J9CnNlbGVjdF9wb3BfZGVzYyA8LSBmdW5jdGlvbihwb3BfZGVzY190YWJsZSwgZGVzY3JpcHRvcikgewogIGRhdCA8LSBwb3BfZGVzY190YWJsZSAlPiUKICAgIHNlcGFyYXRlX2xvbmdlcl9kZWxpbShzdGFydHNfd2l0aCgicG9wdWxhdGlvbiIpLCBkZWxpbT0ifCIpICU+JQogICAgbXV0YXRlKGFjcm9zcyhzdGFydHNfd2l0aCgicG9wdWxhdGlvbiIpLCBzdHJfdHJpbSkpICU+JQogICAgcGl2b3Rfd2lkZXIobmFtZXNfZnJvbT1wb3B1bGF0aW9uX2Rlc2NyaXB0b3IsIHZhbHVlc19mcm9tPXBvcHVsYXRpb25fbGFiZWwpICU+JQogICAgc2VsZWN0KHN1YmplY3RfaWQsIGFsbF9vZihkZXNjcmlwdG9yKSkKfQpgYGAKCldlIHNlbGVjdCB0aGUgb3V0Y29tZSBhbmQgY292YXJpYXRlcyB3ZSBwbGFuIHRvIHVzZSBpbiB0aGUgR1dBUy4gVGhlIEdFTkVTSVMgd29ya2Zsb3cKZXhwZWN0cyBhIGNvbHVtbiBjYWxsZWQgJ3NleCcgd2l0aCB2YWx1ZXMgJ00nIGFuZCAnRicsIHNvIHRoZXNlIHZhbHVlcyBtdXN0IGJlIAptYXBwZWQgZnJvbSB0aGUgUFJJTUVEIGRhdGEgbW9kZWwuCgpgYGB7cn0KcGhlbl9nd2FzIDwtIHN1YmogJT4lCiAgbXV0YXRlKHNleD1jKCJNYWxlIj0iTSIsICJGZW1hbGUiPSJGIilbcmVwb3J0ZWRfc2V4XSkgJT4lCiAgaW5uZXJfam9pbihwaGVuKSAlPiUKICBzZWxlY3Qoc3ViamVjdF9pZCwgc2V4LCBhZ2VfYXRfb2JzZXJ2YXRpb24sIGxkbCkgJT4lCiAgaW5uZXJfam9pbihzZWxlY3RfcG9wX2Rlc2MocG9wLCAicG9wdWxhdGlvbiIpKQpgYGAKCkluIHRoZSBQUklNRUQgZGF0YSBtb2RlbCwgdGhlIHNhbXBsZSB0YWJsZSBsaW5rcyBzdWJqZWN0IGlkZW50aWZpZXJzIChzdWJqZWN0X2lkKQp0byB0aGUgc2FtcGxlIGlkZW50aWZpZXJzIGluIHRoZSBnZW5vdHlwZSBmaWxlcyAoc2FtcGxlX2lkKS4gVGhlIEdFTkVTSVMgd29ya2Zsb3cKcmVxdWlyZXMgc2FtcGxlX2lkIHRvIG1hdGNoIHBoZW5vdHlwZXMgdG8gZ2Vub3R5cGVzLgoKYGBge3J9CnNhbXBsZSA8LSBhdnRhYmxlKCJzYW1wbGUiKQpgYGAKCmBgYHtyfQpwaGVuX2d3YXNfc2FtcGxlcyA8LSBzYW1wbGUgJT4lCiAgc2VsZWN0KHNhbXBsZV9pZCwgc3ViamVjdF9pZCkgJT4lCiAgaW5uZXJfam9pbihwaGVuX2d3YXMpCmBgYAoKVGhlIEdFTkVTSVMgd29ya2Zsb3cgZXhwZWN0cyBhIHBoZW5vdHlwZSBmaWxlIGluIENTViBmb3JtYXQuIFdlIHNhdmUgdGhlCnBoZW5vdHlwZSBkYXRhIHRhYmxlIGFzIENTViBhbmQgY29weSBpdCB0byB0aGUgd29ya3NwYWNlIGJ1Y2tldC4KCmBgYHtyfQpvdXRmaWxlIDwtICJwaGVub3R5cGVzX2xkbF9nd2FzLmNzdiIKYnVja2V0X3BhdGggPC0gZmlsZS5wYXRoKGF2YnVja2V0KCksICJHV0FTIiwgb3V0ZmlsZSkKd3JpdGVfY3N2KHBoZW5fZ3dhc19zYW1wbGVzLCBvdXRmaWxlKQpnc3V0aWxfY3Aob3V0ZmlsZSwgYnVja2V0X3BhdGgp
CmBgYAo=