This notebook demonstrates how to prepare a phenotype file for use with the primed_genesis_gwas workflow.

Load R libraries:

library(AnVIL)
library(dplyr)
library(readr)
library(tidyr)
library(stringr)

Phenotype file paths are stored in the “phenotype_harmonized” data table in a PRIMED workspace. We use the avtable function to read this table, and find the file paths for the phenotype files we need for this analysis. In this example, we are using a simulated LDL phenotype found in the “pilot” phenotype file.

We also need the population_descriptor phenotype file, as we will use “population” as a covariate in the analysis. For each table, we copy the file from the Google Cloud Storage bucket to our compute instance and read it into R.

# Workspace data table holding the paths of the harmonized phenotype files.
pheno_tables <- avtable("phenotype_harmonized")

#' Download and read one harmonized phenotype file.
#'
#' Looks up `table_name` in the `domain` column of the global `pheno_tables`,
#' copies the matching file to the current directory with `gsutil_cp()`, and
#' reads the local copy with `read_tsv()`.
#'
#' @param table_name Value of the `domain` column to look up (e.g. "pilot").
#' @return The data frame produced by `read_tsv()`.
read_phen_table <- function(table_name) {
  phen_file <- pheno_tables %>%
    filter(domain == table_name) %>%
    pull(file_path)
  # Fail early with a readable message rather than handing gsutil an empty path.
  if (length(phen_file) == 0) {
    stop(
      "No row in phenotype_harmonized with domain '", table_name, "'",
      call. = FALSE
    )
  }
  gsutil_cp(phen_file, ".")
  read_tsv(basename(phen_file))
}
# Subject table from the workspace, plus the two phenotype files used below:
# "pilot" holds the simulated LDL phenotype and "population_descriptor"
# supplies the population covariate.
subj <- avtable("subject")
phen <- read_phen_table("pilot")
pop <- read_phen_table("population_descriptor")

The population_descriptor data model allows multiple descriptors (and their labels) for each subject, separated by a “|” delimiter. Below is a function that extracts a single descriptor and its corresponding labels from the table.

#' Extract one population descriptor from a population_descriptor table.
#'
#' Splits every column whose name starts with "population" on "|" into
#' separate rows, trims surrounding whitespace, spreads the descriptors into
#' one column each (filled with the matching labels), and keeps only
#' `subject_id` plus the requested descriptor column(s).
#'
#' @param pop_desc_table Data frame with columns `subject_id`,
#'   `population_descriptor` and `population_label`.
#' @param descriptor Character vector naming the descriptor column(s) to keep;
#'   `all_of()` raises an error if a name is not present after pivoting.
#' @return A data frame with `subject_id` and the selected descriptor column(s).
select_pop_desc <- function(pop_desc_table, descriptor) {
  # Return the pipeline directly so the result is visible when called
  # interactively (an assignment as the last expression returns invisibly).
  pop_desc_table %>%
    separate_longer_delim(starts_with("population"), delim = "|") %>%
    mutate(across(starts_with("population"), str_trim)) %>%
    pivot_wider(
      names_from = population_descriptor,
      values_from = population_label
    ) %>%
    select(subject_id, all_of(descriptor))
}

We select the outcome and covariates we plan to use in the GWAS. The GENESIS workflow expects a column called ‘sex’ with values ‘M’ and ‘F’, so these values must be mapped from the PRIMED data model.

# Build the GWAS phenotype table: one row per subject with sex, age, LDL and
# the "population" descriptor.
# - reported_sex is recoded to "M"/"F"; any other value becomes NA.
#   as.character() guards against factor input (which would index by integer
#   code) and unname() keeps the lookup names off the new column.
# - Join keys are spelled out so the joins cannot silently pick up other
#   shared columns and do not emit "Joining with `by = ...`" messages.
phen_gwas <- subj %>%
  mutate(
    sex = unname(c("Male" = "M", "Female" = "F")[as.character(reported_sex)])
  ) %>%
  inner_join(phen, by = "subject_id") %>%
  select(subject_id, sex, age_at_observation, ldl) %>%
  inner_join(select_pop_desc(pop, "population"), by = "subject_id")

In the PRIMED data model, the sample table links subject identifiers (subject_id) to the sample identifiers in the genotype files (sample_id). The GENESIS workflow requires sample_id to match phenotypes to genotypes.

# Workspace sample table (maps sample_id to subject_id).
# NOTE: this variable shares its name with base::sample(); calls such as
# sample(x) still resolve to the function.
sample <- avtable("sample")

# Attach sample_id to every phenotype row. Only sample_id and subject_id are
# kept from the sample table, and the join key is explicit.
phen_gwas_samples <- sample %>%
  select(sample_id, subject_id) %>%
  inner_join(phen_gwas, by = "subject_id")

The GENESIS workflow expects a phenotype file in CSV format. We save the phenotype data table as CSV and copy it to the workspace bucket.

# Name of the CSV file, used both locally and in the bucket.
outfile <- "phenotypes_ldl_gwas.csv"

# Write the table to the local working directory first.
phen_gwas_samples %>%
  write_csv(outfile)

# Then copy it into the "GWAS" folder of the workspace bucket.
bucket_path <- file.path(avbucket(), "GWAS", outfile)
gsutil_cp(outfile, bucket_path)
LS0tCnRpdGxlOiAiUHJlcGFyZSBwaGVub3R5cGUgZmlsZSBmb3IgR1dBUyIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKVGhpcyBub3RlYm9vayBkZW1vbnN0cmF0ZXMgaG93IHRvIHByZXBhcmUgYSBwaGVub3R5cGUgZmlsZSBmb3IgdXNlIHdpdGggdGhlCltwcmltZWRfZ2VuZXNpc19nd2FzXShodHRwczovL2dpdGh1Yi5jb20vVVctR0FDL3ByaW1lZC1nZW5lc2lzLWd3YXMpIHdvcmtmbG93LiAKCkxvYWQgUiBsaWJyYXJpZXM6CgpgYGB7cn0KbGlicmFyeShBblZJTCkKbGlicmFyeShkcGx5cikKbGlicmFyeShyZWFkcikKbGlicmFyeSh0aWR5cikKbGlicmFyeShzdHJpbmdyKQpgYGAKClBoZW5vdHlwZSBmaWxlIHBhdGhzIGFyZSBzdG9yZWQgaW4gdGhlICJwaGVub3R5cGVfaGFybW9uaXplZCIgZGF0YSB0YWJsZSBpbiBhIApQUklNRUQgd29ya3NwYWNlLiBXZSB1c2UgdGhlIGBhdnRhYmxlYCBmdW5jdGlvbiB0byByZWFkIHRoaXMgdGFibGUsIGFuZCBmaW5kIAp0aGUgZmlsZSBwYXRocyBmb3IgdGhlIHBoZW5vdHlwZSBmaWxlcyB3ZSBuZWVkIGZvciB0aGlzIGFuYWx5c2lzLiBJbiB0aGlzIGV4YW1wbGUsCndlIGFyZSB1c2luZyBhIHNpbXVsYXRlZCBMREwgcGhlbm90eXBlIGZvdW5kIGluIHRoZSAicGlsb3QiIHBoZW5vdHlwZSBmaWxlLgoKV2UgYWxzbyBuZWVkIHRoZSBwb3B1bGF0aW9uX2Rlc2NyaXB0b3IgcGhlbm90eXBlIGZpbGUsIGFzIHdlIHdpbGwgdXNlICJwb3B1bGF0aW9uIgphcyBhIGNvdmFyaWF0ZSBpbiB0aGUgYW5hbHlzaXMuIEZvciBlYWNoIHRhYmxlLCB3ZSBjb3B5IHRoZSBmaWxlIGZyb20gdGhlIGdvb2dsZQpidWNrZXQgdG8gb3VyIGNvbXB1dGUgaW5zdGFuY2UgYW5kIHJlYWQgaXQgaW50byBSLgoKYGBge3J9CnBoZW5vX3RhYmxlcyA8LSBhdnRhYmxlKCJwaGVub3R5cGVfaGFybW9uaXplZCIpCnJlYWRfcGhlbl90YWJsZSA8LSBmdW5jdGlvbih0YWJsZV9uYW1lKSB7CiAgcGhlbl9maWxlIDwtIHBoZW5vX3RhYmxlcyAlPiUKICAgIGZpbHRlcihkb21haW4gPT0gdGFibGVfbmFtZSkgJT4lCiAgICBzZWxlY3QoZmlsZV9wYXRoKSAlPiUKICAgIHVubGlzdCgpCiAgZ3N1dGlsX2NwKHBoZW5fZmlsZSwgIi4iKQogIHJlYWRfdHN2KGJhc2VuYW1lKHBoZW5fZmlsZSkpCn0KYGBgCgpgYGB7cn0Kc3ViaiA8LSBhdnRhYmxlKCJzdWJqZWN0IikKcGhlbiA8LSByZWFkX3BoZW5fdGFibGUoInBpbG90IikKcG9wIDwtIHJlYWRfcGhlbl90YWJsZSgicG9wdWxhdGlvbl9kZXNjcmlwdG9yIikKYGBgCgpUaGUgcG9wdWxhdGlvbl9kZXNjcmlwdG9yIFtkYXRhIG1vZGVsXShodHRwczovL2dpdGh1Yi5jb20vVVctR0FDL3ByaW1lZF9kYXRhX21vZGVscy9ibG9iL21haW4vUFJJTUVEX3BoZW5vdHlwZV9kYXRhX21vZGVsLmpzb24jTDU4KQphbGxvd3MgbXVsdGlwbGUgZGVzY3JpcHRvcnMgZm9yIGVhY2ggc3ViamVjdCwgd2l0aCBhIGRlbGltaXRlci4gQmVsb3cgaXMgYSBmdW5jdGlvbgp0aGF0IGV4dHJhY3RzIGEgc2luZ2xl
IGRlc2NyaXB0b3IgYW5kIGNvcnJlc3BvbmRpbmcgbGFiZWxzIGZyb20gdGhlIHRhYmxlLgoKYGBge3J9CnNlbGVjdF9wb3BfZGVzYyA8LSBmdW5jdGlvbihwb3BfZGVzY190YWJsZSwgZGVzY3JpcHRvcikgewogIGRhdCA8LSBwb3BfZGVzY190YWJsZSAlPiUKICAgIHNlcGFyYXRlX2xvbmdlcl9kZWxpbShzdGFydHNfd2l0aCgicG9wdWxhdGlvbiIpLCBkZWxpbT0ifCIpICU+JQogICAgbXV0YXRlKGFjcm9zcyhzdGFydHNfd2l0aCgicG9wdWxhdGlvbiIpLCBzdHJfdHJpbSkpICU+JQogICAgcGl2b3Rfd2lkZXIobmFtZXNfZnJvbT1wb3B1bGF0aW9uX2Rlc2NyaXB0b3IsIHZhbHVlc19mcm9tPXBvcHVsYXRpb25fbGFiZWwpICU+JQogICAgc2VsZWN0KHN1YmplY3RfaWQsIGFsbF9vZihkZXNjcmlwdG9yKSkKfQpgYGAKCldlIHNlbGVjdCB0aGUgb3V0Y29tZSBhbmQgY292YXJpYXRlcyB3ZSBwbGFuIHRvIHVzZSBpbiB0aGUgR1dBUy4gVGhlIEdFTkVTSVMgd29ya2Zsb3cKZXhwZWN0cyBhIGNvbHVtbiBjYWxsZWQgJ3NleCcgd2l0aCB2YWx1ZXMgJ00nIGFuZCAnRicsIHNvIHRoZXNlIHZhbHVlcyBtdXN0IGJlIAptYXBwZWQgZnJvbSB0aGUgUFJJTUVEIGRhdGEgbW9kZWwuCgpgYGB7cn0KcGhlbl9nd2FzIDwtIHN1YmogJT4lCiAgbXV0YXRlKHNleD1jKCJNYWxlIj0iTSIsICJGZW1hbGUiPSJGIilbcmVwb3J0ZWRfc2V4XSkgJT4lCiAgaW5uZXJfam9pbihwaGVuKSAlPiUKICBzZWxlY3Qoc3ViamVjdF9pZCwgc2V4LCBhZ2VfYXRfb2JzZXJ2YXRpb24sIGxkbCkgJT4lCiAgaW5uZXJfam9pbihzZWxlY3RfcG9wX2Rlc2MocG9wLCAicG9wdWxhdGlvbiIpKQpgYGAKCkluIHRoZSBQUklNRUQgZGF0YSBtb2RlbCwgdGhlIHNhbXBsZSB0YWJsZSBsaW5rcyBzdWJqZWN0IGlkZW50aWZpZXJzIChzdWJqZWN0X2lkKQp0byB0aGUgc2FtcGxlIGlkZW50aWZpZXJzIGluIHRoZSBnZW5vdHlwZSBmaWxlcyAoc2FtcGxlX2lkKS4gVGhlIEdFTkVTSVMgd29ya2Zsb3cKcmVxdWlyZXMgc2FtcGxlX2lkIHRvIG1hdGNoIHBoZW5vdHlwZXMgdG8gZ2Vub3R5cGVzLgoKYGBge3J9CnNhbXBsZSA8LSBhdnRhYmxlKCJzYW1wbGUiKQpgYGAKCmBgYHtyfQpwaGVuX2d3YXNfc2FtcGxlcyA8LSBzYW1wbGUgJT4lCiAgc2VsZWN0KHNhbXBsZV9pZCwgc3ViamVjdF9pZCkgJT4lCiAgaW5uZXJfam9pbihwaGVuX2d3YXMpCmBgYAoKVGhlIEdFTkVTSVMgd29ya2Zsb3cgZXhwZWN0cyBhIHBoZW5vdHlwZSBmaWxlIGluIENTViBmb3JtYXQuIFdlIHNhdmUgdGhlCnBoZW5vdHlwZSBkYXRhIHRhYmxlIGFzIENTViBhbmQgY29weSBpdCB0byB0aGUgd29ya3NwYWNlIGJ1Y2tldC4KCmBgYHtyfQpvdXRmaWxlIDwtICJwaGVub3R5cGVzX2xkbF9nd2FzLmNzdiIKYnVja2V0X3BhdGggPC0gZmlsZS5wYXRoKGF2YnVja2V0KCksICJHV0FTIiwgb3V0ZmlsZSkKd3JpdGVfY3N2KHBoZW5fZ3dhc19zYW1wbGVzLCBvdXRmaWxlKQpnc3V0aWxfY3Aob3V0ZmlsZSwgYnVja2V0X3BhdGgp
CmBgYAo=