tidyOhdsiSolutions

Lifecycle: experimental

tidyOhdsiSolutions is a lightweight R package of utilities for working with OMOP CDM data in the OHDSI ecosystem. It is intentionally dependency-light: the only hard runtime dependency beyond base R is jsonlite.

The package provides four main capabilities:

| Area | What it does |
|---|---|
| Functional helpers | Base-R reimplementations of purrr functions (map, walk, imap, pluck, …) — no purrr dependency |
| Concept set builders | Convert plain data.frames into CIRCE concept set expression lists |
| SQL generators | Build SQL to resolve concept sets against an OMOP vocabulary schema — no Java / CirceR required |
| Cohort builders | Create CirceR-compatible cohort definition objects programmatically |

Installation

# install.packages("remotes")
remotes::install_github("<owner>/tidyOhdsiSolutions")

Usage

library(tidyOhdsiSolutions)

1 — Convert a data.frame to a concept set expression

concepts <- data.frame(
  concept_id       = c(201826L, 442793L),
  concept_name     = c("Type 2 diabetes mellitus", "Type 1 diabetes mellitus"),
  domain_id        = "Condition",
  vocabulary_id    = "SNOMED",
  concept_class_id = "Clinical Finding",
  standard_concept = "S",
  concept_code     = c("44054006", "46635009"),
  invalid_reason   = "V",
  excluded         = FALSE,
  descendants      = TRUE,
  mapped           = FALSE
)

cs_expr <- toConceptSet(concepts, name = "Diabetes")
str(cs_expr, max.level = 2)
#> List of 1
#>  $ items:List of 2
#>   ..$ :List of 4
#>   ..$ :List of 4

Multiple concept sets at once:

cs_list <- toConceptSets(
  list(
    diabetes     = concepts,
    hypertension = data.frame(concept_id = 316866L)
  )
)
names(cs_list)
#> [1] "diabetes"     "hypertension"

2 — Generate concept-set SQL

sql <- buildConceptSetQuery(cs_expr, vocabularyDatabaseSchema = "cdm")
cat(sql)
#> select distinct I.concept_id FROM
#> ( 
#>   select concept_id from cdm.CONCEPT where (concept_id in (201826,442793))
#> UNION
#>   select c.concept_id
#>   from cdm.CONCEPT c
#>   join cdm.CONCEPT_ANCESTOR ca on c.concept_id = ca.descendant_concept_id
#>   WHERE c.invalid_reason is null
#>   and (ca.ancestor_concept_id in (201826,442793))
#> ) I

Resolve multiple concept sets at once:

sql_list <- buildConceptSetQueries(cs_list, vocabularyDatabaseSchema = "cdm")

3 — Build a cohort definition (no Java / CirceR needed)

Single concept set

cohort <- createConceptSetCohort(
  conceptSetExpression = cs_expr,
  name                 = "Diabetes Cohort",
  limit                = "first",
  requiredObservation  = c(365L, 0L),
  end                  = "observation_period_end_date"
)

# Serialise to CirceR-compatible JSON
json <- cohortToJson(cohort)
cat(substr(json, 1, 300))
#> {
#>   "ConceptSets": [
#>     {
#>       "id": 0,
#>       "name": "Diabetes Cohort",
#>       "expression": {
#>         "items": [
#>           {
#>             "concept": {
#>               "CONCEPT_ID": 201826,
#>               "CONCEPT_NAME": "Type 2 diabetes mellitus",
#>               "STANDARD_CONCEPT": "S",
#> 

Multiple concept sets

cohortFromConceptSet() accepts a named list of concept set expressions and builds a single cohort with all of them:

drug_df <- data.frame(
  concept_id   = 1503297L,
  concept_name = "Metformin",
  domain_id    = "Drug",
  vocabulary_id = "RxNorm",
  standard_concept = "S",
  descendants  = TRUE
)

multi_cs <- toConceptSets(list(
  diabetes  = concepts,
  metformin = drug_df
))

multi_cohort <- cohortFromConceptSet(
  conceptSetList      = multi_cs,
  limit               = "earliest",
  requiredObservation = c(365L, 0L),
  end                 = "observation_period_end_date"
)

# Each concept set gets its own id and keeps its list name; extract the names:
vapply(multi_cohort$ConceptSets, `[[`, character(1), "name")
#> [1] "diabetes"  "metformin"

End-strategy variants

# Continuous drug era
cohort_drug <- createConceptSetCohort(
  cs_expr,
  end     = "drug_exit",
  endArgs = list(persistenceWindow = 30, surveillanceWindow = 0)
)

# Fixed offset from index
cohort_fixed <- createConceptSetCohort(
  cs_expr,
  end     = "fixed_exit",
  endArgs = list(index = "startDate", offsetDays = 365)
)

4 — Extract concept sets from an existing cohort definition

# cohort_def is a list produced by e.g. CirceR::cohortExpressionFromJson()
concept_sets <- collectCsFromCohort(cohort_def)
# Returns a named list keyed by lowerCamelCase concept set names

5 — Functional helpers (purrr-compatible, no purrr)

# map / map_chr / map_dbl / map_int / map_lgl
tidyOhdsiSolutions:::map(1:4, ~ .x^2)
#> [[1]]
#> [1] 1
#> 
#> [[2]]
#> [1] 4
#> 
#> [[3]]
#> [1] 9
#> 
#> [[4]]
#> [1] 16

# map2_chr — iterate over two inputs in parallel, returning a character vector
tidyOhdsiSolutions:::map2_chr(c("hello", "foo"), c("world", "bar"), paste)
#>         hello           foo 
#> "hello world"     "foo bar"

# pluck — safely extract from nested structures
nested <- list(a = list(b = list(c = 42)))
tidyOhdsiSolutions:::pluck(nested, "a", "b", "c")
#> [1] 42
tidyOhdsiSolutions:::pluck(nested, "a", "missing", .default = 0)
#> [1] 0

# walk — side-effects only, returns .x invisibly
tidyOhdsiSolutions:::walk(1:3, ~ message("item ", .x))
#> item 1
#> item 2
#> item 3

# imap — index-aware map
tidyOhdsiSolutions:::imap(c(a = 10, b = 20), ~ paste(.y, "=", .x))
#> $a
#> [1] "a = 10"
#> 
#> $b
#> [1] "b = 20"

Supported OMOP domains

createConceptSetCohort(), cohortFromConceptSet(), and buildConceptSetQuery() support the following domains:

Condition, Drug, Procedure, Observation, Measurement, Visit, Device

Key design decisions