Behind the scenes • CohortConstructor

In previous vignettes we have seen numerous R functions that can help us to add a cohort of interest to a cdm reference. When our cdm reference is to tables in a database, as is often the code, our R code will have been translated to SQL that is run against tables in the databases (for more details on how this is all implemented see https://oxford-pharmacoepi.github.io/Tidy-R-programming-with-OMOP/).

Let’s again work with Eunomia and get the codes needed to create a set of drug cohorts.

library(CDMConnector)
library(CodelistGenerator)
library(PatientProfiles)
library(CohortConstructor)
library(dplyr)

con <- DBI::dbConnect(duckdb::duckdb(), 
                      dbdir = eunomiaDir())
cdm <- cdmFromCon(con, cdmSchema = "main", writeSchema = "main", 
                  writePrefix = "my_study_")
drug_codes <- getDrugIngredientCodes(cdm, 
                                     name = c("acetaminophen",
                                              "amoxicillin", 
                                              "diclofenac", 
                                              "simvastatin",
                                              "warfarin"))

To capture all the SQL executed as we use CohortConstructor functions we can set a global option. For this, we just need to point to a directory in which we’ll save each SQL statement run behind the scenes. Note that in this example we’re using duckdb so the SQL is for this database management system. If you were running on a different type of database the SQL would be adapted accordingly.

dir_sql <- file.path(tempdir(), "sql_folder")
dir.create(dir_sql)
options("omopgenerics.log_sql_path" = dir_sql)

cdm$drugs <- conceptCohort(cdm, 
                           conceptSet = drug_codes,
                           exit = "event_end_date",
                           name = "drugs")

# print sql in order they were saved
files <- file.info(list.files(dir_sql, full.names = TRUE))
sorted_files <- rownames(files[order(files$ctime),])
for(i in seq_along(sorted_files)) {
  cat(paste0("### ", sorted_files[i], "\n\n"))
  sql_with_quotes <- paste0('"', paste(readLines(sorted_files[i]), collapse = '\n'), '"')
  cat(sql_with_quotes, "\n```\n\n")
}
#> ### /tmp/RtmpE7oKl6/sql_folder/CohortConstructor_uploadCohortCodelistlogged_query_ran_on_2025_04_01_at_13_10_27.sql
#> 
#> "<SQL>
#> SELECT
#>   cohort_definition_id,
#>   CAST(concept_id AS INTEGER) AS concept_id,
#>   LOWER(domain_id) AS domain_id
#> FROM (
#>   SELECT my_study_tmp_001_og_004_1743513027.*, domain_id
#>   FROM my_study_tmp_001_og_004_1743513027
#>   LEFT JOIN concept
#>     ON (my_study_tmp_001_og_004_1743513027.concept_id = concept.concept_id)
#> ) q01" 
#> ```
#> 
#> ### /tmp/RtmpE7oKl6/sql_folder/CohortConstructor_tempCohort_logged_query_ran_on_2025_04_01_at_13_10_27.sql
#> 
#> "<SQL>
#> SELECT
#>   person_id AS subject_id,
#>   drug_concept_id AS concept_id,
#>   drug_exposure_start_date AS cohort_start_date,
#>   drug_exposure_end_date AS cohort_end_date,
#>   cohort_definition_id
#> FROM drug_exposure
#> INNER JOIN (
#>   SELECT concept_id, cohort_definition_id
#>   FROM my_study_tmp_001_og_004_1743513027
#>   WHERE (domain_id IN ('drug'))
#> ) RHS
#>   ON (drug_exposure.drug_concept_id = RHS.concept_id)" 
#> ```
#> 
#> ### /tmp/RtmpE7oKl6/sql_folder/CohortConstructor_conceptCohort_reduce_logged_query_ran_on_2025_04_01_at_13_10_27.sql
#> 
#> "<SQL>
#> SELECT
#>   cohort_definition_id,
#>   subject_id,
#>   cohort_start_date,
#>   COALESCE(cohort_end_date, cohort_start_date) AS cohort_end_date
#> FROM my_study_og_005_1743513027_1" 
#> ```
#> 
#> ### /tmp/RtmpE7oKl6/sql_folder/CohortConstructor_fulfillCohortReqs_filterStartEnd_logged_query_ran_on_2025_04_01_at_13_10_28.sql
#> 
#> "<SQL>
#> SELECT my_study_drugs.*
#> FROM my_study_drugs
#> WHERE
#>   (NOT((cohort_start_date IS NULL))) AND
#>   (cohort_start_date <= cohort_end_date)" 
#> ```
#> 
#> ### /tmp/RtmpE7oKl6/sql_folder/CohortConstructor_fulfillCohortReqs_observationJoin_logged_query_ran_on_2025_04_01_at_13_10_29.sql
#> 
#> "<SQL>
#> SELECT
#>   my_study_drugs.*,
#>   observation_period_id,
#>   observation_period_start_date,
#>   observation_period_end_date
#> FROM my_study_drugs
#> LEFT JOIN observation_period
#>   ON (my_study_drugs.subject_id = observation_period.person_id)" 
#> ```
#> 
#> ### /tmp/RtmpE7oKl6/sql_folder/CohortConstructor_fulfillCohortReqs_inObservation_logged_query_ran_on_2025_04_01_at_13_10_29.sql
#> 
#> "<SQL>
#> SELECT
#>   cohort_definition_id,
#>   subject_id,
#>   cohort_start_date,
#>   CASE WHEN (observation_period_end_date >= cohort_end_date) THEN cohort_end_date WHEN NOT (observation_period_end_date >= cohort_end_date) THEN observation_period_end_date END AS cohort_end_date
#> FROM my_study_drugs
#> WHERE
#>   (cohort_start_date >= observation_period_start_date) AND
#>   (cohort_start_date <= observation_period_end_date)" 
#> ```
#> 
#> ### /tmp/RtmpE7oKl6/sql_folder/CohortConstructor_joinOverlap_workingTbl_logged_query_ran_on_2025_04_01_at_13_10_30.sql
#> 
#> "<SQL>
#> SELECT q01.*, -1.0 AS date_id
#> FROM (
#>   SELECT cohort_definition_id, subject_id, cohort_start_date AS date
#>   FROM my_study_drugs
#> ) q01
#> 
#> UNION ALL
#> 
#> SELECT q01.*, 1.0 AS date_id
#> FROM (
#>   SELECT cohort_definition_id, subject_id, cohort_end_date AS date
#>   FROM my_study_drugs
#> ) q01" 
#> ```
#> 
#> ### /tmp/RtmpE7oKl6/sql_folder/CohortConstructor_joinOverlap_ids_logged_query_ran_on_2025_04_01_at_13_10_30.sql
#> 
#> "<SQL>
#> SELECT
#>   cohort_definition_id,
#>   subject_id,
#>   SUM(CAST(era_id AS NUMERIC)) OVER (PARTITION BY cohort_definition_id, subject_id ORDER BY date, date_id ROWS UNBOUNDED PRECEDING) AS era_id,
#>   "name",
#>   date
#> FROM (
#>   SELECT
#>     my_study_og_006_1743513030.*,
#>     SUM(date_id) OVER (PARTITION BY cohort_definition_id, subject_id ORDER BY date, date_id ROWS UNBOUNDED PRECEDING) AS cum_id,
#>     CASE WHEN (date_id = -1.0) THEN 'cohort_start_date' WHEN NOT (date_id = -1.0) THEN 'cohort_end_date' END AS "name",
#>     CASE WHEN (date_id = -1.0) THEN 1.0 WHEN NOT (date_id = -1.0) THEN 0.0 END AS era_id
#>   FROM my_study_og_006_1743513030
#> ) q01
#> WHERE (cum_id = 0.0 OR (cum_id = -1.0 AND date_id = -1.0))" 
#> ```
#> 
#> ### /tmp/RtmpE7oKl6/sql_folder/CohortConstructor_joinOverlap_pivot_wider_logged_query_ran_on_2025_04_01_at_13_10_30.sql
#> 
#> "<SQL>
#> SELECT
#>   cohort_definition_id,
#>   subject_id,
#>   MAX(CASE WHEN ("name" = 'cohort_end_date') THEN date END) AS cohort_end_date,
#>   MAX(CASE WHEN ("name" = 'cohort_start_date') THEN date END) AS cohort_start_date
#> FROM my_study_og_006_1743513030
#> GROUP BY cohort_definition_id, subject_id, era_id" 
#> ```
#> 
#> ### /tmp/RtmpE7oKl6/sql_folder/CohortConstructor_joinOverlap_relocate_logged_query_ran_on_2025_04_01_at_13_10_30.sql
#> 
#> "<SQL>
#> SELECT DISTINCT
#>   cohort_definition_id,
#>   subject_id,
#>   cohort_start_date,
#>   cohort_end_date
#> FROM my_study_og_006_1743513030" 
#> ```

If we want even more detail, we also have the option to see the execution plan along with the SQL.

dir_explain <- file.path(tempdir(), "explain_folder")
dir.create(dir_explain)
options("omopgenerics.log_sql_explain_path" = dir_explain)

cdm$drugs <- cdm$drugs |> 
  requireIsFirstEntry()

files <- list.files(dir_explain, full.names = TRUE)
file_names <- list.files(dir_explain, full.names = FALSE)

for(i in seq_along(files)) {
  cat(paste0("### ", file_names[i], "\n\n"))
  sql_with_quotes <- paste0('"', paste(readLines(files[i]), collapse = '\n'), '"')
  cat(sql_with_quotes, "\n```\n\n")
}
#> ### CohortConstructor_requireIsFirstEntry_min_logged_query_ran_on_2025_04_01_at_13_10_32.sql
#> 
#> "<SQL>
#> SELECT cohort_definition_id, subject_id, cohort_start_date, cohort_end_date
#> FROM (
#>   SELECT
#>     my_study_drugs.*,
#>     MIN(cohort_start_date) OVER (PARTITION BY subject_id, cohort_definition_id) AS col01
#>   FROM my_study_drugs
#> ) q01
#> WHERE (cohort_start_date = col01)
#> 
#> <PLAN>
#> physical_plan
#> ┌───────────────────────────┐
#> │         PROJECTION        │
#> │    ────────────────────   │
#> │             #0            │
#> │             #1            │
#> │             #2            │
#> │             #3            │
#> │                           │
#> │         ~3872 Rows        │
#> └─────────────┬─────────────┘
#> ┌─────────────┴─────────────┐
#> │           FILTER          │
#> │    ────────────────────   │
#> │(cohort_start_date = col01)│
#> │                           │
#> │         ~3872 Rows        │
#> └─────────────┬─────────────┘
#> ┌─────────────┴─────────────┐
#> │         PROJECTION        │
#> │    ────────────────────   │
#> │             #0            │
#> │             #1            │
#> │             #2            │
#> │             #3            │
#> │             #4            │
#> │                           │
#> │        ~19364 Rows        │
#> └─────────────┬─────────────┘
#> ┌─────────────┴─────────────┐
#> │           WINDOW          │
#> │    ────────────────────   │
#> │        Projections:       │
#> │   min(cohort_start_date)  │
#> │     OVER (PARTITION BY    │
#> │         subject_id,       │
#> │    cohort_definition_id)  │
#> └─────────────┬─────────────┘
#> ┌─────────────┴─────────────┐
#> │         SEQ_SCAN          │
#> │    ────────────────────   │
#> │   Table: my_study_drugs   │
#> │   Type: Sequential Scan   │
#> │                           │
#> │        Projections:       │
#> │    cohort_definition_id   │
#> │         subject_id        │
#> │     cohort_start_date     │
#> │      cohort_end_date      │
#> │                           │
#> │        ~19364 Rows        │
#> └───────────────────────────┘" 
#> ```