From 62d16f428e2084817fd0b26df1efa652849c896e Mon Sep 17 00:00:00 2001 From: Michael Terry Date: Tue, 27 Feb 2024 11:54:28 -0300 Subject: [PATCH] docs: some wordsmithing and clarification --- cumulus_library/cli_parser.py | 2 +- cumulus_library/study_parser.py | 5 +- docs/aws-setup.md | 8 +- docs/core-study-details.md | 86 ++++++++++--------- docs/creating-sql-with-python.md | 67 +++++++-------- docs/creating-studies.md | 109 +++++++++++++------------ docs/first-time-setup.md | 26 +++--- docs/sharing-data.md | 15 ++-- docs/study-list.md | 3 +- docs/third-party-software-citations.md | 2 +- 10 files changed, 171 insertions(+), 152 deletions(-) diff --git a/cumulus_library/cli_parser.py b/cumulus_library/cli_parser.py index 7ecdddef..5e8fe82d 100644 --- a/cumulus_library/cli_parser.py +++ b/cumulus_library/cli_parser.py @@ -22,7 +22,7 @@ def add_table_builder_argument(parser: argparse.ArgumentParser) -> None: def add_study_dir_argument(parser: argparse.ArgumentParser) -> None: - """Adds --study_dir arg to a subparser""" + """Adds --study-dir arg to a subparser""" parser.add_argument( "-s", "--study-dir", diff --git a/cumulus_library/study_parser.py b/cumulus_library/study_parser.py index 7b1a4aa7..bf719e3d 100644 --- a/cumulus_library/study_parser.py +++ b/cumulus_library/study_parser.py @@ -248,10 +248,7 @@ def clean_study( # study builder, and remove them from the list. for view_table in view_table_list.copy(): if any( - ( - (f"_{word.value}_") in view_table[0] - or view_table[0].endswith(word.value) - ) + f"__{word.value}_" in view_table[0] for word in enums.ProtectedTableKeywords ): view_table_list.remove(view_table) diff --git a/docs/aws-setup.md b/docs/aws-setup.md index 009029c8..b1ede922 100644 --- a/docs/aws-setup.md +++ b/docs/aws-setup.md @@ -14,7 +14,7 @@ Cumulus library executes queries against an for creating such a datastore is available for testing purposes if you don't already have one. -The cloudformation template in the sample database's Cloudformation template should +The sample database's CloudFormation template should have the appropriate permissions set for all the services. If you need to configure an IAM policy manually, you will need to ensure the AWS profile you are using has the following permissions: @@ -52,4 +52,8 @@ to specify where your database information lives: - `CUMULUS_LIBRARY_DATABASE` : The name of the database Athena will use (`cumulus_library_sample_db` if using the sample DB) - `CUMULUS_LIBRARY_WORKGROUP` : the Athena workgroup to execute queries in (`cumulus_library_sample_db` if using the sample DB) -Configuring environment variables on your system is out of scope of this document, but several guides are available elsewhere. [This guide](https://www.twilio.com/blog/2017/01/how-to-set-environment-variables.html), for example, covers Mac, Windows, and Linux. And, as a plus, it has a picture of an adorable puppy at the top of it. \ No newline at end of file +Configuring environment variables on your system is out of scope of this document, +but several guides are available elsewhere. +[This guide](https://www.twilio.com/blog/how-to-set-environment-variables-html), +for example, covers Mac, Windows, and Linux. +And, as a plus, it has a picture of an adorable puppy at the top of it. 
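As a concrete sketch of that setup, on macOS or Linux you might add lines like these
to your shell profile (the values shown are the sample database's defaults mentioned
above — substitute your own deployment's names):
```bash
# Illustrative only - use the values for your own Athena deployment
export CUMULUS_LIBRARY_DATABASE="cumulus_library_sample_db"
export CUMULUS_LIBRARY_WORKGROUP="cumulus_library_sample_db"
```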
\ No newline at end of file diff --git a/docs/core-study-details.md b/docs/core-study-details.md index e272c53d..b577bc93 100644 --- a/docs/core-study-details.md +++ b/docs/core-study-details.md @@ -42,58 +42,64 @@ Examples: ## Core study exportable counts tables ### count_core_condition_icd10_month -| Variable | Description | -| -------- | -------- | -| cnt | Count | -| cond_month | Month condition recorded | -| cond_code_display | Condition code | -| enc_class_code | Encounter Code (Healthcare Setting) | + +| Variable | Description | +|:------------------|:------------------------------------| +| cnt | Count | +| cond_month | Month condition recorded | +| cond_code_display | Condition code | +| enc_class_code | Encounter Code (Healthcare Setting) | ### count_core_documentreference_month -| Variable | Description | -| -------- | -------- | -| cnt | Count | -| author_month | Month document was authored | -| enc_class_code | Encounter Code (Healthcare Setting) | -| doc_type_display | Type of Document (display) | + +| Variable | Description | +|:-----------------|:------------------------------------| +| cnt | Count | +| author_month | Month document was authored | +| enc_class_code | Encounter Code (Healthcare Setting) | +| doc_type_display | Type of Document (display) | ### count_core_encounter_day -| Variable | Description | -| -------- | -------- | -| cnt | Count | -| enc_class_code | Encounter Code (Healthcare Setting) | -| start_date | Day patient encounter started | + +| Variable | Description | +|:---------------|:------------------------------------| +| cnt | Count | +| enc_class_code | Encounter Code (Healthcare Setting) | +| start_date | Day patient encounter started | ### count_core_encounter_month -| Variable | Description | -| -------- | -------- | -| cnt | Count | -| enc_class_code | Encounter Code (Healthcare Setting) | -| start_month | Month patient encounter started | -| age_at_visit | Patient Age at Encounter | -| gender | Biological sex at birth | -| race_display | Patient reported race | -| postalcode3 | Patient 3 digit zip | + +| Variable | Description | +|:---------------|:------------------------------------| +| cnt | Count | +| enc_class_code | Encounter Code (Healthcare Setting) | +| start_month | Month patient encounter started | +| age_at_visit | Patient Age at Encounter | +| gender | Biological sex at birth | +| race_display | Patient reported race | +| postalcode3 | Patient 3 digit zip | ### count_core_observation_lab_month -| Variable | Description | -| -------- | -------- | -| cnt | Count | -| lab_month | Month of lab result | -| lab_code | Laboratory Code | -| lab_result_display | Laboratory result | -| enc_class_code | Encounter Code (Healthcare Setting) | + +| Variable | Description | +|:-------------------|:------------------------------------| +| cnt | Count | +| lab_month | Month of lab result | +| lab_code | Laboratory Code | +| lab_result_display | Laboratory result | +| enc_class_code | Encounter Code (Healthcare Setting) | ### count_core_patient -| Variable | Description | -| -------- | -------- | -| cnt | Count | -| gender | Biological sex at birth | -| age | Age in years calculated since DOB | -| race_display | Patient reported race | -| postalcode3 | Patient 3 digit zip | + +| Variable | Description | +|:-------------|:----------------------------------| +| cnt | Count | +| gender | Biological sex at birth | +| age | Age in years calculated since DOB | +| race_display | Patient reported race | +| postalcode3 | Patient 3 digit zip | diff --git 
a/docs/creating-sql-with-python.md b/docs/creating-sql-with-python.md index 1e98a06a..918549dd 100644 --- a/docs/creating-sql-with-python.md +++ b/docs/creating-sql-with-python.md @@ -19,28 +19,30 @@ sections. ## Why would I even need to think about this? -There are three main reasons why you would need to use python to generate sql: +There are three main reasons why you would need to use Python to generate SQL: - You would like to make use of the [helper class we've built](#generating-counts-tables) -for ease of creating count tables in a structured manual. +for ease of creating count tables in a structured manner. - You have a dataset you'd like to [load into a table from a static file](#adding-a-static-dataset), separate from the ETL tables. - The gnarly one: you are working against the raw FHIR resource tables, and are trying to access [nested data](#querying-nested-data) in Athena. - - We infer datatypes in the ETL based on the presence of data once we get past - the top level elements, and so the structure may vary depending on the - implementation, either at the EHR level or at the FHIR interface level. + - This is gnarly because while the ETL provides a full SQL schema for your own data, + it does not guarantee a schema for data that you don't have at your site. + And if you want your study to run at multiple sites with different EHRs, + you need to be careful when accessing deep FHIR fields. + For example, your EHR might populate `Condition.evidence.code` and you can safely + write SQL that uses it. But a different site's EHR may not provide that field at all, + and thus that column may not be defined in the SQL table schema at that other site. - -We've got examples of all three of these cases in this repo, and we'll reference -those as examples as we go. +You'll see examples of all three cases in this guide. ## Utilities There are two main bits of infrastructure we use for programmatic tables: -The TableBuilder class, and the collection of template SQL. +The `TableBuilder` class, and the collection of template SQL. ### Working with TableBuilders @@ -63,7 +65,7 @@ queries are being executed. You can either extend this class directly (like `builder_*.py` files in `cumulus_library/studies/core`) or create a specific class to add reusable functions -for a repeated use case (like in `cumulus_library/schema/counts.py`). +for a repeated use case (like in `cumulus_library/statistics/counts.py`). TableBuilder SQL generally should go through a template SQL generator, so that your SQL has been validated. If you're just working on counts, you don't need @@ -77,27 +79,27 @@ we've got enough wrappers that you shouldn't need to worry about this level of detail. For validating SQL, we are using -[Jinja templates](https://jinja.palletsprojects.com/en/3.1.x/) +[Jinja templates](https://jinja.palletsprojects.com/) to create validated SQL in a repeatable manner. We don't expect you to write these templates - instead, using the -[template function library](../cumulus_library/template_sql/templates.py) -you can provide a series of arguments to these templates that will allow you to +[template function library](https://github.com/smart-on-fhir/cumulus-library/blob/main/cumulus_library/template_sql/base_templates.py) +you can provide arguments to these templates that will allow you to generate standard types of SQL tables, as well as using templates targeted for bespoke operations. 
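To make that concrete, here's a rough sketch of a `TableBuilder` that creates one
table via `get_ctas_query`. The import paths, argument names, and signatures here
are assumptions for illustration — check the library source for the real interface:
```python
# A minimal sketch of the TableBuilder pattern, not a drop-in file.
from cumulus_library import base_table_builder
from cumulus_library.template_sql import base_templates


class MyStudyBuilder(base_table_builder.BaseTableBuilder):
    display_text = "Creating my_study tables..."

    def prepare_queries(self, *args, **kwargs):
        # prepare_queries only *collects* SQL; the library executes
        # self.queries later, in order, against your database.
        self.queries.append(
            base_templates.get_ctas_query(
                schema_name="my_athena_db",        # your database/schema name
                table_name="my_study__lookup",     # study prefix + exactly one '__'
                dataset=[["a", "1"], ["b", "2"]],  # rows of string values
                table_cols=["category", "code"],   # column names for the table
            )
        )
```
If the template you need doesn't exist yet, this is the point where you'd fall back
to the bespoke templates mentioned above.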
When you're thinking about a query that you'd need to create, first check the
-template function library to see if something already exists. Basic CRUD
-should be covered, as well as unnestings for some common FHIR objects.
+template function library to see if something already exists. Basic creation and inspection
+queries should be covered, as well as unnestings for some common FHIR objects.

## Use cases

### Generating counts tables

A thing we do over and over as part of studies is generate powerset counts tables
against a filtered resource to get data about a certain kind of clinical population.
-Since this is so common we created a class just for this, and we're using it in all
+Since this is so common, we created a class just for this, and we're using it in all
studies the Cumulus team is directly authoring.

-The [CountsBuilder class](https://github.com/smart-on-fhir/cumulus-library/blob/main/cumulus_library/schema/counts.py)
+The [CountsBuilder class](https://github.com/smart-on-fhir/cumulus-library/blob/main/cumulus_library/statistics/counts.py)
provides a number of convenience methods that are available for use (this covers
mechanics of generation). You can see examples of usage in the
[Core counts builder](https://github.com/smart-on-fhir/cumulus-library/blob/main//cumulus_library/studies/core/count_core.py)
@@ -119,7 +121,7 @@ As a convenience, if you include a `if __name__ == "__main__":` clause like you
see in `count_core.py`, you can generate the builder's output by invoking it with
Python, which is a nice way to get example SQL output for inclusion in GitHub.
This is where the
-[count core sql output](https://github.com/smart-on-fhir/cumulus-library/blob/main//cumulus_library/studies/core/count_core.sql)
+[count core sql output](https://github.com/smart-on-fhir/cumulus-library/blob/main/cumulus_library/studies/core/reference_sql/count_core.sql)
originated from.

Add your count generator file to the `counts_builder_config` section of your
@@ -127,20 +129,16 @@
`manifest.toml`.
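Pulling that together, a tiny counts builder might look like the sketch below.
The helper names and signatures (`count_patient`, `comment_queries`,
`write_queries`) mirror how `count_core.py` uses them, but treat them as
assumptions and verify against your installed cumulus-library version:
```python
# Hypothetical counts builder for a study named my_study - illustrative only.
from cumulus_library.statistics import counts


class MyStudyCountsBuilder(counts.CountsBuilder):
    display_text = "Creating my_study counts tables..."

    def prepare_queries(self, *args, **kwargs):
        self.queries = [
            # Builds a powerset (CUBE) count of patients in the source table,
            # bucketed by the listed demographic columns.
            self.count_patient(
                "my_study__count_patient",   # table to create
                "my_study__patient_cohort",  # filtered source table
                ["gender", "race_display"],  # columns to cube over
            ),
        ]


if __name__ == "__main__":
    # Writes out the generated SQL for review - this mirrors the __main__
    # pattern in count_core.py, and is handy for committing reference SQL.
    builder = MyStudyCountsBuilder()
    builder.comment_queries()
    builder.write_queries()
```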
### Adding a static dataset

-*NOTE* - we have an
-[open issue](https://github.com/smart-on-fhir/cumulus-library/issues/58)
-to develop a faster methodology for adding new datasets.
-
Occasionally you will have a dataset from a third party that is useful for
working with your dataset. In the vocab study (requiring a license to use), we
[add coding system data](https://github.com/smart-on-fhir/cumulus-library/blob/main//cumulus_library/studies/vocab/vocab_icd_builder.py)
-from flat files to athena. If you need to do this, you should extend the base
-TableBuilder class, and your `prepare_queries` function should do the following,
+from flat files to Athena. If you need to do this, you should extend the base
+`TableBuilder` class, and your `prepare_queries` function should do the following,
leveraging the
-[template function library](https://github.com/smart-on-fhir/cumulus-library/blob/main//cumulus_library/template_sql/templates.py):
-- Use the `get_ctas_query` function to get a CREATE TABLE AS statement to
-instantiate your table in athena
-- Since athena SQL queries are limited in size to 262144 bytes, if you have
+[template function library](https://github.com/smart-on-fhir/cumulus-library/blob/main/cumulus_library/template_sql/base_templates.py):
+- Use the `get_ctas_query` function to get a `CREATE TABLE AS` statement to
+instantiate your table in Athena
+- Since Athena SQL queries are limited in size to 262144 bytes, if you have
a large dataset, break it up into smaller chunks
- Use the `get_insert_into` function to add the data from each chunk to the
table you just created.
@@ -149,6 +147,11 @@
Add the dataset uploader to the `table_builder_config` section of your
`manifest.toml` to include it in your build - this will make this data available
for downstream queries

+{: .note }
+We have an
+[open issue](https://github.com/smart-on-fhir/cumulus-library/issues/58)
+to develop an easier methodology for adding new datasets.
+
### Querying nested data

Are you trying to access data from deep within raw FHIR tables? I'm so sorry.
@@ -164,9 +167,9 @@ This means you may have differing schemas in Athena from one site's data to anot
may differ). In order to handle this, you need to create a standard output
representation that accounts for all the different permutations you have, and
conform data to match that. The
-[encounter coding](https://github.com/smart-on-fhir/cumulus-library/blob/main//cumulus_library/studies/core/builder_encounter_coding.py)
+[encounter](https://github.com/smart-on-fhir/cumulus-library/blob/main/cumulus_library/studies/core/builder_encounter.py)
and
-[condition codeableConcept](https://github.com/smart-on-fhir/cumulus-library/blob/main//cumulus_library/studies/core/builder_condition_codeableconcept.py)
+[condition](https://github.com/smart-on-fhir/cumulus-library/blob/main/cumulus_library/studies/core/builder_condition.py)
builders both jump through hoops to try and get this data into flat tables for
downstream use.
@@ -186,8 +189,8 @@ template function to invoke that template
- Create a distinct table that has an ID for joining back to the original
- Perform this join as appropriate to create a table with unnested data

-You may find it useful to use the `--builder [filename]` sub argument of the cli
-build command to run just your builder for iteration. The
+You may find it useful to use the `--builder [filename]` sub argument of the CLI
+`build` command to run just your builder for iteration. The
[Sample bulk FHIR datasets](https://github.com/smart-on-fhir/sample-bulk-fhir-datasets)
can provide an additional testbed database above and beyond whatever you produce
in house.
diff --git a/docs/creating-studies.md b/docs/creating-studies.md
index edc2456c..b02ceb0e 100644
--- a/docs/creating-studies.md
+++ b/docs/creating-studies.md
@@ -19,7 +19,7 @@ you'll be working on study development. `cumulus-library` will look in each
subdirectory of that folder for manifest files, so you can run several studies
at once.
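For example, a pair of build invocations might look like this (the study name and
paths are placeholders):
```bash
# Builds a study kept in the default ~/cumulus-library folder
cumulus-library build --target my_study

# Or point at a study folder somewhere else (see --study-dir, described next)
cumulus-library build --study-dir ./path/to/dir --target my_study
```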
-If you're not doing this, you can always add the `--study_dir path/to/dir` argument +If you're not doing this, you can always add the `--study-dir path/to/dir` argument to any build/export call to tell it where to look for your work. ## Creating a new study @@ -33,16 +33,16 @@ cumulus-library create ./path/to/your/study/dir We'll create that folder if it doesn't already exist. 2. Fork the [ -Cumulus library template repo](https://github.com/smart-on-fhir/cumulus-library-template), +Cumulus Library template repo](https://github.com/smart-on-fhir/cumulus-library-template), renaming your fork, and cloning it directly from github. We recommend you use a name relevant to your study (we'll use `my_study` for this -document). The folder name is the same thing you will use as a target with -`cumulus_library` to run your study's queries. +document). This folder name is what you will pass as a `--target` to +`cumulus-library` when you run your study's queries. -Once you've made a new study, the `manifest.toml` file is the place you let cumulus -library know how you want your study to be run against the remote database. The -template manifest has comments describing all the possible configuration parameters +Once you've made a new study, +the `manifest.toml` file is where you can change your study's configuration. +The initial manifest has comments describing all the possible configuration parameters you can supply, but for most studies you'll have something that looks like this: ``` @@ -66,16 +66,17 @@ Talking about what these three sections do: study creates. We'll autocheck this to make sure in several places - this helps to guarantee another researcher doesn't have a study artifact that collides with yours. - - `sql_config.file_names` is a list of sql files, in order, that your study should - create. We recommend having one sql file per topic. They should all be in the same - location as your manifest file. - - `export_config.export_list` defines a list of tables to write to csv/parquet when - data is exported. Cumulus is designed with the idea of shipping around aggregate + - `sql_config.file_names` is the list of sql files that your study will run (in order). + We recommend having one sql file per topic. They should all be in the same + folder as your manifest file. + - `export_config.export_list` is the list of tables that will be downloaded + when `cumulus-library export` is run. + Cumulus is designed with the idea of shipping around aggregate counts to reduce exposure of limited datasets, and so we recommend only exporting - count tables. + "count" tables. There are other hooks you can use in the manifest for more advanced control over -how you can generate sql. See [Creating SQL with python](creating-sql-with-python.md) +how you can generate SQL. See [Creating SQL with python](creating-sql-with-python.md) for more information. We recommend creating a git repo per study, to help version your study data, which @@ -88,18 +89,25 @@ Most users have a workflow that looks like this: - Write queries in the [AWS Athena console](https://aws.amazon.com/athena/) while you are exploring the data - We recommend trying to keep your studies pointed at the `core` tables. The - base FHIR resource named tables contain a lot of nested data that can be tricky - to write cross-EHR queries against, and so you'll save yourself some headaches - if everything you need is available via those resources. 
If it isn't, make sure
-  you look at the [Creating SQL with python](creating-sql-with-python.md) guide
-  for information about safely extracting datasets from those tables.
+  raw FHIR resource tables contain a lot of nested data that can be tricky
+  to write cross-EHR queries against.
+  For example, an EHR may store Medication information in the `medication` or
+  the `medicationrequest` raw resource tables,
+  but the `core__medication` table hides that complexity and is always available,
+  regardless of the specific EHR approach.
+  If you _do_ need some data that is not available in the `core` tables,
+  make sure you look at the
+  [Creating SQL with python](creating-sql-with-python.md)
+  guide for help safely extracting datasets from the raw resource tables.
- Move queries to a file as you finalize them
- Build your study with the CLI to make sure your queries load correctly.

+#### sqlfluff
+
We use [sqlfluff](https://github.com/sqlfluff/sqlfluff) to help maintain a consistent
-style across many different SQL query authors. We recommend using sqlfluff as you
-are developing your queries to ensure your sql is matching the style of other
-authors. We copy over our sqlfluff rules when you use `cumulus-library` to create
+style across many different SQL query authors. We recommend using `sqlfluff` as you
+are developing your queries to ensure your SQL is matching the style of other
+authors. We copy over our `sqlfluff` rules when you use `cumulus-library` to create
a study, so no additional configuration should be needed.

There are two commands you will want to run inside your study's directory:
@@ -107,49 +115,48 @@
- `sqlfluff fix` will try to autocorrect your queries to match the expected
style

-In order to both make your queries parsable to other humans, and to have sqlfluff
+In order to both make your queries parsable to other humans, and to have `sqlfluff`
be maximally helpful, we have a few requirements and some suggestions for query
styling.

-**Hard Requirements**
+#### Hard Requirements

For all of these, Cumulus Library will notify you if you write a query that breaks
one of these rules when you build your study.
 - All your tables **must** start with a string like `my_study__`.
-  - Relatedly, **`__` is a reserved character string**. Your table names should have
-    exactly one of these. We :might: add other use cases for these in the future,
-    but as of this writing we don't plan to.
-  - We have **three reserved post-study prefrix substrings: `etl_`, `nlp_`, and
-    `lib_`** so that we don't drop tables created by other processes. These are fine
-    to use elsewhere; `my_study__nlp_counts` would cause an error, but
-    `my_study__counts_nlp` would be fine.
+  - Relatedly, `__` (two underscores) **is a reserved character string**.
+    Your table names should have exactly one of these.
+  - We have **three reserved table prefixes:** `etl_`, `nlp_`, and `lib_`.
+    These are fine to use elsewhere in the table name, just not at the beginning.
+    For example, `my_study__nlp_counts` would cause an error,
+    but `my_study__counts_nlp` would be fine.

-**Requirements for accepting PRs**
+#### Requirements for accepting PRs
- **Count tables must use the CUBE function** to create powersets of data. See the
-  [CUBE section of groupby](https://prestodb.io/docs/current/sql/select.html#group-by-clause)
-  for more information about this `groupby` type. The core and template projects
-  provide an example of its usage.
+  [CUBE section of the Presto docs](https://prestodb.io/docs/current/sql/select.html#group-by-clause)
+  for more information about this `GROUP BY` type.
+  The `core` and `template` projects contain examples; see also the sketch after this list.
- For PHI reverse identification protection, **exclude rows from count tables if
-  they have a small number of members**, i.e. less than 10.
+  they have a small number of members**, e.g. less than 10.
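Here's a small sketch of that CUBE pattern — every table and column name below is
hypothetical, stand-ins for tables your study would have built earlier:
```sql
-- CUBE emits one count row for every combination of the grouped columns,
-- using NULL where a column has been rolled up into a subtotal.
CREATE TABLE my_study__count_enc_demographics AS
SELECT
    count(DISTINCT subject_ref) AS cnt,
    gender,
    race_display
FROM my_study__enc_demographics
GROUP BY
    CUBE(gender, race_display)
HAVING count(DISTINCT subject_ref) >= 10;
```
The `HAVING` clause is one way to satisfy the small-cell exclusion rule from the
same list.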
-**Recommended**
+#### Recommended
 - You may want to select a SQL style guide as a reference. Mozilla provides a
   [SQL style guide](https://docs.telemetry.mozilla.org/concepts/sql_style.html),
   which our sqlfluff config enforces. If you have a different style you'd like
   to use, you can update the `.sqlfluff` config to allow this. For example,
   [Gitlab's data team](https://about.gitlab.com/handbook/business-technology/data-team/platform/sql-style-guide/)
-  has a style guide that is more centered around DBT, but is more perscriptive
+  has a style guide that is more centered around DBT, but is more prescriptive
   around formatting.
 - Don't implicitly reference a table's columns. Either use the full table name, or
   give the table an alias, and use that any time you are referencing a column.
-  - Don't use the * wildcard in your final tables. Explicitly list the columns
-    with table name/alias - sqlfluff has a hard time inferring what's going on otherwise.
-  - We are currently asking for all caps for sql keywords like SELECT and 4 space
-    nesting for queries. `sqlfluff fix` will apply this for you, but it may be easier
-    to find other problems if you lightly adhere to this from the start.
+  - Don't use the `*` wildcard in your final tables. Explicitly list the columns
+    with table name/alias - `sqlfluff` has a hard time inferring what's going on otherwise.
+  - We are currently asking for all caps for SQL keywords like `SELECT` and four-space
+    indentation for queries. `sqlfluff fix` will apply this for you, but it may be easier
+    to find other problems if you adhere to this from the start.
 - Aggregate count tables should have the first word after the study prefix be
-  `count`, and otherwise the word `count` should not be used.
+  `count_`, and otherwise the word `count` should not be used.

-**Metadata tables**
+#### Metadata tables
 - Creating a table called `my_study__meta_date` with two columns, `min_date`
   and `max_date`, and populating it with the start and end date of your study,
   will allow other Cumulus tools to detect study date ranges, and otherwise bakes the
   date range into your study in a machine detectable manner.
 - Creating a version table, like this:
   ```
   CREATE TABLE my_study__meta_version AS SELECT 1 AS data_package_version;
   ```
-  allows you to signal versions for use in segregating data upstream, like in the
-  Cumulus aggregator - just increment it when you will need third parties to rerun
-  your study from scratch due to a change in your counts output. If this is not
+  allows you to signal versions for use in segregating data downstream, like in the
+  Cumulus Aggregator. Increment it when your counts output changes format,
+  and thus third parties need to rerun your study from scratch. If this is not
   set, the version will implicitly be set to zero.
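For the `meta_date` table described above, a minimal definition might look like
this sketch — the dates are placeholders for your study's actual range, and the
`min_date`/`max_date` column names follow the convention noted above:
```sql
-- Illustrative dates - populate with your study's real start and end.
CREATE TABLE my_study__meta_date AS
SELECT
    DATE '2020-01-01' AS min_date,
    DATE '2023-12-31' AS max_date;
```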
## Testing studies
@@ -225,4 +232,4 @@
via the [discussion forum](https://github.com/smart-on-fhir/cumulus/discussions)
we can talk more about what makes sense for your use case.

If you write a paper using the Cumulus library, please
-[cite the project](https://smarthealthit.org/cumulus-a-universal-sidecar-for-a-smart-learning-healthcare-system/)
\ No newline at end of file
+[cite the project](https://smarthealthit.org/cumulus/)
diff --git a/docs/first-time-setup.md b/docs/first-time-setup.md
index adee07b1..c6c0f311 100644
--- a/docs/first-time-setup.md
+++ b/docs/first-time-setup.md
@@ -10,8 +10,8 @@ nav_order: 1

## Installation

-As a prerequisite, you'll need a copy of python 3.9 or later installed on
-your system, and you'll need access to an account with access to AWS cloud services.
+As a prerequisite, you'll need a copy of python 3.10 or later installed on
+your system, and you'll need access to an AWS cloud services account.

You can install directly from pypi by running `pip install cumulus-library`.
@@ -20,24 +20,25 @@
services. See the [AWS setup guide](./aws-setup.md) for more information on this topic.

## Command line usage

-Installing adds a `cumulus-library` command for interacting with athena.
+Installing adds a `cumulus-library` command for interacting with Athena.
It provides several actions for users:
- `create` will create a manifest file for you so you can start working on
-authoring queires (more information on this in
+authoring queries (more information on this in
[Creating studies](./creating-studies.md)).
- `build` will create new study tables, replacing previously created versions
(more information on this in [Creating studies](./creating-studies.md)).
- `clean` will remove studies from Athena, in case you no longer need them.
- `export` will output the data in the tables to both a `.csv` and
`.parquet` file. The former is intended for human review, while the latter is
-more compressed and should be preferred (if supported) for use when transmitting
-data/loading data into analytics packages.
-- `upload` will send data you exported to the cumulus aggregator
+more compressed and should be preferred (if supported) for use when
+loading data into analytics packages.
+- `upload` will send data you exported to the
+[Cumulus Aggregator](https://docs.smarthealthit.org/cumulus/aggregator/)

By default, all available studies will be used by build and export, but you can
use `--target` to specify a specific study to be run. You can use it multiple
-times to configure several studies in order. The `vocab`, in particular, can take a
+times to configure several studies in order. The `vocab` study, in particular, can take a
bit of time to generate, so we recommend using targets after your initial
configuration.

Several pip installable studies will automatically be added to the list of available
@@ -58,11 +59,11 @@ deploy in Amazon's US-East zone.
- Now we'll build the tables we'll need to run the template study. The `vocab`
study creates mappings of system codes to strings, and the `core` study creates
tables for commonly used base FHIR resources like `Patient` and `Observation`
-using that vocab. To do this, run the following command:
+using those `vocab` mappings. To do this, run the following command:
```bash
cumulus-library build --target vocab --target core
```
-This usually takes around five minutes, but once it's done, you won't need build
+This usually takes around five minutes, but once it's done, you won't need to build
`vocab` again unless there's a coding system addition, and you'll only need to
build `core` again if data changes.
You should see some progress bars like this while the tables are being created:
```
Uploading vocab__icd data... ━━━━━━━━━━━━━━━━━
Creating vocab study in db... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 0:00:00
Creating core study in db... ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 0:00:00
```
-- Now, we'll build the template study. Run a very similar command to target `template`:
+- Now, we'll build the built-in example `template` study.
+Run a very similar command, but targeting `template` this time:
```bash
cumulus-library build --target template
```
@@ -82,7 +84,7 @@
download designated study artifacts. To do the latter, run the following command:
```
cumulus-library export --target template ./path/to/my/data/dir/
```
And this will download some example count aggregates to the directory
-inside of this repository. There's only a few bins, but this will give you an idea
+you specified. There are only a few tables, but this will give you an idea
of what kind of output to expect. Here's the first few lines:
```
cnt,influenza_lab_code,influenza_result_display,influenza_test_month
diff --git a/docs/sharing-data.md b/docs/sharing-data.md
index e54214bb..5632d05d 100644
--- a/docs/sharing-data.md
+++ b/docs/sharing-data.md
@@ -28,23 +28,22 @@ before each export run.

## Uploading data to Cumulus Aggregator

-As part of the Cumulus framework, we have a
-[middleware application](https://github.com/smart-on-fhir/cumulus-aggregator/)
-configured to receive and combine datasets from multiple organizations, which can
-then be loaded into the [dashboard](https://github.com/smart-on-fhir/cumulus-app)
-for SME analysis. As of this writing these are not open source, but are intended
-to be in the near term.
+As part of the Cumulus framework, we have a middleware application called
+[Cumulus Aggregator](https://docs.smarthealthit.org/cumulus/aggregator/)
+configured to receive and combine datasets from multiple organizations,
+which can then be loaded into the dashboard for SME analysis.
+(As of this writing, that dashboard is not yet available as open source.)

We recommend configuring the following environment variables for using this
script:
- `CUMULUS_AGGREGATOR_USER`
- `CUMULUS_AGGREGATOR_ID`

-The administrator of your aggregator can help you with generating the values for
+The administrator of your Aggregator instance can help you with generating the values for
these variables; reach out to them for more details.

With these configured, running `cumulus-library upload` will send any exported
-data up to the defined aggregator instance. If you are doing something slightly
+data up to the defined Aggregator instance. If you are doing something slightly
more complex than participating in one clinical study with the main Cumulus
project, using the `--help` flag will give you some additional configuration
options that may help with your use case.
\ No newline at end of file
diff --git a/docs/study-list.md b/docs/study-list.md
index dbf231d9..7761fb24 100644
--- a/docs/study-list.md
+++ b/docs/study-list.md
@@ -14,7 +14,8 @@ The following studies are available to install via pip - check the repository
for more details and any associated publications.
### 2023
-[Covid symptoms] - (https://github.com/smart-on-fhir/cumulus-library-covid)
+- [Covid symptoms](https://github.com/smart-on-fhir/cumulus-library-covid)
+- [Suicidality length of stay](https://github.com/smart-on-fhir/cumulus-library-suicidality-los)

## Third party studies
diff --git a/docs/third-party-software-citations.md b/docs/third-party-software-citations.md
index a97c5e64..b39225a7 100644
--- a/docs/third-party-software-citations.md
+++ b/docs/third-party-software-citations.md
@@ -12,6 +12,6 @@ This file contains a list of third party software libraries associated with
publications that are used by Cumulus library. These are generally domain
specific to clinical research.

-## (PsmPy)[https://github.com/adriennekline/psmpy]
+## [PsmPy](https://github.com/adriennekline/psmpy)

A. Kline and Y. Luo, PsmPy: A Package for Retrospective Cohort Matching in Python, 2022 44th Annual International Conference of the IEEE Engineering in Medicine & Biology Society (EMBC), 2022, pp. 1354-1357, doi: 10.1109/EMBC48229.2022.9871333.