flowchart TB
v0(Channel.fromList)
v2(filter)
v10(filter)
v18(align_query_reference)
v25(cross)
v35(cross)
v41(filter)
v52(filter)
v64(cross)
v74(cross)
v84(concat)
v91(cross)
v101(cross)
v107(flatMap)
v111(filter)
v119(concatenate_h5mu)
v126(cross)
v136(cross)
v142(filter)
v150(highly_variable_features_scanpy)
v157(cross)
v167(cross)
v173(filter)
v181(delete_aligned_lognormalized_counts_layer)
v188(cross)
v198(cross)
v204(filter)
v212(filter)
v227(cross)
v237(cross)
v243(filter)
v397(concat)
v252(filter)
v267(cross)
v277(cross)
v286(branch)
v313(concat)
v298(cross)
v308(cross)
v317(branch)
v344(concat)
v329(cross)
v339(cross)
v348(branch)
v375(concat)
v360(cross)
v370(cross)
v382(cross)
v392(cross)
v404(cross)
v414(cross)
v420(filter)
v428(delete_aligned_counts_layer)
v435(cross)
v445(cross)
v451(filter)
v462(filter)
v474(cross)
v484(cross)
v494(concat)
v501(cross)
v511(cross)
v519(filter)
v527(knn)
v534(cross)
v544(cross)
v552(mix)
v555(filter)
v585(concat)
v563(merge)
v570(cross)
v580(cross)
v592(cross)
v599(cross)
v611(cross)
v618(cross)
v622(Output)
subgraph group_split_modalities [split_modalities]
v57(split_modalities_component)
end
subgraph group_scvi_leiden_workflow [scvi_leiden_workflow]
v220(scvi)
v260(find_neighbors)
v291(leiden)
v322(move_obsm_to_obs)
v353(umap)
end
subgraph group_split_h5mu [split_h5mu]
v467(split_h5mu_component)
end
v286-->v313
v317-->v344
v348-->v375
v0-->v2
v2-->v10
v10-->v18
v18-->v25
v10-->v25
v10-->v35
v52-->v57
v57-->v64
v52-->v64
v52-->v74
v84-->v91
v41-->v91
v41-->v101
v111-->v119
v119-->v126
v111-->v126
v111-->v136
v142-->v150
v150-->v157
v142-->v157
v142-->v167
v173-->v181
v181-->v188
v173-->v188
v173-->v198
v204-->v212
v212-->v220
v220-->v227
v212-->v227
v212-->v237
v243-->v252
v252-->v260
v260-->v267
v252-->v267
v252-->v277
v286-->v291
v291-->v298
v286-->v298
v286-->v308
v308-->v313
v317-->v322
v322-->v329
v317-->v329
v317-->v339
v339-->v344
v348-->v353
v353-->v360
v348-->v360
v348-->v370
v370-->v375
v375-->v382
v243-->v382
v243-->v392
v392-->v397
v397-->v404
v204-->v404
v204-->v414
v420-->v428
v428-->v435
v420-->v435
v420-->v445
v462-->v467
v467-->v474
v462-->v474
v462-->v484
v494-->v501
v451-->v501
v451-->v511
v519-->v527
v527-->v534
v519-->v534
v519-->v544
v555-->v563
v563-->v570
v555-->v570
v555-->v580
v580-->v585
v585-->v592
v2-->v592
v592-->v599
v2-->v599
v2-->v611
v611-->v618
v2-->v618
v618-->v622
v35-->v41
v18-->v35
v101-->v107
v41-->v52
v74-->v84
v57-->v74
v84-->v101
v107-->v111
v136-->v142
v119-->v136
v167-->v173
v150-->v167
v198-->v204
v181-->v198
v414-->v420
v237-->v243
v220-->v237
v260-->v277
v277-->v286
v291-->v308
v313-->v317
v322-->v339
v344-->v348
v353-->v370
v375-->v392
v397-->v414
v445-->v451
v428-->v445
v511-->v519
v451-->v462
v484-->v494
v467-->v484
v494-->v511
v544-->v552
v527-->v544
v107-->v552
v552-->v555
v563-->v580
v585-->v611
style group_split_modalities fill:#F0F0F0,stroke:#969696;
style group_scvi_leiden_workflow fill:#F0F0F0,stroke:#969696;
style group_split_h5mu fill:#F0F0F0,stroke:#969696;
style v0 fill:#e3dcea,stroke:#7a4baa;
style v2 fill:#e3dcea,stroke:#7a4baa;
style v10 fill:#e3dcea,stroke:#7a4baa;
style v18 fill:#e3dcea,stroke:#7a4baa;
style v25 fill:#e3dcea,stroke:#7a4baa;
style v35 fill:#e3dcea,stroke:#7a4baa;
style v41 fill:#e3dcea,stroke:#7a4baa;
style v52 fill:#e3dcea,stroke:#7a4baa;
style v57 fill:#e3dcea,stroke:#7a4baa;
style v64 fill:#e3dcea,stroke:#7a4baa;
style v74 fill:#e3dcea,stroke:#7a4baa;
style v84 fill:#e3dcea,stroke:#7a4baa;
style v91 fill:#e3dcea,stroke:#7a4baa;
style v101 fill:#e3dcea,stroke:#7a4baa;
style v107 fill:#e3dcea,stroke:#7a4baa;
style v111 fill:#e3dcea,stroke:#7a4baa;
style v119 fill:#e3dcea,stroke:#7a4baa;
style v126 fill:#e3dcea,stroke:#7a4baa;
style v136 fill:#e3dcea,stroke:#7a4baa;
style v142 fill:#e3dcea,stroke:#7a4baa;
style v150 fill:#e3dcea,stroke:#7a4baa;
style v157 fill:#e3dcea,stroke:#7a4baa;
style v167 fill:#e3dcea,stroke:#7a4baa;
style v173 fill:#e3dcea,stroke:#7a4baa;
style v181 fill:#e3dcea,stroke:#7a4baa;
style v188 fill:#e3dcea,stroke:#7a4baa;
style v198 fill:#e3dcea,stroke:#7a4baa;
style v204 fill:#e3dcea,stroke:#7a4baa;
style v212 fill:#e3dcea,stroke:#7a4baa;
style v220 fill:#e3dcea,stroke:#7a4baa;
style v227 fill:#e3dcea,stroke:#7a4baa;
style v237 fill:#e3dcea,stroke:#7a4baa;
style v243 fill:#e3dcea,stroke:#7a4baa;
style v397 fill:#e3dcea,stroke:#7a4baa;
style v252 fill:#e3dcea,stroke:#7a4baa;
style v260 fill:#e3dcea,stroke:#7a4baa;
style v267 fill:#e3dcea,stroke:#7a4baa;
style v277 fill:#e3dcea,stroke:#7a4baa;
style v286 fill:#e3dcea,stroke:#7a4baa;
style v313 fill:#e3dcea,stroke:#7a4baa;
style v291 fill:#e3dcea,stroke:#7a4baa;
style v298 fill:#e3dcea,stroke:#7a4baa;
style v308 fill:#e3dcea,stroke:#7a4baa;
style v317 fill:#e3dcea,stroke:#7a4baa;
style v344 fill:#e3dcea,stroke:#7a4baa;
style v322 fill:#e3dcea,stroke:#7a4baa;
style v329 fill:#e3dcea,stroke:#7a4baa;
style v339 fill:#e3dcea,stroke:#7a4baa;
style v348 fill:#e3dcea,stroke:#7a4baa;
style v375 fill:#e3dcea,stroke:#7a4baa;
style v353 fill:#e3dcea,stroke:#7a4baa;
style v360 fill:#e3dcea,stroke:#7a4baa;
style v370 fill:#e3dcea,stroke:#7a4baa;
style v382 fill:#e3dcea,stroke:#7a4baa;
style v392 fill:#e3dcea,stroke:#7a4baa;
style v404 fill:#e3dcea,stroke:#7a4baa;
style v414 fill:#e3dcea,stroke:#7a4baa;
style v420 fill:#e3dcea,stroke:#7a4baa;
style v428 fill:#e3dcea,stroke:#7a4baa;
style v435 fill:#e3dcea,stroke:#7a4baa;
style v445 fill:#e3dcea,stroke:#7a4baa;
style v451 fill:#e3dcea,stroke:#7a4baa;
style v462 fill:#e3dcea,stroke:#7a4baa;
style v467 fill:#e3dcea,stroke:#7a4baa;
style v474 fill:#e3dcea,stroke:#7a4baa;
style v484 fill:#e3dcea,stroke:#7a4baa;
style v494 fill:#e3dcea,stroke:#7a4baa;
style v501 fill:#e3dcea,stroke:#7a4baa;
style v511 fill:#e3dcea,stroke:#7a4baa;
style v519 fill:#e3dcea,stroke:#7a4baa;
style v527 fill:#e3dcea,stroke:#7a4baa;
style v534 fill:#e3dcea,stroke:#7a4baa;
style v544 fill:#e3dcea,stroke:#7a4baa;
style v552 fill:#e3dcea,stroke:#7a4baa;
style v555 fill:#e3dcea,stroke:#7a4baa;
style v585 fill:#e3dcea,stroke:#7a4baa;
style v563 fill:#e3dcea,stroke:#7a4baa;
style v570 fill:#e3dcea,stroke:#7a4baa;
style v580 fill:#e3dcea,stroke:#7a4baa;
style v592 fill:#e3dcea,stroke:#7a4baa;
style v599 fill:#e3dcea,stroke:#7a4baa;
style v611 fill:#e3dcea,stroke:#7a4baa;
style v618 fill:#e3dcea,stroke:#7a4baa;
style v622 fill:#e3dcea,stroke:#7a4baa;
scVI Annotation
Cell type annotation workflow that performs scVI integration of reference and query dataset followed by KNN label transfer.
Info
ID: scvi_knn
Namespace: workflows/annotation
Links
The query and reference datasets are expected to be pre-processed in the same way, for example with the process_samples workflow of OpenPipeline. Note that this workflow will integrate the reference dataset from scratch and integrate the query dataset in the same embedding space. The workflow does not currently output the trained scVI reference model.
Example commands
You can run the pipeline using nextflow run.
View help
You can use --help as a parameter to get an overview of the possible parameters.
nextflow run openpipelines-bio/openpipeline \
-r 2.1.1 -latest \
-main-script target/nextflow/workflows/annotation/scvi_knn/main.nf \
--help
Run command
Example of params.yaml
# Query Input
id: # please fill in - example: "foo"
input: # please fill in - example: "input.h5mu"
modality: "rna"
# input_layer: "counts"
# input_layer_lognormalized: "log_normalized"
input_obs_batch_label: # please fill in - example: "sample"
# input_var_gene_names: "foo"
input_reference_gene_overlap: 100
overwrite_existing_key: false
# Reference input
reference: # please fill in - example: "reference.h5mu"
# reference_layer: "counts"
# reference_layer_lognormalized: "log_normalized"
reference_obs_target: # please fill in - example: "cell_type"
# reference_var_gene_names: "foo"
reference_obs_batch_label: # please fill in - example: "sample"
# HVG subset arguments
n_hvg: 2000
# scVI integration options
# scvi_early_stopping: true
scvi_early_stopping_monitor: "elbo_validation"
scvi_early_stopping_patience: 45
scvi_early_stopping_min_delta: 0.0
# scvi_max_epochs: 123
scvi_reduce_lr_on_plateau: true
scvi_lr_factor: 0.6
scvi_lr_patience: 30.0
# Leiden clustering options
leiden_resolution: [1.0]
# Neighbor classifier arguments
knn_weights: "uniform"
knn_n_neighbors: 15
# Outputs
# output: "$id.$key.output.h5mu"
# output_obs_predictions: ["foo"]
# output_obs_probability: ["foo"]
output_obsm_integrated: "X_integrated_scvi"
# Nextflow input-output arguments
publish_dir: # please fill in - example: "output/"
# param_list: "my_params.yaml"
# Arguments
nextflow run openpipelines-bio/openpipeline \
-r 2.1.1 -latest \
-profile docker \
-main-script target/nextflow/workflows/annotation/scvi_knn/main.nf \
-params-file params.yaml
Note
Replace -profile docker with -profile podman or -profile singularity depending on the desired backend.
Argument groups
Query Input
| Name | Description | Attributes |
|---|---|---|
| --id | ID of the sample. | string, required, example: "foo" |
| --input | Input dataset consisting of the (unlabeled) query observations. | file, required, example: "input.h5mu" |
| --modality | Which modality to process. Should match the modality of the --reference dataset. | string, default: "rna" |
| --input_layer | The layer of the input dataset containing the raw counts if .X is not to be used. | string, example: "counts" |
| --input_layer_lognormalized | The layer of the input dataset containing the lognormalized counts if .X is not to be used. | string, example: "log_normalized" |
| --input_obs_batch_label | The .obs field in the input (query) dataset containing the batch labels. | string, required, example: "sample" |
| --input_var_gene_names | The .var field in the input (query) dataset containing gene names; if not provided, the .var index will be used. | string |
| --input_reference_gene_overlap | The minimum number of genes present in both the reference and query datasets. | integer, default: 100 |
| --overwrite_existing_key | If provided, will overwrite existing fields in the input dataset when data are copied during the reference alignment process. | boolean_true |
Reference input
| Name | Description | Attributes |
|---|---|---|
| --reference | Reference dataset consisting of the labeled observations. | file, required, example: "reference.h5mu" |
| --reference_layer | The layer of the reference dataset containing the raw counts if .X is not to be used. | string, example: "counts" |
| --reference_layer_lognormalized | The layer of the reference dataset containing the lognormalized counts if .X is not to be used. | string, example: "log_normalized" |
| --reference_obs_target | The .obs key(s) of the target labels to transfer. | string, required, example: "cell_type" |
| --reference_var_gene_names | The .var field in the reference dataset containing gene names; if not provided, the .var index will be used. | string |
| --reference_obs_batch_label | The .obs field in the reference dataset containing the batch labels. | string, required, example: "sample" |
HVG subset arguments
| Name | Description | Attributes |
|---|---|---|
| --n_hvg | Number of highly variable genes to subset for. | integer, default: 2000 |
scVI integration options
| Name | Description | Attributes |
|---|---|---|
| --scvi_early_stopping | Whether to perform early stopping with respect to the validation set. | boolean |
| --scvi_early_stopping_monitor | Metric logged during validation set epoch. | string, default: "elbo_validation" |
| --scvi_early_stopping_patience | Number of validation epochs with no improvement after which training will be stopped. | integer, default: 45 |
| --scvi_early_stopping_min_delta | Minimum change in the monitored quantity to qualify as an improvement, i.e. an absolute change of less than min_delta will count as no improvement. | double, default: 0 |
| --scvi_max_epochs | Number of passes through the dataset, defaults to (20000 / number of cells) * 400 or 400, whichever is smaller. | integer |
| --scvi_reduce_lr_on_plateau | Whether to monitor validation loss and reduce learning rate when validation set lr_scheduler_metric plateaus. | boolean, default: TRUE |
| --scvi_lr_factor | Factor to reduce learning rate. | double, default: 0.6 |
| --scvi_lr_patience | Number of epochs with no improvement after which learning rate will be reduced. | double, default: 30 |
Leiden clustering options
| Name | Description | Attributes |
|---|---|---|
| --leiden_resolution | Control the coarseness of the clustering. Higher values lead to more clusters. | List of double, default: 1, multiple_sep: ";" |
Neighbor classifier arguments
| Name | Description | Attributes |
|---|---|---|
| --knn_weights | Weight function used in prediction. Possible values are: uniform (all points in each neighborhood are weighted equally) or distance (weight points by the inverse of their distance). | string, default: "uniform" |
| --knn_n_neighbors | The number of neighbors to use in k-neighbor graph structure used for fast approximate nearest neighbor search with PyNNDescent. Larger values will result in more accurate search results at the cost of computation time. | integer, default: 15 |
Outputs
| Name | Description | Attributes |
|---|---|---|
| --output | The query data in .h5mu format with labels predicted by the classifier trained on the reference. | file, required, example: "output.h5mu" |
| --output_obs_predictions | In which .obs slots to store the predicted cell labels. If provided, must have the same length as --reference_obs_target. If empty, will default to the --reference_obs_target values combined with the "_pred" suffix. | List of string, multiple_sep: ";" |
| --output_obs_probability | In which .obs slots to store the probability of the predictions. If provided, must have the same length as --reference_obs_target. If empty, will default to the --reference_obs_target values combined with the "_probability" suffix. | List of string, multiple_sep: ";" |
| --output_obsm_integrated | In which .obsm slot to store the integrated embedding. | string, default: "X_integrated_scvi" |