diff --git a/README.md b/README.md
index a0a382e..301a691 100644
--- a/README.md
+++ b/README.md
@@ -28,9 +28,9 @@ should convince readers of the significance and relevance of your task.
## Authors & contributors
-| Name | Roles | Orcid | Twitter | Github | Email | Linkedin |
+| Name | Roles | Twitter | Email | Orcid | Github | Linkedin |
|:---|:---|:---|:---|:---|:---|:---|
-| John Doe | author, maintainer | 0000-0000-0000-0000 | johndoe | johndoe | john@doe.me | johndoe |
+| John Doe | author, maintainer | johndoe | john@doe.me | 0000-0000-0000-0000 | johndoe | johndoe |
## API
@@ -38,25 +38,32 @@ should convince readers of the significance and relevance of your task.
flowchart TB
file_common_ist("Common iST Dataset")
comp_data_processor[/"Data processor"/]
- file_spatial_dataset("Raw iST Dataset")
+ file_spatial_unlabelled("Unlabelled Spatial Dataset")
+ file_spatial_solution("Spatial Segmentation Solution")
file_scrnaseq_reference("scRNA-seq Reference")
comp_control_method[/"Control Method"/]
comp_method[/"Method"/]
+ comp_output_processor[/"Output processor"/]
comp_metric[/"Metric"/]
file_prediction("Predicted data")
+ file_processed_prediction("Processed prediction")
file_score("Score")
file_common_scrnaseq("Common SC Dataset")
file_common_ist---comp_data_processor
- comp_data_processor-->file_spatial_dataset
+ comp_data_processor-->file_spatial_unlabelled
+ comp_data_processor-->file_spatial_solution
comp_data_processor-->file_scrnaseq_reference
- file_spatial_dataset---comp_control_method
- file_spatial_dataset---comp_method
- file_scrnaseq_reference---comp_control_method
- file_scrnaseq_reference---comp_metric
+ file_spatial_unlabelled---comp_control_method
+ file_spatial_unlabelled---comp_method
+ file_spatial_unlabelled---comp_output_processor
+ file_spatial_solution---comp_control_method
+ file_spatial_solution---comp_metric
comp_control_method-->file_prediction
comp_method-->file_prediction
+ comp_output_processor-->file_processed_prediction
comp_metric-->file_score
- file_prediction---comp_metric
+ file_prediction---comp_output_processor
+ file_processed_prediction---comp_metric
file_common_scrnaseq---comp_data_processor
```
@@ -175,22 +182,24 @@ Arguments:
|:---|:---|:---|
| `--input_sp` | `file` | An unprocessed spatial imaging dataset stored as a zarr file. |
| `--input_sc` | `file` | An unprocessed dataset as output by a dataset loader. |
-| `--output_spatial_dataset` | `file` | (*Output*) A spatial transcriptomics dataset, preprocessed for this benchmark. |
+| `--output_spatial_unlabelled` | `file` | (*Output*) A spatial transcriptomics dataset, preprocessed for this benchmark. |
+| `--output_spatial_solution` | `file` | (*Output*) Ground truth segmentation labels for evaluating spatial segmentation methods. |
| `--output_scrnaseq_reference` | `file` | (*Output*) A single-cell reference dataset, preprocessed for this benchmark. |
-## File format: Raw iST Dataset
+## File format: Unlabelled Spatial Dataset
A spatial transcriptomics dataset, preprocessed for this benchmark.
Example file:
-`resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_dataset.zarr`
+`resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_unlabelled.zarr`
Description:
-This dataset contains preprocessed images, labels, points, shapes, and
-tables for spatial transcriptomics data.
+This dataset contains preprocessed images and transcript point clouds
+for spatial transcriptomics data. Ground truth segmentation labels are
+intentionally excluded to prevent methods from cheating.
Format:
@@ -198,9 +207,7 @@ Format:
SpatialData object
images: 'morphology_mip'
- labels: 'cell_labels', 'nucleus_labels'
points: 'transcripts'
- shapes: 'cell_boundaries', 'nucleus_boundaries'
tables: 'table'
coordinate_systems: 'global'
@@ -212,16 +219,9 @@ Data structure:
*images*
-| Name | Description |
-|:-----------------|:--------------------|
-| `morphology_mip` | The raw image data. |
-
-*labels*
-
-| Name | Description |
-|:-----------------|:---------------------------------------|
-| `cell_labels` | (*Optional*) Cell segmentation labels. |
-| `nucleus_labels` | (*Optional*) Cell segmentation labels. |
+| Name | Description |
+|:-----------------|:---------------------------------------------------------|
+| `morphology_mip` | The raw morphology image (maximum intensity projection). |
*points*
@@ -233,26 +233,9 @@ Data structure:
| `y` | `float` | y-coordinate of the point. |
| `z` | `float` | (*Optional*) z-coordinate of the point. |
| `feature_name` | `categorical` | Name of the feature. |
-| `cell_id` | `integer` | (*Optional*) Unique identifier of the cell. |
-| `nucleus_id` | `integer` | (*Optional*) Unique identifier of the nucleus. |
-| `cell_type` | `string` | (*Optional*) Cell type of the cell. |
| `qv` | `float` | (*Optional*) Quality value of the point. |
| `transcript_id` | `long` | Unique identifier of the transcript. |
-| `overlaps_nucleus` | `boolean` | (*Optional*) Whether the point overlaps with a nucleus. |
-
-*shapes*
-
-`cell_boundaries`: Cell boundaries.
-
-| Column | Type | Description |
-|:-----------|:---------|:-------------------------------|
-| `geometry` | `object` | Geometry of the cell boundary. |
-
-`nucleus_boundaries`: Nucleus boundaries.
-
-| Column | Type | Description |
-|:-----------|:---------|:----------------------------------|
-| `geometry` | `object` | Geometry of the nucleus boundary. |
+| `overlaps_nucleus` | `boolean` | (*Optional*) Whether the point overlaps with the nucleus (derived from morphology). |
*tables*
@@ -260,10 +243,6 @@ Data structure:
| Slot | Type | Description |
|:---|:---|:---|
-| `obs["cell_id"]` | `string` | A unique identifier for the cell. |
-| `var["gene_ids"]` | `string` | Unique identifier for the gene. |
-| `var["feature_types"]` | `string` | Type of the feature. |
-| `obsm["spatial"]` | `double` | Spatial coordinates of the cell. |
| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. |
| `uns["dataset_name"]` | `string` | A human-readable name for the dataset. |
| `uns["dataset_url"]` | `string` | Link to the original source of the dataset. |
@@ -271,7 +250,7 @@ Data structure:
| `uns["dataset_summary"]` | `string` | Short description of the dataset. |
| `uns["dataset_description"]` | `string` | Long description of the dataset. |
| `uns["dataset_organism"]` | `string` | The organism of the sample in the dataset. |
-| `uns["segmentation_id"]` | `string` | A unique identifier for the segmentation. |
+| `uns["orig_dataset_id"]` | `string` | The identifier of the original dataset from which this dataset was derived (if applicable). |
*coordinate_systems*
@@ -281,6 +260,71 @@ Data structure:
+## File format: Spatial Segmentation Solution
+
+Ground truth segmentation labels for evaluating spatial segmentation
+methods.
+
+Example file:
+`resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_solution.zarr`
+
+Description:
+
+This dataset contains the ground truth cell and nucleus segmentation
+labels, cell boundaries, and a reference table matching each cell to its
+label region.
+
+Format:
+
+
+
+ SpatialData object
+ labels: 'cell_labels', 'nucleus_labels'
+ shapes: 'cell_boundaries', 'nucleus_boundaries'
+ tables: 'table'
+
+
+
+Data structure:
+
+
+
+*labels*
+
+| Name | Description |
+|:-----------------|:-------------------------------------------------------|
+| `cell_labels` | Ground truth cell segmentation labels. |
+| `nucleus_labels` | (*Optional*) Ground truth nucleus segmentation labels. |
+
+*shapes*
+
+`cell_boundaries`: Ground truth cell boundary shapes.
+
+| Column | Type | Description |
+|:-----------|:---------|:-------------------------------|
+| `geometry` | `object` | Geometry of the cell boundary. |
+
+`nucleus_boundaries`: Ground truth nucleus boundary shapes.
+
+| Column | Type | Description |
+|:-----------|:---------|:----------------------------------|
+| `geometry` | `object` | Geometry of the nucleus boundary. |
+
+*tables*
+
+`table`: Reference cell metadata table.
+
+| Slot | Type | Description |
+|:---|:---|:---|
+| `obs["cell_id"]` | `integer` | Unique cell identifier, matching instance IDs in the label images. |
+| `obs["region"]` | `string` | Name of the label image this cell belongs to (e.g. ‘cell_labels’). |
+| `obs["cell_area"]` | `double` | (*Optional*) Area of the cell in pixels. |
+| `obs["transcript_counts"]` | `integer` | (*Optional*) Total number of transcripts assigned to this cell. |
+| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. |
+| `uns["orig_dataset_id"]` | `string` | The identifier of the original dataset from which this dataset was derived (if applicable). |
+
+
+
## File format: scRNA-seq Reference
A single-cell reference dataset, preprocessed for this benchmark.
@@ -347,7 +391,7 @@ Arguments:
| Name | Type | Description |
|:---|:---|:---|
| `--input` | `file` | A spatial transcriptomics dataset, preprocessed for this benchmark. |
-| `--input_scrnaseq_reference` | `file` | A single-cell reference dataset, preprocessed for this benchmark. |
+| `--input_solution` | `file` | Ground truth segmentation labels for evaluating spatial segmentation methods. |
| `--output` | `file` | (*Output*) A predicted dataset as output by a method. |
@@ -367,6 +411,22 @@ Arguments:
+## Component type: Output processor
+
+An output processor for the prediction.
+
+Arguments:
+
+
+
+| Name | Type | Description |
+|:---|:---|:---|
+| `--input_prediction` | `file` | A predicted dataset as output by a method. |
+| `--input_spatial_unlabelled` | `file` | A spatial transcriptomics dataset, preprocessed for this benchmark. |
+| `--output` | `file` | (*Output*) A processed predicted dataset, ready to be used as input for the evaluation. |
+
+
+
## Component type: Metric
A task template metric.
@@ -377,8 +437,8 @@ Arguments:
| Name | Type | Description |
|:---|:---|:---|
-| `--input_prediction` | `file` | A predicted dataset as output by a method. |
-| `--input_scrnaseq_reference` | `file` | A single-cell reference dataset, preprocessed for this benchmark. |
+| `--input_prediction` | `file` | A processed predicted dataset, ready to be used as input for the evaluation. |
+| `--input_solution` | `file` | Ground truth segmentation labels for evaluating spatial segmentation methods. |
| `--output` | `file` | (*Output*) File indicating the score of a metric. |
@@ -388,7 +448,7 @@ Arguments:
A predicted dataset as output by a method.
Example file:
-`resources_test/task_spatial_segmentation/mouse_brain_combined/prediction.h5ad`
+`resources_test/task_spatial_segmentation/mouse_brain_combined/prediction.zarr`
Format:
@@ -416,13 +476,59 @@ Data structure:
| Slot | Type | Description |
|:--------------------|:---------|:-------------------------------------|
-| `obs["cell_id"]` | `string` | Cell ID. |
-| `obs["region"]` | `string` | Region. |
| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. |
| `uns["method_id"]` | `string` | A unique identifier for the method. |
+## File format: Processed prediction
+
+A processed predicted dataset, ready to be used as input for the
+evaluation.
+
+Example file:
+`resources_test/task_spatial_segmentation/mouse_brain_combined/processed_prediction.zarr`
+
+Format:
+
+
+
+ SpatialData object
+ labels: 'segmentation'
+ tables: 'table'
+
+
+
+Data structure:
+
+
+
+*labels*
+
+| Name | Description |
+|:---------------|:--------------------------|
+| `segmentation` | Segmentation of the data. |
+
+*tables*
+
+`table`: AnnData table.
+
+| Slot | Type | Description |
+|:---|:---|:---|
+| `obs["cell_id"]` | `string` | Cell ID. |
+| `obs["region"]` | `string` | Region. |
+| `var["feature_id"]` | `string` | (*Optional*) Unique identifier for the feature, usually a ENSEMBL gene id. |
+| `var["feature_name"]` | `string` | A human-readable name for the feature, usually a gene symbol. |
+| `var["hvg"]` | `boolean` | Whether or not the feature is considered to be a ‘highly variable gene’. |
+| `layers["counts"]` | `integer` | Raw counts. |
+| `layers["normalized"]` | `double` | Normalized expression values. |
+| `layers["normalized_log"]` | `double` | Log1p normalized expression values. |
+| `layers["normalized_log_scaled"]` | `double` | Log1p normalized expression values scaled to unit variance and zero mean. |
+| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. |
+| `uns["method_id"]` | `string` | A unique identifier for the method. |
+
+
+
## File format: Score
File indicating the score of a metric.
diff --git a/common b/common
index 91a35a2..f0816e1 160000
--- a/common
+++ b/common
@@ -1 +1 @@
-Subproject commit 91a35a2e808da7029e222456c0e3ca70ab3a06dd
+Subproject commit f0816e178a2b44749fdfb1a9cdfc76887dcf7462
diff --git a/scripts/create_resources/resources.sh b/scripts/create_resources/resources.sh
index 52ee226..4a921f8 100755
--- a/scripts/create_resources/resources.sh
+++ b/scripts/create_resources/resources.sh
@@ -16,9 +16,9 @@ exit 1
cat > /tmp/params.yaml << 'HERE'
input_states: s3://openproblems-data/resources/datasets/**/state.yaml
-rename_keys: 'input:output_dataset'
+rename_keys: 'input_spatial_unlabelled:output_spatial_unlabelled,input_spatial_solution:output_spatial_solution,input_scrnaseq_reference:output_scrnaseq_reference'
output_state: '$id/state.yaml'
-settings: '{"output_spatial_dataset": "$id/output_spatial_dataset.zarr", "output_scrnaseq": "$id/output_scrnaseq.h5ad"}'
+settings: '{"output_spatial_unlabelled": "$id/output_spatial_unlabelled.zarr", "output_spatial_solution": "$id/output_spatial_solution.zarr", "output_scrnaseq_reference": "$id/output_scrnaseq_reference.h5ad"}'
publish_dir: s3://openproblems-data/resources/task_template/datasets/
HERE
diff --git a/scripts/create_resources/test_resources.sh b/scripts/create_resources/test_resources.sh
index b11d437..7f5c0e5 100755
--- a/scripts/create_resources/test_resources.sh
+++ b/scripts/create_resources/test_resources.sh
@@ -24,7 +24,8 @@ mkdir -p $DATASET_DIR
viash run src/data_processors/process_dataset/config.vsh.yaml -- \
--input_sp $RAW_DATA/2023_10x_mouse_brain_xenium_rep1/dataset.zarr \
--input_sc $RAW_DATA/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad \
- --output_spatial_dataset $DATASET_DIR/spatial_dataset.zarr \
+ --output_spatial_unlabelled $DATASET_DIR/spatial_unlabelled.zarr \
+ --output_spatial_solution $DATASET_DIR/spatial_solution.zarr \
--output_scrnaseq_reference $DATASET_DIR/scrnaseq_reference.h5ad \
--dataset_id mouse_brain_combined \
--dataset_name "Test data mouse brain combined 2023 tenx Xenium replicate 1 2023 Yao scRNAseq" \
@@ -36,8 +37,14 @@ viash run src/data_processors/process_dataset/config.vsh.yaml -- \
# run one method
viash run src/methods/cellpose/config.vsh.yaml -- \
- --input $DATASET_DIR/spatial_dataset.zarr \
- --output $DATASET_DIR/prediction.h5ad
+ --input $DATASET_DIR/spatial_unlabelled.zarr \
+ --output $DATASET_DIR/prediction.zarr
+
+# run prediction processor
+viash run src/data_processors/process_prediction/config.vsh.yaml -- \
+ --input_prediction $DATASET_DIR/prediction.zarr \
+ --input_spatial_unlabelled $DATASET_DIR/spatial_unlabelled.zarr \
+ --output $DATASET_DIR/processed_prediction.zarr
# run one metric
# TODO: implement this!
@@ -49,9 +56,11 @@ viash run src/methods/cellpose/config.vsh.yaml -- \
# write manual state.yaml. this is not actually necessary but you never know it might be useful
cat > $DATASET_DIR/state.yaml << HERE
id: $DATASET_ID
-spatial_dataset: spatial_dataset.zarr
+spatial_unlabelled: spatial_unlabelled.zarr
+spatial_solution: spatial_solution.zarr
scrnaseq_reference: scrnaseq_reference.h5ad
-prediction: prediction.h5ad
+prediction: prediction.zarr
+processed_prediction: processed_prediction.zarr
score: score.h5ad
HERE
diff --git a/scripts/run_benchmark/run_full_local.sh b/scripts/run_benchmark/run_full_local.sh
index 26bba56..808df7f 100755
--- a/scripts/run_benchmark/run_full_local.sh
+++ b/scripts/run_benchmark/run_full_local.sh
@@ -31,7 +31,7 @@ publish_dir="resources/results/${RUN_ID}"
# write the parameters to file
cat > /tmp/params.yaml << HERE
input_states: resources/datasets/**/state.yaml
-rename_keys: 'input_spatial_dataset:output_spatial_dataset,input_scrnaseq_reference:output_scrnaseq_reference'
+rename_keys: 'input_spatial_unlabelled:output_spatial_unlabelled,input_spatial_solution:output_spatial_solution,input_scrnaseq_reference:output_scrnaseq_reference'
output_state: "state.yaml"
publish_dir: "$publish_dir"
HERE
diff --git a/scripts/run_benchmark/run_full_seqeracloud.sh b/scripts/run_benchmark/run_full_seqeracloud.sh
index 3c31e74..745aa77 100755
--- a/scripts/run_benchmark/run_full_seqeracloud.sh
+++ b/scripts/run_benchmark/run_full_seqeracloud.sh
@@ -23,7 +23,7 @@ publish_dir="s3://openproblems-data/resources/task_template/results/${RUN_ID}"
# write the parameters to file
cat > /tmp/params.yaml << HERE
input_states: s3://openproblems-data/resources/task_template/datasets/**/state.yaml
-rename_keys: 'input_spatial_dataset:output_spatial_dataset,input_scrnaseq_reference:output_scrnaseq_reference'
+rename_keys: 'input_spatial_unlabelled:output_spatial_unlabelled,input_spatial_solution:output_spatial_solution,input_scrnaseq_reference:output_scrnaseq_reference'
output_state: "state.yaml"
publish_dir: "$publish_dir"
HERE
diff --git a/src/api/comp_control_method.yaml b/src/api/comp_control_method.yaml
index 3f4fa2e..990602d 100644
--- a/src/api/comp_control_method.yaml
+++ b/src/api/comp_control_method.yaml
@@ -13,11 +13,11 @@ info:
in the task.
arguments:
- name: --input
- __merge__: file_spatial_dataset.yaml
+ __merge__: file_spatial_unlabelled.yaml
required: true
direction: input
- - name: "--input_scrnaseq_reference"
- __merge__: file_scrnaseq_reference.yaml
+ - name: "--input_solution"
+ __merge__: file_spatial_solution.yaml
direction: input
required: true
- name: --output
diff --git a/src/api/comp_data_processor.yaml b/src/api/comp_data_processor.yaml
index ecd3f9c..69deecc 100644
--- a/src/api/comp_data_processor.yaml
+++ b/src/api/comp_data_processor.yaml
@@ -19,8 +19,12 @@ argument_groups:
direction: input
- name: Outputs
arguments:
- - name: "--output_spatial_dataset"
- __merge__: file_spatial_dataset.yaml
+ - name: "--output_spatial_unlabelled"
+ __merge__: file_spatial_unlabelled.yaml
+ direction: output
+ required: true
+ - name: "--output_spatial_solution"
+ __merge__: file_spatial_solution.yaml
direction: output
required: true
- name: "--output_scrnaseq_reference"
@@ -80,4 +84,3 @@ test_resources:
dest: resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2
- type: python_script
path: /common/component_tests/run_and_check_output.py
-
diff --git a/src/api/comp_method.yaml b/src/api/comp_method.yaml
index 633a8a1..2acf7ef 100644
--- a/src/api/comp_method.yaml
+++ b/src/api/comp_method.yaml
@@ -8,7 +8,7 @@ info:
A method to predict the task effects.
arguments:
- name: --input
- __merge__: file_spatial_dataset.yaml
+ __merge__: file_spatial_unlabelled.yaml
required: true
direction: input
- name: --output
diff --git a/src/api/comp_metric.yaml b/src/api/comp_metric.yaml
index a7470e9..6102919 100644
--- a/src/api/comp_metric.yaml
+++ b/src/api/comp_metric.yaml
@@ -8,11 +8,11 @@ info:
A metric for evaluating method predictions.
arguments:
- name: "--input_prediction"
- __merge__: file_prediction.yaml
+ __merge__: file_processed_prediction.yaml
direction: input
required: true
- - name: "--input_scrnaseq_reference"
- __merge__: file_scrnaseq_reference.yaml
+ - name: "--input_solution"
+ __merge__: file_spatial_solution.yaml
direction: input
required: true
- name: "--output"
diff --git a/src/api/comp_output_processor.yaml b/src/api/comp_output_processor.yaml
new file mode 100644
index 0000000..a5db8ec
--- /dev/null
+++ b/src/api/comp_output_processor.yaml
@@ -0,0 +1,30 @@
+namespace: "data_processors"
+info:
+ type: data_processor
+ type_info:
+ label: Output processor
+ summary: An output processor for the prediction.
+ description: |
+    A component for converting a prediction dataset into a processed prediction dataset that can be evaluated by the metrics.
+argument_groups:
+ - name: Inputs
+ arguments:
+ - name: "--input_prediction"
+ __merge__: file_prediction.yaml
+ required: true
+ direction: input
+ - name: "--input_spatial_unlabelled"
+ __merge__: file_spatial_unlabelled.yaml
+ required: true
+ direction: input
+ - name: Outputs
+ arguments:
+ - name: "--output"
+ __merge__: file_processed_prediction.yaml
+ direction: output
+ required: true
+test_resources:
+ - type: python_script
+ path: /common/component_tests/run_and_check_output.py
+ - path: /resources_test/task_spatial_segmentation/mouse_brain_combined
+ dest: resources_test/task_spatial_segmentation/mouse_brain_combined
diff --git a/src/api/file_prediction.yaml b/src/api/file_prediction.yaml
index b1fc443..e72794b 100644
--- a/src/api/file_prediction.yaml
+++ b/src/api/file_prediction.yaml
@@ -1,6 +1,6 @@
#TODO: Change to the required and/or optional fields of the anndata
type: file
-example: "resources_test/task_spatial_segmentation/mouse_brain_combined/prediction.h5ad"
+example: "resources_test/task_spatial_segmentation/mouse_brain_combined/prediction.zarr"
label: "Predicted data"
summary: A predicted dataset as output by a method.
info:
@@ -16,15 +16,6 @@ info:
name: table
description: AnnData table
required: true
- obs:
- - type: string
- name: cell_id
- description: Cell ID
- required: true
- - type: string
- name: region
- description: Region
- required: true
uns:
- type: string
name: dataset_id
diff --git a/src/api/file_processed_prediction.yaml b/src/api/file_processed_prediction.yaml
new file mode 100644
index 0000000..77dee47
--- /dev/null
+++ b/src/api/file_processed_prediction.yaml
@@ -0,0 +1,67 @@
+type: file
+example: "resources_test/task_spatial_segmentation/mouse_brain_combined/processed_prediction.zarr"
+label: "Processed prediction"
+summary: A processed predicted dataset, ready to be used as input for the evaluation.
+info:
+ format:
+ type: spatialdata_zarr
+ labels:
+ - type: object
+ name: "segmentation"
+ description: Segmentation of the data
+ required: true
+ tables:
+ - type: anndata
+ name: table
+ description: AnnData table
+ required: true
+ # TODO: what is it that this component adds to the anndata?
+ layers:
+ - type: integer
+ name: counts
+ description: Raw counts
+ required: true
+ - type: double
+ name: normalized
+ description: Normalized expression values
+ required: true
+ - type: double
+ name: normalized_log
+ description: Log1p normalized expression values
+ required: true
+ - type: double
+ name: normalized_log_scaled
+ description: Log1p normalized expression values scaled to unit variance and zero mean
+ required: true
+ obs:
+ - type: string
+ name: cell_id
+ description: Cell ID
+ required: true
+ - type: string
+ name: region
+ description: Region
+ required: true
+ # .... cell info ... ?
+ var:
+ - type: string
+ name: feature_id
+ description: Unique identifier for the feature, usually a ENSEMBL gene id.
+ required: false
+ - type: string
+ name: feature_name
+ description: A human-readable name for the feature, usually a gene symbol.
+ required: true
+ - type: boolean
+ name: hvg
+ description: Whether or not the feature is considered to be a 'highly variable gene'
+ required: true
+ uns:
+ - type: string
+ name: dataset_id
+ description: "A unique identifier for the dataset"
+ required: true
+ - type: string
+ name: method_id
+ description: "A unique identifier for the method"
+ required: true
diff --git a/src/api/file_scrnaseq_reference.yaml b/src/api/file_scrnaseq_reference.yaml
index 9b855fd..c1da992 100644
--- a/src/api/file_scrnaseq_reference.yaml
+++ b/src/api/file_scrnaseq_reference.yaml
@@ -1,7 +1,5 @@
type: file
example: "resources_test/task_spatial_segmentation/mouse_brain_combined/scrnaseq_reference.h5ad"
-# TODO: revert to the original example once file exists
-# example: "resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_dataset.h5ad"
label: "scRNA-seq Reference"
summary: A single-cell reference dataset, preprocessed for this benchmark.
description: |
diff --git a/src/api/file_spatial_solution.yaml b/src/api/file_spatial_solution.yaml
new file mode 100644
index 0000000..84944dd
--- /dev/null
+++ b/src/api/file_spatial_solution.yaml
@@ -0,0 +1,69 @@
+type: file
+example: "resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_solution.zarr"
+label: "Spatial Segmentation Solution"
+summary: Ground truth segmentation labels for evaluating spatial segmentation methods.
+description: |
+ This dataset contains the ground truth cell and nucleus segmentation labels,
+ cell boundaries, and a reference table matching each cell to its label region.
+info:
+ format:
+ type: spatialdata_zarr
+ labels:
+ - type: object
+ name: "cell_labels"
+ description: Ground truth cell segmentation labels
+ required: true
+ - type: object
+ name: "nucleus_labels"
+ description: Ground truth nucleus segmentation labels
+ required: false
+ shapes:
+ - type: dataframe
+ name: "cell_boundaries"
+ description: Ground truth cell boundary shapes
+ required: false
+ columns:
+ - type: object
+ name: "geometry"
+ required: true
+ description: Geometry of the cell boundary
+ - type: dataframe
+ name: "nucleus_boundaries"
+ description: Ground truth nucleus boundary shapes
+ required: false
+ columns:
+ - type: object
+ name: "geometry"
+ required: true
+ description: Geometry of the nucleus boundary
+ tables:
+ - type: anndata
+ name: "table"
+ description: Reference cell metadata table
+ required: true
+ obs:
+ - type: integer
+ name: cell_id
+ description: Unique cell identifier, matching instance IDs in the label images
+ required: true
+ - type: string
+ name: region
+ description: Name of the label image this cell belongs to (e.g. 'cell_labels')
+ required: true
+ - type: double
+ name: cell_area
+ description: Area of the cell in pixels
+ required: false
+ - type: integer
+ name: transcript_counts
+ description: Total number of transcripts assigned to this cell
+ required: false
+ uns:
+ - type: string
+ name: dataset_id
+ description: A unique identifier for the dataset
+ required: true
+ - type: string
+ name: orig_dataset_id
+ required: true
+ description: The identifier of the original dataset from which this dataset was derived (if applicable)
diff --git a/src/api/file_spatial_dataset.yaml b/src/api/file_spatial_unlabelled.yaml
similarity index 53%
rename from src/api/file_spatial_dataset.yaml
rename to src/api/file_spatial_unlabelled.yaml
index 4c5253e..74056ce 100644
--- a/src/api/file_spatial_dataset.yaml
+++ b/src/api/file_spatial_unlabelled.yaml
@@ -1,28 +1,18 @@
type: file
-example: "resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_dataset.zarr"
-# TODO: revert to the original example once file exists
-# example: "resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_dataset.zarr"
-label: "Raw iST Dataset"
+example: "resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_unlabelled.zarr"
+label: "Unlabelled Spatial Dataset"
summary: A spatial transcriptomics dataset, preprocessed for this benchmark.
description: |
- This dataset contains preprocessed images, labels, points, shapes, and tables for spatial transcriptomics data.
+ This dataset contains preprocessed images and transcript point clouds for spatial transcriptomics data.
+ Ground truth segmentation labels are intentionally excluded to prevent methods from cheating.
info:
format:
type: spatialdata_zarr
images:
- type: object
name: morphology_mip
- description: The raw image data
+ description: The raw morphology image (maximum intensity projection)
required: true
- labels:
- - type: object
- name: "cell_labels"
- description: Cell segmentation labels
- required: false
- - type: object
- name: "nucleus_labels"
- description: Cell segmentation labels
- required: false
points:
- type: dataframe
name: transcripts
@@ -45,18 +35,6 @@ info:
name: feature_name
required: true
description: Name of the feature
- - type: integer
- name: "cell_id"
- required: false
- description: Unique identifier of the cell
- - type: integer
- name: "nucleus_id"
- required: false
- description: Unique identifier of the nucleus
- - type: string
- name: "cell_type"
- required: false
- description: Cell type of the cell
- type: float
name: qv
required: false
@@ -68,26 +46,7 @@ info:
- type: boolean
name: overlaps_nucleus
required: false
- description: Whether the point overlaps with a nucleus
- shapes:
- - type: dataframe
- name: "cell_boundaries"
- description: Cell boundaries
- required: false
- columns:
- - type: object
- name: "geometry"
- required: true
- description: Geometry of the cell boundary
- - type: dataframe
- name: "nucleus_boundaries"
- description: Nucleus boundaries
- required: false
- columns:
- - type: object
- name: "geometry"
- required: true
- description: Geometry of the nucleus boundary
+ description: Whether the point overlaps with the nucleus (derived from morphology)
tables:
- type: anndata
name: "table"
@@ -123,29 +82,9 @@ info:
required: true
description: The organism of the sample in the dataset
- type: string
- name: segmentation_id
- required: true
- multiple: true
- description: A unique identifier for the segmentation
- obs:
- - type: string
- name: cell_id
- required: true
- description: A unique identifier for the cell
- var:
- - type: string
- name: gene_ids
- required: true
- description: Unique identifier for the gene
- - type: string
- name: feature_types
- required: true
- description: Type of the feature
- obsm:
- - type: double
- name: spatial
+ name: orig_dataset_id
required: true
- description: Spatial coordinates of the cell
+ description: The identifier of the original dataset from which this dataset was derived (if applicable)
coordinate_systems:
- type: object
name: global
diff --git a/src/control_methods/true_labels/config.vsh.yaml b/src/control_methods/true_labels/config.vsh.yaml
index 0a71e50..7d47a02 100644
--- a/src/control_methods/true_labels/config.vsh.yaml
+++ b/src/control_methods/true_labels/config.vsh.yaml
@@ -1,59 +1,24 @@
-# The API specifies which type of component this is.
-# It contains specifications for:
-# - The input/output files
-# - Common parameters
-# - A unit test
__merge__: ../../api/comp_control_method.yaml
-# A unique identifier for your component (required).
-# Can contain only lowercase letters or underscores.
name: true_labels
-
-# A relatively short label, used when rendering visualisations (required)
label: True Labels
-# A one sentence summary of how this method works (required). Used when
-# rendering summary tables.
-summary: "a positive control, solution labels are copied 1 to 1 to the predicted data."
-# A multi-line description of how this component works (required). Used
-# when rendering reference documentation.
+summary: "A positive control where the ground truth cell_labels are used as the prediction."
description: |
- A positive control, where the solution labels are copied 1 to 1 to the predicted data.
-
-# Metadata for your component
-info:
- # Which normalisation method this component prefers to use (required).
- preferred_normalization: counts
-
-# Component-specific parameters (optional)
-# arguments:
-# - name: "--n_neighbors"
-# type: "integer"
-# default: 5
-# description: Number of neighbors to use.
+ A positive control where the ground truth cell_labels segmentation is copied
+ directly as the prediction. This represents the upper bound of performance
+ for any segmentation method.
-# Resources required to run the component
resources:
- # The script of your component (required)
- type: python_script
path: script.py
- # Additional resources your script needs (optional)
- # - type: file
- # path: weights.pt
engines:
- # Specifications for the Docker image for this component.
- type: docker
image: openproblems/base_python:1
- # Add custom dependencies here (optional). For more information, see
- # https://viash.io/reference/config/engines/docker/#setup .
- # setup:
- # - type: python
- # packages: scib==1.1.5
+ __merge__: ../../base/setup_spatialdata_partial.yaml
runners:
- # This platform allows running the component natively
- type: executable
- # Allows turning the component into a Nextflow module / pipeline.
- type: nextflow
directives:
label: [midtime, lowmem, lowcpu]
diff --git a/src/control_methods/true_labels/script.py b/src/control_methods/true_labels/script.py
index 935f3af..107804a 100644
--- a/src/control_methods/true_labels/script.py
+++ b/src/control_methods/true_labels/script.py
@@ -1,13 +1,11 @@
import anndata as ad
+import spatialdata as sd
## VIASH START
-# Note: this section is auto-generated by viash at runtime. To edit it, make changes
-# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`.
par = {
- 'input_train': 'resources_test/task_template/cxg_mouse_pancreas_atlas/train.h5ad',
- 'input_test': 'resources_test/task_template/cxg_mouse_pancreas_atlas/test.h5ad',
- 'input_solution': 'resources_test/task_template/cxg_mouse_pancreas_atlas/solution.h5ad',
- 'output': 'output.h5ad'
+ 'input': 'resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_unlabelled.zarr',
+ 'input_solution': 'resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_solution.zarr',
+ 'output': 'output.zarr'
}
meta = {
'name': 'true_labels'
@@ -15,31 +13,23 @@
## VIASH END
print('Reading input files', flush=True)
-input_train = ad.read_h5ad(par['input_train'])
-input_test = ad.read_h5ad(par['input_test'])
-input_solution = ad.read_h5ad(par['input_solution'])
+sdata_input = sd.read_zarr(par['input'])
+sdata_solution = sd.read_zarr(par['input_solution'])
-print('Preprocess data', flush=True)
-# ... preprocessing ...
-
-print('Train model', flush=True)
-# ... train model ...
-
-print('Generate predictions', flush=True)
-# ... generate predictions ...
-obs_label_pred = input_solution.obs["label"]
-
-print("Write output AnnData to file", flush=True)
-output = ad.AnnData(
- uns={
- 'dataset_id': input_train.uns['dataset_id'],
- 'normalization_id': input_train.uns['normalization_id'],
- 'method_id': meta['name']
+print('Copying ground truth cell_labels as prediction', flush=True)
+output = sd.SpatialData(
+ labels={
+ 'segmentation': sdata_solution['cell_labels']
},
- obs={
- 'label_pred': obs_label_pred
+ tables={
+ 'table': ad.AnnData(
+ uns={
+ 'dataset_id': sdata_solution.tables['table'].uns['dataset_id'],
+ 'method_id': meta['name']
+ }
+ )
}
)
-output.obs_names = input_test.obs_names
-output.write_h5ad(par['output'], compression='gzip')
+print('Writing output', flush=True)
+output.write(par['output'], overwrite=True)
diff --git a/src/data_processors/process_dataset/config.vsh.yaml b/src/data_processors/process_dataset/config.vsh.yaml
index 0ea6508..96cc85c 100644
--- a/src/data_processors/process_dataset/config.vsh.yaml
+++ b/src/data_processors/process_dataset/config.vsh.yaml
@@ -5,10 +5,6 @@ name: process_dataset
argument_groups:
- name: "Processing parameters"
arguments:
- - name: "--seed"
- type: "integer"
- description: "A seed for the subsampling."
- example: 123
- name: "--span"
type: double
description: The fraction of the data (cells) used when estimating the variance in the loess model fit if flavor='seurat_v3'.
@@ -24,7 +20,6 @@ resources:
engines:
- type: docker
- #image: openproblems/base_pytorch_nvidia:1 # TODO: ideally get gpu image to work
image: openproblems/base_python:1
setup:
- type: python
@@ -37,4 +32,4 @@ runners:
- type: executable
- type: nextflow
directives:
- label: [highmem, midcpu, midtime]
\ No newline at end of file
+ label: [midmem, midcpu, midtime]
diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py
index b04cb33..38f07de 100644
--- a/src/data_processors/process_dataset/script.py
+++ b/src/data_processors/process_dataset/script.py
@@ -1,4 +1,3 @@
-import random
import anndata as ad
import spatialdata as sd
import scanpy as sc
@@ -7,8 +6,9 @@
par = {
'input_sp': 'resources_test/common/2023_10x_mouse_brain_xenium_rep1/dataset.zarr',
'input_sc': 'resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad',
- 'output_spatial_dataset': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_spatial_dataset.zarr',
- 'output_scrnaseq_reference': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_scrnaseq_reference.h5ad',
+ 'output_spatial_unlabelled': 'spatial_unlabelled.zarr',
+ 'output_spatial_solution': 'spatial_solution.zarr',
+ 'output_scrnaseq_reference': 'scrnaseq_reference.h5ad',
'span': 0.3,
'seed': 123,
'n_top_genes': 3000,
@@ -53,12 +53,6 @@ def sc_processing(adata):
)
adata.var.rename(columns={"highly_variable": "hvg"}, inplace=True)
-
-# set seed if need be
-if par["seed"]:
- print(f">> Setting seed to {par['seed']}")
- random.seed(par["seed"])
-
print(">> Load data", flush=True)
sc_data = ad.read_h5ad(par["input_sc"])
print(f"single cell data: {sc_data}")
@@ -71,7 +65,7 @@ def sc_processing(adata):
for key in ["dataset_id", "dataset_name", "dataset_url", "dataset_summary", "dataset_description", "dataset_reference", "dataset_organism"]:
sc_data.uns[key] = par[key]
-print(">> Writing data", flush=True)
+print(">> Writing scrnaseq reference", flush=True)
sc_data.write_h5ad(par["output_scrnaseq_reference"], compression="gzip")
# read input_sp
@@ -79,27 +73,65 @@ def sc_processing(adata):
sp_data = sd.read_zarr(par["input_sp"])
print(f"spatial data: {sp_data}")
-print(">> Processing spatial data", flush=True)
-sp_data_table = sp_data.tables['table']
-print(f"single cell part of spatial data: {sp_data_table}")
-sc_processing(sp_data_table)
-
-if "cell_area" not in sp_data_table.obs:
- print(">> Perform scanpy qc for cell area", flush=True)
- sc.pp.calculate_qc_metrics(sp_data_table, layer="counts", inplace=True)
-
-for x in ["transcript_counts", "n_genes_by_counts"]:
- if f"ca_normalized_{x}" not in sp_data_table.obs and x in sp_data_table.obs:
- print(f">> Perform cell area normalization for {x}", flush=True)
- sp_data_table.obs[f'ca_normalized_{x}'] = sp_data_table.obs[f"{x}"] / sp_data_table.obs["cell_area"]
-
-print(">> Override dataset metadata in .uns", flush=True)
-sp_data_table.uns["orig_dataset_id"] = sp_data_table.uns.get("dataset_id", None)
-for key in ["dataset_id", "dataset_name", "dataset_url", "dataset_summary", "dataset_description", "dataset_reference", "dataset_organism"]:
- sp_data_table.uns[key] = par[key]
-
-print(f"spatial data: {sp_data}")
-print(f"spatial data tables['table']: {sp_data.tables['table']}")
+dataset_uns = {
+ "dataset_id": par["dataset_id"],
+ "dataset_name": par["dataset_name"],
+ "dataset_url": par["dataset_url"],
+ "dataset_summary": par["dataset_summary"],
+ "dataset_description": par["dataset_description"],
+ "dataset_reference": par["dataset_reference"],
+ "dataset_organism": par["dataset_organism"],
+ "orig_dataset_id": sp_data.tables["table"].uns.get("dataset_id", None),
+}
-print(">> Writing spatial data", flush=True)
-sp_data.write(par["output_spatial_dataset"], overwrite=True)
+# ---------------------------------------------------------------
+# output_spatial_unlabelled: image + transcripts (no ground truth)
+# ---------------------------------------------------------------
+print(">> Building spatial dataset for methods (no ground truth)", flush=True)
+
+# Strip columns that reveal ground truth cell assignments from transcripts
+_GROUND_TRUTH_COLS = {"cell_id", "nucleus_id", "cell_type"}
+transcripts = sp_data.points["transcripts"]
+clean_transcript_cols = [c for c in transcripts.columns if c not in _GROUND_TRUTH_COLS]
+clean_transcripts = transcripts[clean_transcript_cols]
+
+# Minimal table: just dataset metadata in uns, no per-cell obs
+minimal_table = ad.AnnData(uns=dataset_uns)
+
+output_spatial = sd.SpatialData(
+ images={"morphology_mip": sp_data.images["morphology_mip"]},
+ points={"transcripts": clean_transcripts},
+ tables={"table": minimal_table},
+)
+
+print(">> Writing spatial unlabelled dataset", flush=True)
+output_spatial.write(par["output_spatial_unlabelled"], overwrite=True)
+
+# ---------------------------------------------------------------
+# output_spatial_solution: ground truth labels, shapes, reference table
+# ---------------------------------------------------------------
+print(">> Building spatial solution (ground truth)", flush=True)
+
+ref_table = sp_data.tables["table"]
+solution_obs = ref_table.obs[["cell_id", "region"]].copy()
+for extra_col in ["cell_area", "transcript_counts"]:
+ if extra_col in ref_table.obs.columns:
+ solution_obs[extra_col] = ref_table.obs[extra_col]
+
+solution_table = ad.AnnData(
+ obs=solution_obs,
+ uns={
+ "dataset_id": par["dataset_id"],
+ "orig_dataset_id": sp_data.tables["table"].uns.get("dataset_id", None),
+ "spatialdata_attrs": ref_table.uns["spatialdata_attrs"],
+ },
+)
+
+output_solution = sd.SpatialData(
+ labels={k: v for k, v in sp_data.labels.items()},
+ shapes={k: v for k, v in sp_data.shapes.items()},
+ tables={"table": solution_table},
+)
+
+print(">> Writing spatial solution", flush=True)
+output_solution.write(par["output_spatial_solution"], overwrite=True)
diff --git a/src/data_processors/process_prediction/config.vsh.yaml b/src/data_processors/process_prediction/config.vsh.yaml
new file mode 100644
index 0000000..7e473ac
--- /dev/null
+++ b/src/data_processors/process_prediction/config.vsh.yaml
@@ -0,0 +1,23 @@
+__merge__: ../../api/comp_output_processor.yaml
+
+name: process_prediction
+
+resources:
+ - type: python_script
+ path: script.py
+
+engines:
+ - type: docker
+ image: openproblems/base_python:1
+ setup:
+ - type: python
+ packages: [scikit-learn, scikit-misc]
+ __merge__:
+ - /src/base/setup_spatialdata_partial.yaml
+ - type: native
+
+runners:
+ - type: executable
+ - type: nextflow
+ directives:
+ label: [midmem, midcpu, midtime]
diff --git a/src/data_processors/process_prediction/script.py b/src/data_processors/process_prediction/script.py
new file mode 100644
index 0000000..ceed915
--- /dev/null
+++ b/src/data_processors/process_prediction/script.py
@@ -0,0 +1,94 @@
+import numpy as np
+import xarray as xr
+import anndata as ad
+import pandas as pd
+import spatialdata as sd
+import scanpy as sc
+
+## VIASH START
+par = {
+ 'input_prediction': 'resources_test/task_spatial_segmentation/mouse_brain_combined/prediction.zarr',
+ 'input_spatial_unlabelled': 'resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_unlabelled.zarr',
+ 'output': 'output.zarr'
+}
+## VIASH END
+
+print(">> Reading input files", flush=True)
+sdata_pred = sd.read_zarr(par["input_prediction"])
+sdata_sp = sd.read_zarr(par["input_spatial_unlabelled"])
+
+dataset_id = sdata_sp.tables["table"].uns["dataset_id"]
+method_id = sdata_pred.tables["table"].uns["method_id"]
+
+print(">> Transforming transcripts to global coordinate system", flush=True)
+transcripts = sd.transform(sdata_sp["transcripts"], to_coordinate_system="global")
+
+# Adjust for any translation applied to the segmentation
+trans = sd.transformations.get_transformation(
+ sdata_pred["segmentation"], get_all=True
+)["global"].inverse()
+transcripts = sd.transform(transcripts, trans, "global")
+
+print(">> Assigning transcripts to cells via label image lookup", flush=True)
+y_coords = transcripts.y.compute().to_numpy(dtype=np.int64)
+x_coords = transcripts.x.compute().to_numpy(dtype=np.int64)
+
+if isinstance(sdata_pred["segmentation"], xr.DataTree):
+ label_image = sdata_pred["segmentation"]["scale0"].image.to_numpy()
+else:
+ label_image = sdata_pred["segmentation"].to_numpy()
+
+# Clip coordinates to valid label image bounds
+y_coords = np.clip(y_coords, 0, label_image.shape[0] - 1)
+x_coords = np.clip(x_coords, 0, label_image.shape[1] - 1)
+
+cell_ids = label_image[y_coords, x_coords]
+
+# NOTE: Is it useful to build a cxg count matrix? Is this used downstream?
+print(">> Building cell x gene count matrix", flush=True)
+feature_names = transcripts["feature_name"].compute().to_numpy()
+
+transcript_df = pd.DataFrame({"cell_id": cell_ids, "feature_name": feature_names})
+# Remove background (cell_id == 0)
+transcript_df = transcript_df[transcript_df["cell_id"] != 0]
+
+count_matrix = (
+ transcript_df.groupby(["cell_id", "feature_name"])
+ .size()
+ .unstack(fill_value=0)
+)
+
+obs = pd.DataFrame(
+ {"cell_id": count_matrix.index.astype(str), "region": pd.Categorical(["segmentation"] * len(count_matrix))},
+ index=count_matrix.index.astype(str),
+)
+var = pd.DataFrame(index=count_matrix.columns.astype(str))
+var.index.name = "feature_name"
+
+table = ad.AnnData(X=count_matrix.values.astype(np.float32), obs=obs, var=var)
+table.layers["counts"] = table.X.copy()
+
+print(">> Normalizing counts", flush=True)
+sc.pp.normalize_total(table, target_sum=1e4)
+table.layers["normalized"] = table.X.copy()
+
+sc.pp.log1p(table)
+table.layers["normalized_log"] = table.X.copy()
+
+sc.pp.scale(table)
+table.layers["normalized_log_scaled"] = table.X.copy()
+
+table.uns["dataset_id"] = dataset_id
+table.uns["method_id"] = method_id
+table.uns["spatialdata_attrs"] = {
+ "instance_key": "cell_id",
+ "region": ["segmentation"],
+ "region_key": "region",
+}
+
+print(">> Writing output", flush=True)
+output = sd.SpatialData(
+ labels={"segmentation": sdata_pred["segmentation"]},
+ tables={"table": table},
+)
+output.write(par["output"], overwrite=True)
diff --git a/src/methods/cellpose/script.py b/src/methods/cellpose/script.py
index 6ebae72..3f0236b 100644
--- a/src/methods/cellpose/script.py
+++ b/src/methods/cellpose/script.py
@@ -10,7 +10,7 @@
## VIASH START
par = {
- 'input': 'resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_dataset.zarr',
+ 'input': 'resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_unlabelled.zarr',
'output': 'prediction.zarr'
}
meta = {
@@ -47,24 +47,23 @@ def convert_to_lower_dtype(arr):
print('Cellpose segmentation finished, post-processing results', flush=True)
masks = convert_to_lower_dtype(masks)
-print('Segmentation done, preparing output', flush=True)
-sd_output = sd.SpatialData()
-data_array = xr.DataArray(masks, name='segmentation', dims=('y', 'x'))
-parsed = Labels2DModel.parse(data_array, transformations=transformation)
-sd_output.labels['segmentation'] = parsed
-
-cell_ids = np.unique(masks)[1:] # exclude background (0)
-table = ad.AnnData(
- obs=pd.DataFrame(
- {'cell_id': cell_ids.astype(str), 'region': 'segmentation'},
- index=cell_ids.astype(str),
- ),
- uns={
- 'dataset_id': sdata.tables['table'].uns['dataset_id'],
- 'method_id': meta['name']
+print('Creating output data structure', flush=True)
+sd_output = sd.SpatialData(
+ labels={
+ 'segmentation': Labels2DModel.parse(
+ xr.DataArray(masks, name='segmentation', dims=('y', 'x')),
+ transformations=transformation
+ )
+ },
+ tables={
+ 'table': ad.AnnData(
+ uns={
+ 'dataset_id': sdata.tables['table'].uns['dataset_id'],
+ 'method_id': meta['name']
+ }
+ )
}
)
-sd_output.tables['table'] = table
print('Saving output', flush=True)
if os.path.exists(par["output"]):
diff --git a/src/workflows/process_datasets/config.vsh.yaml b/src/workflows/process_datasets/config.vsh.yaml
index c71286a..454c63f 100644
--- a/src/workflows/process_datasets/config.vsh.yaml
+++ b/src/workflows/process_datasets/config.vsh.yaml
@@ -14,8 +14,12 @@ argument_groups:
direction: input
- name: Outputs
arguments:
- - name: "--output_spatial_dataset"
- __merge__: /src/api/file_spatial_dataset.yaml
+ - name: "--output_spatial_unlabelled"
+ __merge__: /src/api/file_spatial_unlabelled.yaml
+ direction: output
+ required: true
+ - name: "--output_spatial_solution"
+ __merge__: /src/api/file_spatial_solution.yaml
direction: output
required: true
- name: "--output_scrnaseq_reference"
diff --git a/src/workflows/process_datasets/main.nf b/src/workflows/process_datasets/main.nf
index 947a8f1..2e0c2d4 100644
--- a/src/workflows/process_datasets/main.nf
+++ b/src/workflows/process_datasets/main.nf
@@ -44,13 +44,14 @@ workflow run_wf {
"input_sc": "input_sc"
],
toState: [
- output_spatial_dataset: "output_spatial_dataset",
+ output_spatial_unlabelled: "output_spatial_unlabelled",
+ output_spatial_solution: "output_spatial_solution",
output_scrnaseq_reference: "output_scrnaseq_reference"
]
)
// only output the files for which an output file was specified
- | setState(["output_spatial_dataset", "output_scrnaseq_reference"])
+ | setState(["output_spatial_unlabelled", "output_spatial_solution", "output_scrnaseq_reference"])
emit:
output_ch
diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml
index 4ab5f83..9bfee25 100644
--- a/src/workflows/run_benchmark/config.vsh.yaml
+++ b/src/workflows/run_benchmark/config.vsh.yaml
@@ -4,13 +4,17 @@ namespace: workflows
argument_groups:
- name: Inputs
arguments:
- - name: "--input_spatial_dataset"
- __merge__: /src/api/file_spatial_dataset.yaml
- direction: output
+ - name: "--input_spatial_unlabelled"
+ __merge__: /src/api/file_spatial_unlabelled.yaml
+ direction: input
+ required: true
+ - name: "--input_spatial_solution"
+ __merge__: /src/api/file_spatial_solution.yaml
+ direction: input
required: true
- name: "--input_scrnaseq_reference"
__merge__: /src/api/file_scrnaseq_reference.yaml
- direction: output
+ direction: input
required: true
- name: Outputs
arguments:
diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf
index fe25140..9cac4f6 100644
--- a/src/workflows/run_benchmark/main.nf
+++ b/src/workflows/run_benchmark/main.nf
@@ -33,7 +33,7 @@ workflow run_wf {
// extract the dataset metadata
| extract_uns_metadata.run(
- fromState: [input: "input_solution"],
+ fromState: [input: "input_spatial_unlabelled"],
toState: { id, output, state ->
state + [
dataset_uns: readYaml(output.output).uns
@@ -52,14 +52,8 @@ workflow run_wf {
// use the 'filter' argument to only run a method on the normalisation the component is asking for
filter: { id, state, comp ->
- def norm = state.dataset_uns.normalization_id
- def pref = comp.config.info.preferred_normalization
- // if the preferred normalisation is none at all,
- // we can pass whichever dataset we want
- def norm_check = (norm == "log_cp10k" && pref == "counts") || norm == pref
def method_check = !state.method_ids || state.method_ids.contains(comp.config.name)
-
- method_check && norm_check
+ method_check
},
// define a new 'id' by appending the method name to the dataset id
@@ -70,11 +64,10 @@ workflow run_wf {
// use 'fromState' to fetch the arguments the component requires from the overall state
fromState: { id, state, comp ->
def new_args = [
- input_train: state.input_train,
- input_test: state.input_test
+ input: state.input_spatial_unlabelled
]
if (comp.config.info.type == "control_method") {
- new_args.input_solution = state.input_solution
+ new_args.input_solution = state.input_spatial_solution
}
new_args
},
@@ -96,7 +89,7 @@ workflow run_wf {
},
// use 'fromState' to fetch the arguments the component requires from the overall state
fromState: [
- input_solution: "input_solution",
+ input_solution: "input_spatial_solution",
input_prediction: "method_output"
],
// use 'toState' to publish that component's outputs to the overall state
@@ -136,10 +129,8 @@ workflow run_wf {
// extract the dataset metadata
meta_ch = dataset_ch
- // only keep one of the normalization methods
- | filter{ id, state ->
- state.dataset_uns.normalization_id == "log_cp10k"
- }
+ // keep all dataset entries (normalization-based filtering no longer applies)
+ | filter{ id, state -> true }
| joinStates { ids, states ->
// store the dataset metadata in a file
def dataset_uns = states.collect{state ->