Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
521 changes: 120 additions & 401 deletions docs/guides/extensions/curator/metadata_curation.md

Large diffs are not rendered by default.

59 changes: 59 additions & 0 deletions docs/guides/extensions/curator/scripts/full_csv_workflow.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
"""
Script: Complete programmatic CSV upload and validation workflow.
Demonstrates the full end-to-end flow for power users who work
entirely through the Python client without the grid UI.
"""

from synapseclient import Synapse
from synapseclient.extensions.curator import (
create_record_based_metadata_task,
query_schema_registry,
)
from synapseclient.models import Grid

# Script constants: every input this workflow needs is configured here, in a
# single spot, so the steps below never have to be edited.
PROJECT_ID = "syn123456789"  # passed as project_id when creating the task
FOLDER_ID = "syn987654321"  # passed as folder_id when creating the task
CSV_PATH = "metadata.csv"  # local CSV file imported into the grid session

syn = Synapse()
syn.login()

# 1. Find schema and create curation task
schema_uri = query_schema_registry(
    synapse_client=syn, dcc="ad", datatype="IndividualAnimalMetadataTemplate"
)

# The third element of the returned tuple is not needed by this workflow,
# so it is discarded.
record_set, curation_task, _ = create_record_based_metadata_task(
    synapse_client=syn,
    project_id=PROJECT_ID,
    folder_id=FOLDER_ID,
    record_set_name="StudyMetadata",
    record_set_description="Animal study metadata",
    curation_task_name="StudyMetadata_Curation",
    upsert_keys=["individualID"],
    instructions="Complete all required fields.",
    schema_uri=schema_uri,
    bind_schema_to_record_set=True,
)

# 2. Import CSV data into a grid session
# Column schema is auto-derived from the CSV header and the
# JSON schema bound to the grid.
grid = Grid(record_set_id=record_set.id).create()
grid = grid.import_csv(path=CSV_PATH)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The way these files are passed into this script should be changed: define the paths as script constants at the top of the file so that everything is set up in a single spot.

print(f"Imported {grid.csv_import_total_count} rows")

# 3. Check validation before committing
snapshot = grid.get_snapshot()
summary = snapshot.validation_summary
print(f"Validation: {summary['valid']}/{summary['total']} valid")

if summary["invalid"] > 0:
    print("Validation errors found:")
    for grid_row in snapshot.rows:
        row_validation = grid_row.validation
        # Skip rows that carry no validation result or that are valid.
        if not row_validation or row_validation.is_valid:
            continue
        print(f" Row {grid_row.row_id}: {row_validation.validation_error_message}")
    # Fix errors and re-import if needed...

# 4. Commit when ready
grid = grid.export_to_record_set()
print(f"Exported to RecordSet version {grid.record_set_version_number}")

# 5. Clean up: remove the grid session now that the data is exported
grid.delete()
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
"""
Script: Working with Grid sessions.
Covers creating sessions, importing CSV data, downloading data,
synchronizing changes, and listing/deleting sessions.
"""

import tempfile

from synapseclient import Synapse
from synapseclient.models import Grid, Query

# Script constants: every ID and path used below is configured here so the
# script is set up in a single spot.
RECORD_SET_ID = "syn987654321"
ENTITY_VIEW_QUERY = "SELECT * FROM syn123456789"
CSV_PATH = "path/to/metadata.csv"

syn = Synapse()
syn.login()

# Create a Grid session from a RecordSet
grid = Grid(record_set_id=RECORD_SET_ID)
grid = grid.create()
print(f"Grid session: {grid.session_id}")

# Or create a Grid session from an EntityView query
grid_from_query = Grid(initial_query=Query(sql=ENTITY_VIEW_QUERY))
grid_from_query = grid_from_query.create()

# Import a CSV from a local file path.
# Column names are read from the CSV header and types are resolved
# from the JSON schema bound to the grid session automatically.
grid = grid.import_csv(path=CSV_PATH)

print(f"Imported {grid.csv_import_total_count} rows")
print(f" Created: {grid.csv_import_created_count}")
print(f" Updated: {grid.csv_import_updated_count}")

# Or import directly from a pandas DataFrame
import pandas as pd

df = pd.DataFrame(
    {
        "individualID": ["ANIMAL001", "ANIMAL002"],
        "species": ["Mouse", "Mouse"],
        "sex": ["female", "male"],
        "genotype": ["5XFAD", "APOE4KI"],
    }
)

grid = grid.import_csv(dataframe=df)

# You can also provide an explicit schema to override auto-derivation:
from synapseclient.models import Column, ColumnType

schema = [
    Column(name="individualID", column_type=ColumnType.STRING),
    Column(name="species", column_type=ColumnType.STRING),
    Column(name="sex", column_type=ColumnType.STRING),
    Column(name="genotype", column_type=ColumnType.STRING),
]

grid = grid.import_csv(path=CSV_PATH, schema=schema)

# Download grid data as a local CSV file.
# A hard-coded shared directory such as "/tmp" is flagged by Bandit (B108,
# CWE-377), so the download goes into a freshly created private temporary
# directory instead. mkdtemp() does not remove the directory afterwards;
# delete it yourself once the downloaded file is no longer needed.
download_directory = tempfile.mkdtemp(prefix="synapse_grid_")
file_path = grid.download_csv(download_location=download_directory)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This needs to change as it was flagged:

Test results:

Issue: [B108:hardcoded_tmp_directory] Probable insecure usage of temp file/directory.
Severity: Medium Confidence: Medium
CWE: CWE-377 (https://cwe.mitre.org/data/definitions/377.html)
More Info: https://bandit.readthedocs.io/en/0.0.0/plugins/b108_hardcoded_tmp_directory.html
Location: ./docs/guides/extensions/curator/scripts/grid_session_operations.py:58:48
57 # Download grid data as a local CSV file
58 file_path = grid.download_csv(download_location="/tmp")
59 print(f"Downloaded grid data to: {file_path}")

print(f"Downloaded grid data to: {file_path}")

# Push the grid's changes back to the data source it was created from
grid = grid.synchronize()

sync_errors = grid.synchronize_error_messages
if not sync_errors:
    print("Synchronization successful")
else:
    print("Synchronization errors:")
    for error_message in sync_errors:
        print(f" - {error_message}")

# Enumerate every active grid session
for active_session in Grid.list():
    session_id = active_session.session_id
    source_entity_id = active_session.source_entity_id
    print(f"Session: {session_id}, Source: {source_entity_id}")

# Enumerate only the sessions that belong to one specific source
for source_session in Grid.list(source_id="syn987654321"):
    print(f"Session: {source_session.session_id}")

# Remove the grid session created earlier in this script
grid.delete()
30 changes: 30 additions & 0 deletions docs/guides/extensions/curator/scripts/manage_tasks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"""
Script: Managing curation tasks.
Covers listing, updating, and deleting curation tasks.
"""

from synapseclient import Synapse
from synapseclient.models import CurationTask

syn = Synapse()
syn.login()

# Identifiers used by the examples below
PROJECT_ID = "syn123456789"
TASK_ID = 42

# List tasks in a project
for listed_task in CurationTask.list(project_id=PROJECT_ID):
    print(f"Task {listed_task.task_id}: {listed_task.data_type}")
    print(f" Instructions: {listed_task.instructions}")
    assignee = listed_task.assignee_principal_id
    if assignee:
        print(f" Assigned to: {assignee}")

# Update a task: fetch it, change its instructions, then store it again
task = CurationTask(task_id=TASK_ID).get()
task.instructions = "Updated instructions for data contributors"
task = task.store()

# NOTE(review): both delete examples below target the same task ID; they look
# like alternatives rather than steps meant to run back-to-back -- confirm.

# Delete a task (simple)
CurationTask(task_id=TASK_ID).delete()

# Delete a task and clean up the associated EntityView (file-based only)
CurationTask(task_id=TASK_ID).delete(delete_file_view=True)
9 changes: 9 additions & 0 deletions docs/guides/extensions/curator/scripts/metadata.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
individualID,species,sex,genotype,modelSystemName,ageDeath,ageDeathUnit,brainWeight,tissueWeight,bedding,waterpH,lightCycle,roomTemperature,roomHumidity
IND-001,Mus musculus,male,5XFAD/WT,5XFAD,6,months,0.42,0.38,corn cob,7.0,12/12,22,50
IND-002,Mus musculus,female,5XFAD/WT,5XFAD,6,months,0.41,0.37,corn cob,7.0,12/12,22,50
IND-003,Mus musculus,male,WT/WT,wildtype,6,months,0.44,0.40,corn cob,7.0,12/12,22,50
IND-004,Mus musculus,female,WT/WT,wildtype,6,months,0.43,0.39,corn cob,7.0,12/12,22,50
IND-005,Mus musculus,male,5XFAD/WT,5XFAD,12,months,0.40,0.36,corn cob,7.0,12/12,22,52
IND-006,Mus musculus,female,5XFAD/WT,5XFAD,12,months,0.39,0.35,corn cob,7.0,12/12,22,52
IND-007,Mus musculus,male,WT/WT,wildtype,12,months,0.45,0.41,corn cob,7.0,12/12,22,52
IND-008,Mus musculus,female,WT/WT,wildtype,12,months,0.44,0.40,corn cob,7.0,12/12,22,52
34 changes: 34 additions & 0 deletions docs/guides/extensions/curator/scripts/postcommit_validation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
"""
Script: Post-commit validation via RecordSet export.
Exports grid session to RecordSet (commits changes) and retrieves
detailed per-row validation results.
"""

from synapseclient import Synapse
from synapseclient.models import Grid, RecordSet

syn = Synapse()
syn.login()

# The RecordSet that backs both the grid session and the validation lookup
RECORD_SET_ID = "syn987654321"

grid = Grid(record_set_id=RECORD_SET_ID).create()

# Export to RecordSet (commits changes + generates validation)
grid = grid.export_to_record_set()

stats = grid.validation_summary_statistics
if stats:
    print(f"Valid: {stats.number_of_valid_children}")
    print(f"Invalid: {stats.number_of_invalid_children}")

# Clean up the grid session
grid.delete()

# Get detailed per-row validation from the RecordSet
record_set = RecordSet(id=RECORD_SET_ID).get()
validation_df = record_set.get_detailed_validation_results()

if validation_df is not None:
    # .eq(False) is the method form of `== False`: it keeps only the rows
    # whose is_valid value compares equal to False.
    invalid_rows = validation_df[validation_df["is_valid"].eq(False)]
    for _index, invalid_row in invalid_rows.iterrows():
        row_index = invalid_row["row_index"]
        error_message = invalid_row["validation_error_message"]
        print(f"Row {row_index}: {error_message}")
29 changes: 29 additions & 0 deletions docs/guides/extensions/curator/scripts/precommit_validation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
"""
Script: Pre-commit validation via WebSocket snapshot.
Gets per-row validation results from an active grid session
WITHOUT committing changes.
"""

from synapseclient import Synapse
from synapseclient.models import Grid

syn = Synapse()
syn.login()

grid = Grid(record_set_id="syn987654321").create()

# (Import data into the grid first — see grid_session_operations.py)

# Get validation results without committing
snapshot = grid.get_snapshot()

print(f"Validation summary: {snapshot.validation_summary}")
# Example output: {'total': 100, 'valid': 85, 'invalid': 12, 'pending': 3}

# Inspect individual row validation
for grid_row in snapshot.rows:
    row_validation = grid_row.validation
    # Only rows that have a validation result marked invalid are reported.
    if not row_validation or row_validation.is_valid:
        continue
    print(f"Row {grid_row.row_id}: {row_validation.validation_error_message}")
    # all_validation_messages may be empty/None, hence the `or []` fallback.
    for detail in row_validation.all_validation_messages or []:
        print(f" - {detail}")
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
"""
Script: Setting up curation workflows.
Covers authentication, schema lookup, and creating both
record-based and file-based curation tasks.
"""

from synapseclient import Synapse
from synapseclient.extensions.curator import (
create_file_based_metadata_task,
create_record_based_metadata_task,
query_schema_registry,
)

syn = Synapse()
syn.login()

# Values shared by the examples below
DCC = "ad"
DATATYPE = "IndividualAnimalMetadataTemplate"
PROJECT_ID = "syn123456789"
FOLDER_ID = "syn987654321"
ASSIGNEE_PRINCIPAL_ID = "123456"

# Find the latest schema for a specific data type
schema_uri = query_schema_registry(synapse_client=syn, dcc=DCC, datatype=DATATYPE)
print(f"Schema URI: {schema_uri}")

# Browse all available versions of a schema
all_schemas = query_schema_registry(
    synapse_client=syn, dcc=DCC, datatype=DATATYPE, return_latest_only=False
)

# Create a record-based curation task
record_based_result = create_record_based_metadata_task(
    synapse_client=syn,
    project_id=PROJECT_ID,
    folder_id=FOLDER_ID,
    record_set_name="AnimalStudy_Records",
    record_set_description="Metadata for animal study specimens",
    curation_task_name="AnimalStudy_Curation",
    upsert_keys=["individualID"],
    instructions="Complete all required fields for each animal.",
    schema_uri=schema_uri,
    bind_schema_to_record_set=True,
    assignee_principal_id=ASSIGNEE_PRINCIPAL_ID,  # Optional: assign to user or team
)
record_set, curation_task, data_grid = record_based_result

print(f"RecordSet: {record_set.id}")
print(f"CurationTask: {curation_task.task_id}")

# Create a file-based curation task
file_based_result = create_file_based_metadata_task(
    synapse_client=syn,
    folder_id=FOLDER_ID,
    curation_task_name="FileAnnotations_Curation",
    instructions="Annotate each file according to the schema.",
    entity_view_name="Animal Study Files View",
    schema_uri=schema_uri,
    assignee_principal_id=ASSIGNEE_PRINCIPAL_ID,  # Optional
)
entity_view_id, task_id = file_based_result

print(f"EntityView: {entity_view_id}")
print(f"CurationTask: {task_id}")
22 changes: 22 additions & 0 deletions docs/guides/extensions/curator/scripts/validate_folder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
"""
Script: Validating folder annotations.
For file-based workflows, validates annotations on files
within a schema-bound folder.
"""

from synapseclient import Synapse
from synapseclient.models import Folder

syn = Synapse()
syn.login()

# Fetch the folder whose children are to be checked
folder_request = Folder(id="syn987654321")
folder = folder_request.get()

# Report the valid/invalid counts from the folder's validation statistics
validation_stats = folder.get_schema_validation_statistics()
print(f"Valid: {validation_stats.number_of_valid_children}")
print(f"Invalid: {validation_stats.number_of_invalid_children}")

# Report each invalid result together with its error message
for invalid_result in folder.get_invalid_validation():
    entity_id = invalid_result.object_id
    print(f"Entity {entity_id}: {invalid_result.validation_error_message}")
7 changes: 7 additions & 0 deletions docs/reference/experimental/async/curator.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,13 @@ at your own risk.
members:
- create_async
- export_to_record_set_async
- import_csv_async
- download_csv_async
- synchronize_async
- get_snapshot_async
- get_validation_async
- delete_async
- list_async
---
[](){ #query-reference-async }
::: synapseclient.models.Query
Expand Down
25 changes: 25 additions & 0 deletions docs/reference/experimental/sync/curator.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,31 @@ at your own risk.
members:
- create
- export_to_record_set
- import_csv
- download_csv
- synchronize
- get_snapshot
- get_validation
- delete
- list
---
[](){ #grid-snapshot-reference }
::: synapseclient.models.GridSnapshot
options:
inherited_members: true
members:
---
[](){ #grid-row-reference }
::: synapseclient.models.GridRow
options:
inherited_members: true
members:
---
[](){ #grid-row-validation-reference }
::: synapseclient.models.GridRowValidation
options:
inherited_members: true
members:
---
[](){ #query-reference }
::: synapseclient.models.Query
Expand Down
2 changes: 2 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ install_requires =
async-lru~=2.0.4
psutil>=5.9.8
setuptools>=80.10.1
websockets>=12.0
cbor2>=5.0
tests_require =
pytest~=8.2.0
pytest-mock>=3.0,<4.0
Expand Down
Loading
Loading