Skip to content
133 changes: 133 additions & 0 deletions docs/guides/extensions/curator/metadata_curation.md
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,139 @@ else:
print("No validation results available. The Grid session must be exported to generate validation results.")
```

### Example: Getting data into a Grid for a file-based workflow

The following example is for file-based curation.
It assumes your data is in a CSV file where each column is a property and each row holds the values for one file.

Here is the CSV used in the example:

```csv
SampleID,Component,PatientID,TissueStatus
id1,biospecimen,pid1,Healthy
id1,biospecimen,pid2,Malignant
```

```python
import pandas as pd
import uuid
from synapseclient import Synapse
from synapseclient.models import CurationTask, Folder, File
from synapseclient.core.utils import make_bogus_data_file
from synapseclient.extensions.curator import create_file_based_metadata_task

# 1. Replace all these values with your own information
PROJECT_ID = "syn68175188"
CSV_PATH = "biospecimen.csv"
JSON_SCHEMA_URI = "dpetest-test.schematic.Biospecimen"
INSTRUCTIONS = "Please curate the biospecimen information."
# These names end in a short random hex suffix (presumably so that repeated
# runs of the example do not collide with entities created earlier).
FOLDER_NAME = f"Biospecimen Curation Folder {uuid.uuid4().hex[:8]}"
CURATION_TASK_NAME = f"File-based curation task for biospecimens {uuid.uuid4().hex[:8]}"

# 2. Login to Synapse
syn = Synapse()
syn.login()

# 3. Get annotations from CSV file
# Each CSV row becomes one dict keyed by column name.
# TODO: This currently bypasses the Grid by pushing the CSV data in as file
# annotations; replace it once data can be loaded into the Grid directly.
metadata_table = pd.read_csv(CSV_PATH)
annotations = metadata_table.to_dict(orient="records")
Copy link
Member

@thomasyu888 thomasyu888 Mar 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This currently bypasses the grid altogether by pushing data in as annotations. Can we add a TODO comment here?

Nit: Could we leverage the entity view API to push the CSV directly into the entity view?


# 4. Create a folder to store the files that will be used for curation
folder = Folder(name=FOLDER_NAME, parent_id=PROJECT_ID).store(synapse_client=syn)
print(f"Created folder with ID: {folder.id}")

# 5. Create files, annotate them, and store them in the Synapse folder
# One bogus data file is created per CSV row and annotated with that row's
# values, so the example works for a CSV with any number of rows.
for row_annotations in annotations:
    path_to_file = make_bogus_data_file(n=5)
    curation_file = File(
        path=path_to_file,
        parent_id=folder.id,
        annotations=row_annotations,
    )
    curation_file = curation_file.store(synapse_client=syn)
    print(f"Created file with ID: {curation_file.id}")

# 6. Create EntityView and CurationTask
view_id, task_id = create_file_based_metadata_task(
    folder_id=folder.id,
    curation_task_name=CURATION_TASK_NAME,
    instructions=INSTRUCTIONS,
    schema_uri=JSON_SCHEMA_URI,
    synapse_client=syn,
)
print(f"Created EntityView with ID: {view_id}")
print(f"Created CurationTask with ID: {task_id}")

# 7. Cleanup all Synapse entities created
# The folder is deleted first; the task is then deleted with
# delete_source=True, as in the original example.
folder.delete(synapse_client=syn)
CurationTask(task_id=task_id).delete(synapse_client=syn, delete_source=True)
```

### Example: Getting data into a Grid for a record-based workflow

The following example is for record-based curation.
It assumes your data is in a CSV file where each column is a property.

Here is the CSV used in the example:

```csv
Sex,Component,Diagnosis,PatientID,CancerType,YearofBirth,FamilyHistory
Male,Patient,Healthy,id1,,1970,
Female,Patient,Healthy,id2,,1980,
```

```python
import uuid
from synapseclient import Synapse
from synapseclient.models import Folder
from synapseclient.extensions.curator import create_record_based_metadata_task

# 1. Replace all these values with your own information
PROJECT_ID = "syn68175188"
CSV_PATH = "patient.csv"
JSON_SCHEMA_URI = "dpetest-test.schematic.Patient"
INSTRUCTIONS = "Please curate the patient information."
RECORD_SET_DESCRIPTION = "A record set for patients created for a record-based curation task example."
UPSERT_KEYS = ["PatientID"]
# These names end in a short random hex suffix (presumably so that repeated
# runs of the example do not collide with entities created earlier).
FOLDER_NAME = f"Patient Curation Folder {uuid.uuid4().hex[:8]}"
CURATION_TASK_NAME = f"Record-based curation task for patients {uuid.uuid4().hex[:8]}"
RECORD_SET_NAME = f"Patient Record Set {uuid.uuid4().hex[:8]}"

# 2. Login to Synapse
syn = Synapse()
syn.login()

# 3. Create a folder to store the RecordSet in
curation_folder = Folder(name=FOLDER_NAME, parent_id=PROJECT_ID)
curation_folder = curation_folder.store(synapse_client=syn)
print(f"Created folder with ID: {curation_folder.id}")

# 4. Create RecordSet, CurationTask, and Grid
record_set, task, grid = create_record_based_metadata_task(
    synapse_client=syn,
    folder_id=curation_folder.id,
    schema_uri=JSON_SCHEMA_URI,
    record_set_name=RECORD_SET_NAME,
    record_set_description=RECORD_SET_DESCRIPTION,
    upsert_keys=UPSERT_KEYS,
    curation_task_name=CURATION_TASK_NAME,
    instructions=INSTRUCTIONS,
)
print(f"Created RecordSet with ID: {record_set.id}")
print(f"Created CurationTask with ID: {task.task_id}")
print(f"Created Grid with ID: {grid.session_id}")

# 5. Point the record set at the CSV file and store it.
# TODO: https://sagebionetworks.jira.com/browse/SYNPY-1781
# Once SYNPY-1781 is finished add code here for uploading data from a CSV file into a grid session.
record_set.get(synapse_client=syn)
record_set.path = CSV_PATH
record_set = record_set.store(synapse_client=syn)

# 6. Cleanup all Synapse entities created
curation_folder.delete(synapse_client=syn)
task.delete(synapse_client=syn, delete_source=True)
```

### Example: Complete validation workflow for animal study metadata

This example demonstrates the full workflow from creating a curation task through validating the submitted metadata:
Expand Down
Loading