diff --git a/docs/guides/extensions/curator/metadata_curation.md b/docs/guides/extensions/curator/metadata_curation.md index ea13b93f2..60e3375cf 100644 --- a/docs/guides/extensions/curator/metadata_curation.md +++ b/docs/guides/extensions/curator/metadata_curation.md @@ -272,6 +272,139 @@ else: print("No validation results available. The Grid session must be exported to generate validation results.") ``` +### Example: Getting data into a Grid for a file-based workflow + +The following example is for file-based curation. +It assumes your data is in a CSV file where each column is a property. + +Here is the CSV used in the example: + +```csv +SampleID,Component,PatientID,TissueStatus +id1,biospecimen,pid1,Healthy +id1,biospecimen,pid2,Malignant +``` + +```python +import pandas as pd +import uuid +from synapseclient import Synapse +from synapseclient.models import CurationTask, Folder, File +from synapseclient.core.utils import make_bogus_data_file +from synapseclient.extensions.curator import create_file_based_metadata_task + +# 1. Replace all these values with your own information +PROJECT_ID = "syn68175188" +FOLDER_NAME = f"Biospecimen Curation Folder {uuid.uuid4().hex[:8]}" +CSV_PATH = "biospecimen.csv" +JSON_SCHEMA_URI = "dpetest-test.schematic.Biospecimen" +CURATION_TASK_NAME = f"File-based curation task for biospecimens {uuid.uuid4().hex[:8]}" +INSTRUCTIONS = "Please curate the biospecimen information." + +# 2. Log in to Synapse +syn = Synapse() +syn.login() + +# 3. Get annotations from CSV file +annotations = pd.read_csv(CSV_PATH).to_dict(orient="records") + +# 4. Create a folder to store the files that will be used for curation +folder = Folder(name=FOLDER_NAME, parent_id=PROJECT_ID) +folder = folder.store(synapse_client=syn) +print(f"Created folder with ID: {folder.id}") + +# 5. 
Create files, annotate them, and store them in the Synapse folder +path_to_file1 = make_bogus_data_file(n=5) +file1 = File(path=path_to_file1, parent_id=folder.id, annotations=annotations[0]) +file1 = file1.store(synapse_client=syn) +print(f"Created file with ID: {file1.id}") +path_to_file2 = make_bogus_data_file(n=5) +file2 = File(path=path_to_file2, parent_id=folder.id, annotations=annotations[1]) +file2 = file2.store(synapse_client=syn) +print(f"Created file with ID: {file2.id}") + +# 6. Create EntityView and CurationTask +view_id, task_id = create_file_based_metadata_task( + folder_id=folder.id, + curation_task_name=CURATION_TASK_NAME, + instructions=INSTRUCTIONS, + schema_uri=JSON_SCHEMA_URI, + synapse_client=syn, +) +print(f"Created EntityView with ID: {view_id}") +print(f"Created CurationTask with ID: {task_id}") + +# 7. Clean up all Synapse entities created +folder.delete(synapse_client=syn) +CurationTask(task_id=task_id).delete(synapse_client=syn, delete_source=True) +``` + +### Example: Getting data into a Grid for a record-based workflow + +The following example is for record-based curation. +It assumes your data is in a CSV file where each column is a property. + +Here is the CSV used in the example: + +```csv +Sex,Component,Diagnosis,PatientID,CancerType,YearofBirth,FamilyHistory +Male,Patient,Healthy,id1,,1970, +Female,Patient,Healthy,id2,,1980, +``` + +```python +import uuid +from synapseclient import Synapse +from synapseclient.models import Folder +from synapseclient.extensions.curator import create_record_based_metadata_task + +# 1. Replace all these values with your own information +PROJECT_ID = "syn68175188" +FOLDER_NAME = f"Patient Curation Folder {uuid.uuid4().hex[:8]}" +CSV_PATH = "patient.csv" +JSON_SCHEMA_URI = "dpetest-test.schematic.Patient" +CURATION_TASK_NAME = f"Record-based curation task for patients {uuid.uuid4().hex[:8]}" +INSTRUCTIONS = "Please curate the patient information." 
+RECORD_SET_NAME = f"Patient Record Set {uuid.uuid4().hex[:8]}" +RECORD_SET_DESCRIPTION = "A record set for patients created for a record-based curation task example." +UPSERT_KEYS = ["PatientID"] + +# 2. Log in to Synapse +syn = Synapse() +syn.login() + +# 3. Create a folder to store the RecordSet in +folder = Folder(name=FOLDER_NAME, parent_id=PROJECT_ID) +folder = folder.store(synapse_client=syn) +print(f"Created folder with ID: {folder.id}") + +# 4. Create RecordSet, CurationTask, and Grid +record_set, task, grid = create_record_based_metadata_task( + folder_id=folder.id, + record_set_name=RECORD_SET_NAME, + record_set_description=RECORD_SET_DESCRIPTION, + curation_task_name=CURATION_TASK_NAME, + upsert_keys=UPSERT_KEYS, + instructions=INSTRUCTIONS, + schema_uri=JSON_SCHEMA_URI, + synapse_client=syn, +) +print(f"Created RecordSet with ID: {record_set.id}") +print(f"Created CurationTask with ID: {task.task_id}") +print(f"Created Grid with ID: {grid.session_id}") + +# 5. Set the record set's path to the CSV file and store the record set. +# TODO: https://sagebionetworks.jira.com/browse/SYNPY-1781 +# Once SYNPY-1781 is finished, add code here for uploading data from a CSV file into a grid session. +record_set.get(synapse_client=syn) +record_set.path = CSV_PATH +record_set = record_set.store(synapse_client=syn) + +# 6. Clean up all Synapse entities created +folder.delete(synapse_client=syn) +task.delete(synapse_client=syn, delete_source=True) +``` + ### Example: Complete validation workflow for animal study metadata This example demonstrates the full workflow from creating a curation task through validating the submitted metadata: