From 9db5568d27d849d177d015e5dd6582a644bd0ed4 Mon Sep 17 00:00:00 2001 From: jniestroy Date: Thu, 12 Mar 2026 09:13:47 -0400 Subject: [PATCH 1/6] annotated things --- fairscape_models/__init__.py | 4 +- fairscape_models/annotated_computation.py | 71 ++++++++++++++++++++ fairscape_models/annotated_evidence_graph.py | 56 +++++++++++++++ 3 files changed, 130 insertions(+), 1 deletion(-) create mode 100644 fairscape_models/annotated_computation.py create mode 100644 fairscape_models/annotated_evidence_graph.py diff --git a/fairscape_models/__init__.py b/fairscape_models/__init__.py index 4a29b4c..2151792 100644 --- a/fairscape_models/__init__.py +++ b/fairscape_models/__init__.py @@ -12,4 +12,6 @@ from fairscape_models.rocrate import ROCrateV1_2, ROCrateMetadataElem, ROCrateMetadataFileElem, ROCrateDistribution, GenericMetadataElem from fairscape_models.sample import Sample from fairscape_models.model_card import ModelCard -from fairscape_models.experiment import Experiment \ No newline at end of file +from fairscape_models.experiment import Experiment +from fairscape_models.annotated_computation import AnnotatedComputation, CodeAnalysis, DatasetSummary +from fairscape_models.annotated_evidence_graph import AnnotatedEvidenceGraph \ No newline at end of file diff --git a/fairscape_models/annotated_computation.py b/fairscape_models/annotated_computation.py new file mode 100644 index 0000000..757f926 --- /dev/null +++ b/fairscape_models/annotated_computation.py @@ -0,0 +1,71 @@ +from pydantic import BaseModel, Field, ConfigDict, model_validator +from typing import Optional, List, Union + +from fairscape_models.fairscape_base import IdentifierValue, ANNOTATED_COMPUTATION_TYPE +from fairscape_models.digital_object import DigitalObject + + +class CodeAnalysis(BaseModel): + """Analysis of a software entity used in the computation.""" + model_config = ConfigDict(extra="allow", populate_by_name=True) + + software: IdentifierValue + name: Optional[str] = Field(default=None) + 
summary: str + keyFunctions: Optional[List[str]] = Field(default=None) + concerns: Optional[List[str]] = Field(default=None) + + +class DatasetSummary(BaseModel): + """Summary of a dataset's role in the computation.""" + model_config = ConfigDict(extra="allow", populate_by_name=True) + + dataset: IdentifierValue + name: Optional[str] = Field(default=None) + role: Optional[str] = Field(default=None) + description: Optional[str] = Field(default=None) + + +class AnnotatedComputation(DigitalObject): + """LLM-generated annotation of a single evi:Computation step. + + A DigitalObject (Document) that annotates an evi:Computation. + The original Computation stays in the graph in its original form; + this annotation points to it via evi:annotates. + """ + metadataType: Optional[Union[List[str], str]] = Field( + default=[ + 'prov:Entity', + "https://w3id.org/EVI#Annotation", + "https://w3id.org/EVI#AnnotatedComputation", + ], + alias="@type", + ) + additionalType: Optional[str] = Field(default=ANNOTATED_COMPUTATION_TYPE) + + # Points to the original Computation this annotates + annotates: IdentifierValue = Field(..., alias="evi:annotates") + + # LLM-generated content + stepSummary: str = Field(..., alias="evi:stepSummary") + codeAnalysis: Optional[List[CodeAnalysis]] = Field(default=[], alias="evi:codeAnalysis") + inputSummaries: Optional[List[DatasetSummary]] = Field(default=[], alias="evi:inputSummaries") + outputSummaries: Optional[List[DatasetSummary]] = Field(default=[], alias="evi:outputSummaries") + concerns: Optional[List[str]] = Field(default=[], alias="evi:concerns") + + # Provenance of the annotation itself + llmModel: str = Field(alias="evi:llmModel") + llmTemperature: Optional[float] = Field(default=None, alias="evi:llmTemperature") + dateCreated: str + interpreterVersion: Optional[str] = Field(default=None, alias="evi:interpreterVersion") + + @model_validator(mode='after') + def populate_prov_fields(self): + """Auto-populate PROV-O fields.""" + # 
prov:wasDerivedFrom -> the computation being annotated + self.wasDerivedFrom = [self.annotates] + + # prov:wasAttributedTo -> the LLM model + self.wasAttributedTo = [self.llmModel] + + return self diff --git a/fairscape_models/annotated_evidence_graph.py b/fairscape_models/annotated_evidence_graph.py new file mode 100644 index 0000000..f12ae72 --- /dev/null +++ b/fairscape_models/annotated_evidence_graph.py @@ -0,0 +1,56 @@ +from pydantic import Field, model_validator +from typing import Optional, List, Union, Dict, Any + +from fairscape_models.fairscape_base import IdentifierValue, ANNOTATED_EVIDENCE_GRAPH_TYPE +from fairscape_models.digital_object import DigitalObject + + +class AnnotatedEvidenceGraph(DigitalObject): + """Full annotated condensed evidence graph -- the graph-level LLM output. + + Contains all original crate entities plus AnnotatedComputation nodes + in a flat dict keyed by @id. Computation nodes are replaced by their + annotated supersets. DAG is reconstructable from cross-references + (generatedBy, usedDataset, evi:annotates, etc.). 
+ """ + metadataType: Optional[Union[List[str], str]] = Field( + default=[ + 'prov:Entity', + "https://w3id.org/EVI#EvidenceGraph", + "https://w3id.org/EVI#AnnotatedEvidenceGraph", + ], + alias="@type", + ) + additionalType: Optional[str] = Field(default=ANNOTATED_EVIDENCE_GRAPH_TYPE) + + # Reference to the original evidence graph or RO-Crate root + annotates: IdentifierValue = Field(..., alias="evi:annotates") + + # Flat entity lookup -- all entities keyed by ARK @id + graph: Dict[str, Any] = Field(..., alias="@graph") + + # Graph-level LLM outputs + executiveSummary: str = Field(..., alias="evi:executiveSummary") + narrativeSummary: str = Field(..., alias="evi:narrativeSummary") + keyFindings: Optional[List[str]] = Field(default=[], alias="evi:keyFindings") + concerns: Optional[List[str]] = Field(default=[], alias="evi:concerns") + + # Quick index of all AnnotatedComputation @ids in the graph + stepAnnotations: Optional[List[IdentifierValue]] = Field(default=[], alias="evi:stepAnnotations") + + # Provenance of the graph-level analysis + llmModel: str = Field(alias="evi:llmModel") + llmTemperature: Optional[float] = Field(default=None, alias="evi:llmTemperature") + dateCreated: str + interpreterVersion: Optional[str] = Field(default=None, alias="evi:interpreterVersion") + + @model_validator(mode='after') + def populate_prov_fields(self): + """Auto-populate PROV-O fields.""" + # prov:wasDerivedFrom -> the original evidence graph + self.wasDerivedFrom = [self.annotates] + + # prov:wasAttributedTo -> the LLM + self.wasAttributedTo = [self.llmModel] + + return self From 7b856b3619d7511dc89b60c4b3712967ea8d0b24 Mon Sep 17 00:00:00 2001 From: jniestroy Date: Thu, 12 Mar 2026 09:14:08 -0400 Subject: [PATCH 2/6] more --- fairscape_models/computation.py | 1 + fairscape_models/fairscape_base.py | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/fairscape_models/computation.py b/fairscape_models/computation.py index 2391f23..114a61a 100644 --- 
a/fairscape_models/computation.py +++ b/fairscape_models/computation.py @@ -14,6 +14,7 @@ class Computation(Activity): usedSoftware: Optional[List[IdentifierValue]] = Field(default=[]) usedMLModel: Optional[List[IdentifierValue]] = Field(default=[]) usedDataset: Optional[List[IdentifierValue]] = Field(default=[]) + annotatedBy: Optional[List[IdentifierValue]] = Field(default=[], alias="evi:annotatedBy") @model_validator(mode='after') def populate_prov_fields(self): diff --git a/fairscape_models/fairscape_base.py b/fairscape_models/fairscape_base.py index 2782621..f071f51 100644 --- a/fairscape_models/fairscape_base.py +++ b/fairscape_models/fairscape_base.py @@ -23,6 +23,8 @@ MLMODEL_TYPE = "MLModel" COMPUTATION_TYPE = "Computation" ANNOTATION_TYPE = "Annotation" +ANNOTATED_COMPUTATION_TYPE = "AnnotatedComputation" +ANNOTATED_EVIDENCE_GRAPH_TYPE = "AnnotatedEvidenceGraph" ROCRATE_TYPE = "ROCrate" # TODO get from config @@ -51,6 +53,10 @@ "@id": "https://w3id.org/EVI#generated", "@type": "@id" }, + "annotates": { + "@id": "https://w3id.org/EVI#annotates", + "@type": "@id" + }, "hasDistribution": { "@id": "https://w3id.org/EVI#hasDistribution", "@type": "@id" @@ -65,6 +71,8 @@ class ClassType(str, Enum): ANNOTATION = 'Annotation' SCHEMA = 'Schema' EVIDENCE_GRAPH = 'EvidenceGraph' + ANNOTATED_COMPUTATION = 'AnnotatedComputation' + ANNOTATED_EVIDENCE_GRAPH = 'AnnotatedEvidenceGraph' ROCRATE = 'ROCrate' #TODO: Add ROCrate concept to EVI ontology and publish a new version def normalize_class_type(value: Union[str, ClassType]) -> ClassType: From 0e614deafeabbf4e7bc693ff935c8bd7911e4fb1 Mon Sep 17 00:00:00 2001 From: jniestroy Date: Wed, 18 Mar 2026 09:01:55 -0400 Subject: [PATCH 3/6] move things --- fairscape_models/rocrate.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fairscape_models/rocrate.py b/fairscape_models/rocrate.py index bc3a239..9e3cfda 100644 --- a/fairscape_models/rocrate.py +++ b/fairscape_models/rocrate.py @@ -131,8 +131,7 @@ 
class ROCrateMetadataElem(BaseModel): humanSubjects: Optional[str] = Field(alias="humanSubjects", default=None) humanSubjectResearch: Optional[str] = Field(default=None) dataGovernanceCommittee: Optional[str] = Field(default=None) - completeness: Optional[str] = Field(alias="completeness", default=None) - prohibitedUses: Optional[str] = Field(alias="prohibitedUses", default=None) + # RAI fields rai_data_limitations: Optional[str] = Field(alias="rai:dataLimitations", default=None) @@ -154,6 +153,8 @@ class ROCrateMetadataElem(BaseModel): rai_data_social_impact: Optional[str] = Field(alias="rai:dataSocialImpact", default=None) rai_annotations_per_item: Optional[str] = Field(alias="rai:annotationsPerItem", default=None) rai_machine_annotation_tools: Optional[List[str]] = Field(alias="rai:machineAnnotationTools", default=None) + completeness: Optional[str] = Field(alias="completeness", default=None) + prohibitedUses: Optional[str] = Field(alias="prohibitedUses", default=None) # Aggregated metrics for AI-Ready scoring (roll-up properties from sub-crates) evi_dataset_count: Optional[int] = Field(alias="evi:datasetCount", default=None) From c5097ac8345864385c46c433987ce21623f1fe5d Mon Sep 17 00:00:00 2001 From: jniestroy Date: Wed, 18 Mar 2026 11:15:32 -0400 Subject: [PATCH 4/6] irb --- fairscape_models/__init__.py | 2 +- .../conversion/mapping/FairscapeDatasheet.py | 20 +++++++++-- .../conversion/models/FairscapeDatasheet.py | 8 ++--- fairscape_models/rocrate.py | 36 ++++++++++++++++++- 4 files changed, 57 insertions(+), 9 deletions(-) diff --git a/fairscape_models/__init__.py b/fairscape_models/__init__.py index 2151792..ac8cdc5 100644 --- a/fairscape_models/__init__.py +++ b/fairscape_models/__init__.py @@ -9,7 +9,7 @@ from fairscape_models.fairscape_base import IdentifierValue, IdentifierPropertyValue, DEFAULT_ARK_NAAN, DEFAULT_LICENSE, DEFAULT_CONTEXT from fairscape_models.medical_condition import MedicalCondition from fairscape_models.schema import Schema -from 
fairscape_models.rocrate import ROCrateV1_2, ROCrateMetadataElem, ROCrateMetadataFileElem, ROCrateDistribution, GenericMetadataElem +from fairscape_models.rocrate import ROCrateV1_2, ROCrateMetadataElem, ROCrateMetadataFileElem, ROCrateDistribution, GenericMetadataElem, IRB, ContactPoint, PostalAddress from fairscape_models.sample import Sample from fairscape_models.model_card import ModelCard from fairscape_models.experiment import Experiment diff --git a/fairscape_models/conversion/mapping/FairscapeDatasheet.py b/fairscape_models/conversion/mapping/FairscapeDatasheet.py index 9bcb67c..6d29984 100644 --- a/fairscape_models/conversion/mapping/FairscapeDatasheet.py +++ b/fairscape_models/conversion/mapping/FairscapeDatasheet.py @@ -38,6 +38,20 @@ def _parser(prop_list: Any) -> Optional[str]: return default return _parser +def _bool_to_yes_no(value: Any) -> str: + """Convert a boolean to 'Yes'/'No' string for display.""" + if isinstance(value, bool): + return "Yes" if value else "No" + return str(value) + +def _irb_passthrough(value: Any) -> Any: + """Pass IRB value through as-is — string or dict (structured IRB).""" + if isinstance(value, dict): + return value + if isinstance(value, str): + return value + return None + def _extract_id(value: Any) -> Optional[str]: if isinstance(value, dict): return value.get("@id") @@ -80,9 +94,9 @@ def _extract_id(value: Any) -> Optional[str]: "human_subject": {"source_key": "humanSubjects", "fallback_source_key": "additionalProperty", "fallback_parser": from_additional_property("Human Subject")}, "human_subject_research": {"source_key": "humanSubjectResearch", "fallback_source_key": "additionalProperty", "fallback_parser": from_additional_property("Human Subject Research", "")}, "human_subject_exemptions": {"source_key": "humanSubjectExemption", "fallback_source_key": "additionalProperty", "fallback_parser": from_additional_property("Human Subjects Exemptions", "")}, - "deidentified_samples": {"source_key": "deidentified", 
"fallback_source_key": "additionalProperty", "fallback_parser": from_additional_property("De-identified Samples", "")}, - "fda_regulated": {"source_key": "fdaRegulated", "fallback_source_key": "additionalProperty", "fallback_parser": from_additional_property("FDA Regulated", "")}, - "irb": {"source_key": "irb", "fallback_source_key": "additionalProperty", "fallback_parser": from_additional_property("IRB", "")}, + "deidentified_samples": {"source_key": "deidentified", "parser": _bool_to_yes_no, "fallback_source_key": "additionalProperty", "fallback_parser": from_additional_property("De-identified Samples", "")}, + "fda_regulated": {"source_key": "fdaRegulated", "parser": _bool_to_yes_no, "fallback_source_key": "additionalProperty", "fallback_parser": from_additional_property("FDA Regulated", "")}, + "irb": {"source_key": "irb", "parser": _irb_passthrough, "fallback_source_key": "additionalProperty", "fallback_parser": from_additional_property("IRB", "")}, "irb_protocol_id": {"source_key": "irbProtocolId", "fallback_source_key": "additionalProperty", "fallback_parser": from_additional_property("IRB Protocol ID", "")}, "data_governance": {"source_key": "dataGovernanceCommittee","fallback_source_key": "additionalProperty", "fallback_parser": from_additional_property("Data Governance Committee")}, "completeness": {"source_key": "completeness", "fallback_source_key": "additionalProperty", "fallback_parser": from_additional_property("Completeness")}, diff --git a/fairscape_models/conversion/models/FairscapeDatasheet.py b/fairscape_models/conversion/models/FairscapeDatasheet.py index 93feba0..03b7933 100644 --- a/fairscape_models/conversion/models/FairscapeDatasheet.py +++ b/fairscape_models/conversion/models/FairscapeDatasheet.py @@ -1,5 +1,5 @@ from typing import Any, Dict, List, Optional, Union, Callable, Tuple -from pydantic import BaseModel, Field, ConfigDict +from pydantic import BaseModel, Field, ConfigDict, model_validator 
########################################################################## # --- Main Document Subsections ------------------------------------------ @@ -40,10 +40,10 @@ class OverviewSection(BaseModel): human_subject: Optional[str] = None human_subject_research: Optional[str] = None human_subject_exemptions: Optional[str] = None - deidentified_samples: Optional[str] = None - fda_regulated: Optional[str] = None + deidentified_samples: Optional[Union[str, bool]] = None + fda_regulated: Optional[Union[str, bool]] = None confidentiality_level: Optional[str] = None - irb: Optional[str] = None + irb: Optional[Union[str, Dict[str, Any]]] = None irb_protocol_id: Optional[str] = None ethical_review: Optional[str] = None diff --git a/fairscape_models/rocrate.py b/fairscape_models/rocrate.py index 9e3cfda..e250de0 100644 --- a/fairscape_models/rocrate.py +++ b/fairscape_models/rocrate.py @@ -20,6 +20,35 @@ from fairscape_models.digital_object import DigitalObject from fairscape_models._version import __version__ +class ContactPoint(BaseModel): + """Schema.org ContactPoint for structured contact information.""" + metadataType: str = Field(default="ContactPoint", alias="@type") + contactType: Optional[str] = Field(default=None) + email: Optional[str] = Field(default=None) + telephone: Optional[str] = Field(default=None) + model_config = ConfigDict(extra="allow", populate_by_name=True) + + +class PostalAddress(BaseModel): + """Schema.org PostalAddress for structured address information.""" + metadataType: str = Field(default="PostalAddress", alias="@type") + streetAddress: Optional[str] = Field(default=None) + addressLocality: Optional[str] = Field(default=None) + addressRegion: Optional[str] = Field(default=None) + postalCode: Optional[str] = Field(default=None) + addressCountry: Optional[str] = Field(default=None) + model_config = ConfigDict(extra="allow", populate_by_name=True) + + +class IRB(BaseModel): + """Institutional Review Board with structured contact and address 
info.""" + metadataType: str = Field(default="IRB", alias="@type") + name: str + contactPoint: Optional[ContactPoint] = Field(default=None) + address: Optional[PostalAddress] = Field(default=None) + model_config = ConfigDict(extra="allow", populate_by_name=True) + + class GenericMetadataElem(BaseModel): """Generic Metadata Element of an ROCrate""" guid: str = Field(alias="@id") @@ -123,7 +152,7 @@ class ROCrateMetadataElem(BaseModel): # Compliance / ethics ethicalReview: Optional[str] = Field(default=None) confidentialityLevel: Optional[str] = Field(default=None) - irb: Optional[str] = Field(default=None) + irb: Optional[Union[str, IRB]] = Field(default=None) irbProtocolId: Optional[str] = Field(default=None) humanSubjectExemption: Optional[str] = Field(default=None) fdaRegulated: Optional[bool] = Field(default=None) @@ -131,6 +160,11 @@ class ROCrateMetadataElem(BaseModel): humanSubjects: Optional[str] = Field(alias="humanSubjects", default=None) humanSubjectResearch: Optional[str] = Field(default=None) dataGovernanceCommittee: Optional[str] = Field(default=None) + + # Checksums + md5: Optional[str] = Field(default=None, description="MD5 checksum of the digital object content") + hash: Optional[str] = Field(default=None, description="Hash of the digital object content (if not MD5)") + sha256: Optional[str] = Field(default=None, description="SHA-256 checksum of the digital object content") # RAI fields From 2cb9b66875aad046fe5add304e3e03b1e7509287 Mon Sep 17 00:00:00 2001 From: jniestroy Date: Wed, 18 Mar 2026 11:15:55 -0400 Subject: [PATCH 5/6] hash --- fairscape_models/conversion/mapping/AIReady.py | 2 +- fairscape_models/digital_object.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/fairscape_models/conversion/mapping/AIReady.py b/fairscape_models/conversion/mapping/AIReady.py index 434bbe5..c9b77f5 100644 --- a/fairscape_models/conversion/mapping/AIReady.py +++ b/fairscape_models/conversion/mapping/AIReady.py @@ -295,7 +295,7 @@ def 
_score_pre_model(pre_model: PreModelExplainabilityScore, root_data: Dict[str if "Dataset" in entity_type or "Software" in entity_type or "ROCrate" in entity_type: total += 1 - if entity.get("md5") or entity.get("MD5"): + if entity.get("md5") or entity.get("MD5" or "") or entity.get("sha256") or entity.get("SHA256" or "") or entity.get("hash") or entity.get("hash" or ""): with_checksum += 1 if total > 0 and with_checksum > 0: diff --git a/fairscape_models/digital_object.py b/fairscape_models/digital_object.py index cc741fe..a0d1d8e 100644 --- a/fairscape_models/digital_object.py +++ b/fairscape_models/digital_object.py @@ -17,6 +17,9 @@ class DigitalObject(BaseModel): contentUrl: Optional[Union[str, List[str]]] = Field(default=None) isPartOf: Optional[List[IdentifierValue]] = Field(default=[]) usedByComputation: Optional[List[IdentifierValue]] = Field(default=[]) + md5: Optional[str] = Field(default=None, description="MD5 checksum of the digital object content") + hash: Optional[str] = Field(default=None, description="Hash of the digital object content (if not MD5)") + sha256: Optional[str] = Field(default=None, description="SHA-256 checksum of the digital object content") fairscapeVersion: str = __version__ # PROV-O fields (auto-populated) From 825ba98294f3e52888ba397291df1f3da7219eca Mon Sep 17 00:00:00 2001 From: jniestroy Date: Wed, 18 Mar 2026 11:20:46 -0400 Subject: [PATCH 6/6] full coverage --- tests/test_annotated.py | 146 +++++++++++++++ tests/test_rocrate_validation.py | 296 ++++++++++++++++++++++++++++++- 2 files changed, 440 insertions(+), 2 deletions(-) create mode 100644 tests/test_annotated.py diff --git a/tests/test_annotated.py b/tests/test_annotated.py new file mode 100644 index 0000000..57c5164 --- /dev/null +++ b/tests/test_annotated.py @@ -0,0 +1,146 @@ +# tests/test_annotated.py + +import pytest +from fairscape_models.annotated_computation import AnnotatedComputation, CodeAnalysis, DatasetSummary +from fairscape_models.annotated_evidence_graph 
import AnnotatedEvidenceGraph + + +def _make_annotated_computation(**overrides): + base = { + "@id": "ark:59852/annotation-001", + "name": "Annotation of step 1", + "author": "claude-opus-4", + "description": "LLM annotation of computation step 1", + "evi:annotates": {"@id": "ark:59852/computation-001"}, + "evi:stepSummary": "This step loads raw data and normalises it.", + "evi:llmModel": "claude-opus-4", + "dateCreated": "2026-03-18", + } + base.update(overrides) + return AnnotatedComputation.model_validate(base) + + +def _make_annotated_evidence_graph(**overrides): + base = { + "@id": "ark:59852/aeg-001", + "name": "Annotated Evidence Graph", + "author": "claude-opus-4", + "description": "Full annotated evidence graph for the crate", + "evi:annotates": {"@id": "ark:59852/rocrate-001"}, + "@graph": { + "ark:59852/computation-001": {"@id": "ark:59852/computation-001", "@type": "Computation"}, + }, + "evi:executiveSummary": "Pipeline processes raw data into embeddings.", + "evi:narrativeSummary": "The pipeline consists of three steps.", + "evi:llmModel": "claude-opus-4", + "dateCreated": "2026-03-18", + } + base.update(overrides) + return AnnotatedEvidenceGraph.model_validate(base) + + +class TestAnnotatedComputation: + + def test_prov_fields_auto_populated(self): + ac = _make_annotated_computation() + # wasDerivedFrom should point to the annotated computation + assert len(ac.wasDerivedFrom) == 1 + assert ac.wasDerivedFrom[0].guid == "ark:59852/computation-001" + # wasAttributedTo should point to the LLM model + assert ac.wasAttributedTo == ["claude-opus-4"] + + def test_minimal_fields(self): + ac = _make_annotated_computation() + assert ac.name == "Annotation of step 1" + assert ac.stepSummary == "This step loads raw data and normalises it." 
+ assert ac.llmModel == "claude-opus-4" + assert ac.annotates.guid == "ark:59852/computation-001" + + def test_with_code_analysis(self): + ac = _make_annotated_computation(**{ + "evi:codeAnalysis": [ + { + "software": {"@id": "ark:59852/software-001"}, + "summary": "Reads CSV files and applies z-score normalisation.", + "keyFunctions": ["pandas.read_csv", "scipy.stats.zscore"], + "concerns": ["No null-value handling"], + } + ], + }) + assert len(ac.codeAnalysis) == 1 + assert isinstance(ac.codeAnalysis[0], CodeAnalysis) + assert ac.codeAnalysis[0].software.guid == "ark:59852/software-001" + assert "pandas.read_csv" in ac.codeAnalysis[0].keyFunctions + + def test_with_dataset_summaries(self): + ac = _make_annotated_computation(**{ + "evi:inputSummaries": [ + { + "dataset": {"@id": "ark:59852/dataset-001"}, + "name": "Raw data", + "role": "input", + "description": "Raw measurements", + } + ], + "evi:outputSummaries": [ + { + "dataset": {"@id": "ark:59852/dataset-002"}, + "role": "output", + } + ], + }) + assert len(ac.inputSummaries) == 1 + assert isinstance(ac.inputSummaries[0], DatasetSummary) + assert ac.inputSummaries[0].dataset.guid == "ark:59852/dataset-001" + assert len(ac.outputSummaries) == 1 + assert ac.outputSummaries[0].dataset.guid == "ark:59852/dataset-002" + + def test_serialization_roundtrip(self): + ac = _make_annotated_computation() + dumped = ac.model_dump(by_alias=True) + assert dumped["evi:stepSummary"] == "This step loads raw data and normalises it." 
+ assert dumped["evi:annotates"]["@id"] == "ark:59852/computation-001" + assert dumped["evi:llmModel"] == "claude-opus-4" + restored = AnnotatedComputation.model_validate(dumped) + assert restored.stepSummary == ac.stepSummary + + +class TestAnnotatedEvidenceGraph: + + def test_prov_fields_auto_populated(self): + aeg = _make_annotated_evidence_graph() + # wasDerivedFrom should point to the annotated crate + assert len(aeg.wasDerivedFrom) == 1 + assert aeg.wasDerivedFrom[0].guid == "ark:59852/rocrate-001" + # wasAttributedTo should point to the LLM model + assert aeg.wasAttributedTo == ["claude-opus-4"] + + def test_minimal_fields(self): + aeg = _make_annotated_evidence_graph() + assert aeg.executiveSummary == "Pipeline processes raw data into embeddings." + assert aeg.narrativeSummary == "The pipeline consists of three steps." + assert aeg.annotates.guid == "ark:59852/rocrate-001" + assert "ark:59852/computation-001" in aeg.graph + + def test_with_optional_fields(self): + aeg = _make_annotated_evidence_graph(**{ + "evi:keyFindings": ["Finding 1", "Finding 2"], + "evi:concerns": ["Missing validation step"], + "evi:llmTemperature": 0.7, + "evi:interpreterVersion": "1.2.0", + "evi:stepAnnotations": [{"@id": "ark:59852/annotation-001"}], + }) + assert aeg.keyFindings == ["Finding 1", "Finding 2"] + assert aeg.concerns == ["Missing validation step"] + assert aeg.llmTemperature == 0.7 + assert aeg.interpreterVersion == "1.2.0" + assert len(aeg.stepAnnotations) == 1 + assert aeg.stepAnnotations[0].guid == "ark:59852/annotation-001" + + def test_serialization_roundtrip(self): + aeg = _make_annotated_evidence_graph() + dumped = aeg.model_dump(by_alias=True) + assert dumped["evi:executiveSummary"] == "Pipeline processes raw data into embeddings." 
+ assert dumped["evi:annotates"]["@id"] == "ark:59852/rocrate-001" + restored = AnnotatedEvidenceGraph.model_validate(dumped) + assert restored.executiveSummary == aeg.executiveSummary diff --git a/tests/test_rocrate_validation.py b/tests/test_rocrate_validation.py index d94bd1d..a23d6bc 100644 --- a/tests/test_rocrate_validation.py +++ b/tests/test_rocrate_validation.py @@ -8,7 +8,10 @@ ROCrateMetadataElem, GenericMetadataElem, BioChemEntity, - MedicalCondition + MedicalCondition, + IRB, + ContactPoint, + PostalAddress, ) from fairscape_models.dataset import Dataset from fairscape_models.software import Software @@ -446,4 +449,293 @@ def test_clean_identifiers_with_experiment(): assert experiment.usedSample[0].guid == "ark:59852/test-sample" assert experiment.usedTreatment[0].guid == "ark:59852/test-treatment" assert experiment.usedStain[0].guid == "ark:59852/test-stain" - assert experiment.generated[0].guid == "ark:59852/test-result" \ No newline at end of file + assert experiment.generated[0].guid == "ark:59852/test-result" + + +# ── IRB class tests ───────────────────────────────────────────────────── + +class TestContactPoint: + """Tests for the ContactPoint model.""" + + def test_defaults(self): + cp = ContactPoint() + assert cp.metadataType == "ContactPoint" + assert cp.contactType is None + assert cp.email is None + assert cp.telephone is None + + def test_full(self): + cp = ContactPoint( + contactType="IRB Reliance and Compliance", + email="irbreliance@mgb.org", + telephone="+1-857-282-1900", + ) + assert cp.contactType == "IRB Reliance and Compliance" + assert cp.email == "irbreliance@mgb.org" + assert cp.telephone == "+1-857-282-1900" + + def test_alias_serialization(self): + cp = ContactPoint(email="test@example.com") + dumped = cp.model_dump(by_alias=True) + assert dumped["@type"] == "ContactPoint" + assert "metadataType" not in dumped + + def test_from_dict_with_alias(self): + cp = ContactPoint.model_validate({ + "@type": "ContactPoint", + "email": 
"test@example.com", + }) + assert cp.metadataType == "ContactPoint" + assert cp.email == "test@example.com" + + def test_extra_fields_allowed(self): + cp = ContactPoint.model_validate({ + "email": "test@example.com", + "url": "https://example.com", + }) + assert cp.email == "test@example.com" + + +class TestPostalAddress: + """Tests for the PostalAddress model.""" + + def test_defaults(self): + addr = PostalAddress() + assert addr.metadataType == "PostalAddress" + assert addr.streetAddress is None + assert addr.addressLocality is None + assert addr.addressRegion is None + assert addr.postalCode is None + assert addr.addressCountry is None + + def test_full(self): + addr = PostalAddress( + streetAddress="399 Revolution Drive, Suite 710", + addressLocality="Somerville", + addressRegion="MA", + postalCode="02145", + addressCountry="US", + ) + assert addr.streetAddress == "399 Revolution Drive, Suite 710" + assert addr.addressLocality == "Somerville" + assert addr.addressRegion == "MA" + assert addr.postalCode == "02145" + assert addr.addressCountry == "US" + + def test_alias_serialization(self): + addr = PostalAddress(addressLocality="Boston") + dumped = addr.model_dump(by_alias=True) + assert dumped["@type"] == "PostalAddress" + + def test_from_dict_with_alias(self): + addr = PostalAddress.model_validate({ + "@type": "PostalAddress", + "addressLocality": "Boston", + "addressRegion": "MA", + }) + assert addr.addressLocality == "Boston" + assert addr.addressRegion == "MA" + + +class TestIRB: + """Tests for the IRB model.""" + + def test_minimal(self): + irb = IRB(name="Test IRB") + assert irb.metadataType == "IRB" + assert irb.name == "Test IRB" + assert irb.contactPoint is None + assert irb.address is None + + def test_name_required(self): + with pytest.raises(ValidationError, match="name"): + IRB.model_validate({}) + + def test_full_mgb_irb(self): + """Full MGB IRB example from requirements.""" + irb = IRB( + name="Mass General Brigham Institutional Review Board (MGB 
IRB)", + contactPoint=ContactPoint( + contactType="IRB Reliance and Compliance", + email="irbreliance@mgb.org", + telephone="+1-857-282-1900", + ), + address=PostalAddress( + streetAddress="399 Revolution Drive, Suite 710", + addressLocality="Somerville", + addressRegion="MA", + postalCode="02145", + addressCountry="US", + ), + ) + assert irb.name == "Mass General Brigham Institutional Review Board (MGB IRB)" + assert irb.contactPoint.email == "irbreliance@mgb.org" + assert irb.contactPoint.telephone == "+1-857-282-1900" + assert irb.contactPoint.contactType == "IRB Reliance and Compliance" + assert irb.address.streetAddress == "399 Revolution Drive, Suite 710" + assert irb.address.addressLocality == "Somerville" + assert irb.address.postalCode == "02145" + + def test_alias_serialization(self): + irb = IRB(name="Test IRB") + dumped = irb.model_dump(by_alias=True) + assert dumped["@type"] == "IRB" + assert dumped["name"] == "Test IRB" + + def test_from_dict_with_alias(self): + irb = IRB.model_validate({ + "@type": "IRB", + "name": "Test IRB", + "contactPoint": { + "@type": "ContactPoint", + "email": "test@example.com", + }, + }) + assert irb.name == "Test IRB" + assert irb.contactPoint.email == "test@example.com" + + def test_nested_roundtrip(self): + """Serialize to dict and back — full nested structure survives.""" + irb = IRB( + name="MGB IRB", + contactPoint=ContactPoint(email="irb@mgb.org", telephone="+1-555-0100"), + address=PostalAddress(addressLocality="Somerville", addressRegion="MA"), + ) + dumped = irb.model_dump(by_alias=True) + restored = IRB.model_validate(dumped) + assert restored.name == irb.name + assert restored.contactPoint.email == irb.contactPoint.email + assert restored.contactPoint.telephone == irb.contactPoint.telephone + assert restored.address.addressLocality == irb.address.addressLocality + assert restored.address.addressRegion == irb.address.addressRegion + + def test_extra_fields_allowed(self): + irb = IRB.model_validate({ + "name": 
"Test IRB", + "irbNumber": "IRB-2024-001", + }) + assert irb.name == "Test IRB" + + +class TestROCrateMetadataElemIRB: + """Tests for IRB as a property on ROCrateMetadataElem.""" + + def test_irb_as_string(self): + elem = _minimal_rocrate_elem(irb="MGB IRB") + assert elem.irb == "MGB IRB" + + def test_irb_as_none(self): + elem = _minimal_rocrate_elem() + assert elem.irb is None + + def test_irb_as_structured_class(self): + elem = _minimal_rocrate_elem(irb={ + "@type": "IRB", + "name": "Mass General Brigham IRB", + "contactPoint": { + "@type": "ContactPoint", + "contactType": "IRB Reliance and Compliance", + "email": "irbreliance@mgb.org", + "telephone": "+1-857-282-1900", + }, + "address": { + "@type": "PostalAddress", + "streetAddress": "399 Revolution Drive, Suite 710", + "addressLocality": "Somerville", + "addressRegion": "MA", + "postalCode": "02145", + "addressCountry": "US", + }, + }) + assert isinstance(elem.irb, IRB) + assert elem.irb.name == "Mass General Brigham IRB" + assert elem.irb.contactPoint.email == "irbreliance@mgb.org" + assert elem.irb.address.addressLocality == "Somerville" + + def test_irb_structured_serialization(self): + """Structured IRB round-trips through ROCrateMetadataElem.""" + elem = _minimal_rocrate_elem(irb={ + "@type": "IRB", + "name": "Test IRB", + "contactPoint": {"@type": "ContactPoint", "email": "irb@test.edu"}, + }) + dumped = elem.model_dump(by_alias=True) + irb_data = dumped["irb"] + assert irb_data["@type"] == "IRB" + assert irb_data["name"] == "Test IRB" + assert irb_data["contactPoint"]["email"] == "irb@test.edu" + + def test_irb_string_serialization(self): + """String IRB round-trips through ROCrateMetadataElem.""" + elem = _minimal_rocrate_elem(irb="Simple IRB Name") + dumped = elem.model_dump(by_alias=True) + assert dumped["irb"] == "Simple IRB Name" + + def test_irb_in_full_rocrate(self): + """IRB class works inside a full ROCrate validation.""" + data = { + "@context": {}, + "@graph": [ + { + "@id": 
"ro-crate-metadata.json", + "@type": "CreativeWork", + "conformsTo": {"@id": "https://w3id.org/ro/crate/1.1"}, + "about": {"@id": "ark:59852/irb-crate"}, + }, + { + "@id": "ark:59852/irb-crate", + "@type": ["Dataset", "https://w3id.org/EVI#ROCrate"], + "name": "IRB Test Crate", + "description": "Crate with structured IRB", + "keywords": [], + "version": "1.0", + "author": "tester", + "license": "MIT", + "hasPart": [], + "irb": { + "@type": "IRB", + "name": "MGB IRB", + "contactPoint": { + "@type": "ContactPoint", + "email": "irb@mgb.org", + }, + }, + "irbProtocolId": "2024-P000123", + }, + ], + } + rocrate = ROCrateV1_2.model_validate(data) + meta = rocrate.getCrateMetadata() + assert isinstance(meta.irb, IRB) + assert meta.irb.name == "MGB IRB" + assert meta.irb.contactPoint.email == "irb@mgb.org" + assert meta.irbProtocolId == "2024-P000123" + + def test_irb_string_in_full_rocrate(self): + """String IRB still works inside a full ROCrate.""" + data = { + "@context": {}, + "@graph": [ + { + "@id": "ro-crate-metadata.json", + "@type": "CreativeWork", + "conformsTo": {"@id": "https://w3id.org/ro/crate/1.1"}, + "about": {"@id": "ark:59852/irb-crate"}, + }, + { + "@id": "ark:59852/irb-crate", + "@type": ["Dataset", "https://w3id.org/EVI#ROCrate"], + "name": "IRB Test Crate", + "description": "Crate with string IRB", + "keywords": [], + "version": "1.0", + "author": "tester", + "license": "MIT", + "hasPart": [], + "irb": "Mass General Brigham IRB", + }, + ], + } + rocrate = ROCrateV1_2.model_validate(data) + meta = rocrate.getCrateMetadata() + assert meta.irb == "Mass General Brigham IRB" \ No newline at end of file