Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions fairscape_models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@
from fairscape_models.fairscape_base import IdentifierValue, IdentifierPropertyValue, DEFAULT_ARK_NAAN, DEFAULT_LICENSE, DEFAULT_CONTEXT
from fairscape_models.medical_condition import MedicalCondition
from fairscape_models.schema import Schema
from fairscape_models.rocrate import ROCrateV1_2, ROCrateMetadataElem, ROCrateMetadataFileElem, ROCrateDistribution, GenericMetadataElem
from fairscape_models.rocrate import ROCrateV1_2, ROCrateMetadataElem, ROCrateMetadataFileElem, ROCrateDistribution, GenericMetadataElem, IRB, ContactPoint, PostalAddress
from fairscape_models.sample import Sample
from fairscape_models.model_card import ModelCard
from fairscape_models.experiment import Experiment
from fairscape_models.experiment import Experiment
from fairscape_models.annotated_computation import AnnotatedComputation, CodeAnalysis, DatasetSummary
from fairscape_models.annotated_evidence_graph import AnnotatedEvidenceGraph
71 changes: 71 additions & 0 deletions fairscape_models/annotated_computation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
from pydantic import BaseModel, Field, ConfigDict, model_validator
from typing import Optional, List, Union

from fairscape_models.fairscape_base import IdentifierValue, ANNOTATED_COMPUTATION_TYPE
from fairscape_models.digital_object import DigitalObject


class CodeAnalysis(BaseModel):
    """Describe one software entity that took part in a computation.

    Keys that are not declared below are retained (``extra="allow"``), and
    fields may be populated by attribute name as well as by alias
    (``populate_by_name=True``).
    """

    model_config = ConfigDict(populate_by_name=True, extra="allow")

    # Identifier of the software entity being analysed (required).
    software: IdentifierValue
    # Optional display name.
    name: Optional[str] = None
    # Free-text summary (required).
    summary: str
    # Optional lists of strings; both default to None when omitted.
    keyFunctions: Optional[List[str]] = None
    concerns: Optional[List[str]] = None


class DatasetSummary(BaseModel):
    """Summarise the part a single dataset plays in a computation.

    Keys that are not declared below are retained (``extra="allow"``), and
    fields may be populated by attribute name as well as by alias
    (``populate_by_name=True``).
    """

    model_config = ConfigDict(populate_by_name=True, extra="allow")

    # Identifier of the dataset being summarised (required).
    dataset: IdentifierValue
    # Everything else is optional and defaults to None.
    name: Optional[str] = None
    role: Optional[str] = None
    description: Optional[str] = None


class AnnotatedComputation(DigitalObject):
    """Annotation document attached to one ``evi:Computation`` step.

    The annotation is itself a DigitalObject (Document). The Computation it
    describes is left in the graph in its original form and is referenced
    from here through ``evi:annotates``. The narrative fields hold
    LLM-generated content.
    """

    metadataType: Optional[Union[List[str], str]] = Field(
        alias="@type",
        default=[
            "prov:Entity",
            "https://w3id.org/EVI#Annotation",
            "https://w3id.org/EVI#AnnotatedComputation",
        ],
    )
    additionalType: Optional[str] = ANNOTATED_COMPUTATION_TYPE

    # The original Computation this document annotates (required).
    annotates: IdentifierValue = Field(alias="evi:annotates")

    # LLM-generated content; only the step summary is required.
    stepSummary: str = Field(alias="evi:stepSummary")
    codeAnalysis: Optional[List[CodeAnalysis]] = Field([], alias="evi:codeAnalysis")
    inputSummaries: Optional[List[DatasetSummary]] = Field([], alias="evi:inputSummaries")
    outputSummaries: Optional[List[DatasetSummary]] = Field([], alias="evi:outputSummaries")
    concerns: Optional[List[str]] = Field([], alias="evi:concerns")

    # Provenance of the annotation itself.
    llmModel: str = Field(alias="evi:llmModel")
    llmTemperature: Optional[float] = Field(None, alias="evi:llmTemperature")
    dateCreated: str
    interpreterVersion: Optional[str] = Field(None, alias="evi:interpreterVersion")

    @model_validator(mode="after")
    def populate_prov_fields(self):
        """Fill the PROV-O links from this annotation's own fields.

        Runs after field validation and returns the same instance.
        """
        # prov:wasDerivedFrom -> the computation being annotated.
        annotated_step = self.annotates
        self.wasDerivedFrom = [annotated_step]

        # prov:wasAttributedTo -> the LLM model.
        # NOTE(review): llmModel is a plain str, so a bare string (not an
        # IdentifierValue) is stored here -- confirm this matches the type
        # the parent class declares for wasAttributedTo.
        producing_model = self.llmModel
        self.wasAttributedTo = [producing_model]

        return self
56 changes: 56 additions & 0 deletions fairscape_models/annotated_evidence_graph.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
from pydantic import Field, model_validator
from typing import Optional, List, Union, Dict, Any

from fairscape_models.fairscape_base import IdentifierValue, ANNOTATED_EVIDENCE_GRAPH_TYPE
from fairscape_models.digital_object import DigitalObject


class AnnotatedEvidenceGraph(DigitalObject):
    """Graph-level LLM output: the full annotated, condensed evidence graph.

    ``@graph`` is a flat dict keyed by ``@id`` that holds every original
    crate entity plus the AnnotatedComputation nodes; Computation nodes are
    replaced by their annotated supersets. The DAG can be rebuilt from the
    cross-references between entries (generatedBy, usedDataset,
    evi:annotates, etc.).
    """

    metadataType: Optional[Union[List[str], str]] = Field(
        alias="@type",
        default=[
            "prov:Entity",
            "https://w3id.org/EVI#EvidenceGraph",
            "https://w3id.org/EVI#AnnotatedEvidenceGraph",
        ],
    )
    additionalType: Optional[str] = ANNOTATED_EVIDENCE_GRAPH_TYPE

    # The original evidence graph or RO-Crate root being annotated (required).
    annotates: IdentifierValue = Field(alias="evi:annotates")

    # Flat entity lookup: every entity keyed by its ARK @id (required).
    graph: Dict[str, Any] = Field(alias="@graph")

    # Graph-level LLM outputs; both summaries are required.
    executiveSummary: str = Field(alias="evi:executiveSummary")
    narrativeSummary: str = Field(alias="evi:narrativeSummary")
    keyFindings: Optional[List[str]] = Field([], alias="evi:keyFindings")
    concerns: Optional[List[str]] = Field([], alias="evi:concerns")

    # Quick index of the AnnotatedComputation @ids present in the graph.
    stepAnnotations: Optional[List[IdentifierValue]] = Field([], alias="evi:stepAnnotations")

    # Provenance of the graph-level analysis.
    llmModel: str = Field(alias="evi:llmModel")
    llmTemperature: Optional[float] = Field(None, alias="evi:llmTemperature")
    dateCreated: str
    interpreterVersion: Optional[str] = Field(None, alias="evi:interpreterVersion")

    @model_validator(mode="after")
    def populate_prov_fields(self):
        """Fill the PROV-O links from this document's own fields.

        Runs after field validation and returns the same instance.
        """
        # prov:wasDerivedFrom -> the original evidence graph.
        source_graph = self.annotates
        self.wasDerivedFrom = [source_graph]

        # prov:wasAttributedTo -> the LLM.
        # NOTE(review): llmModel is a plain str, so a bare string (not an
        # IdentifierValue) is stored here -- confirm this matches the type
        # the parent class declares for wasAttributedTo.
        producing_model = self.llmModel
        self.wasAttributedTo = [producing_model]

        return self
1 change: 1 addition & 0 deletions fairscape_models/computation.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ class Computation(Activity):
usedSoftware: Optional[List[IdentifierValue]] = Field(default=[])
usedMLModel: Optional[List[IdentifierValue]] = Field(default=[])
usedDataset: Optional[List[IdentifierValue]] = Field(default=[])
annotatedBy: Optional[List[IdentifierValue]] = Field(default=[], alias="evi:annotatedBy")

@model_validator(mode='after')
def populate_prov_fields(self):
Expand Down
2 changes: 1 addition & 1 deletion fairscape_models/conversion/mapping/AIReady.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,7 @@ def _score_pre_model(pre_model: PreModelExplainabilityScore, root_data: Dict[str

if "Dataset" in entity_type or "Software" in entity_type or "ROCrate" in entity_type:
total += 1
if entity.get("md5") or entity.get("MD5"):
if entity.get("md5") or entity.get("MD5" or "") or entity.get("sha256") or entity.get("SHA256" or "") or entity.get("hash") or entity.get("hash" or ""):
with_checksum += 1

if total > 0 and with_checksum > 0:
Expand Down
20 changes: 17 additions & 3 deletions fairscape_models/conversion/mapping/FairscapeDatasheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,20 @@ def _parser(prop_list: Any) -> Optional[str]:
return default
return _parser

def _bool_to_yes_no(value: Any) -> str:
"""Convert a boolean to 'Yes'/'No' string for display."""
if isinstance(value, bool):
return "Yes" if value else "No"
return str(value)

def _irb_passthrough(value: Any) -> Any:
"""Pass IRB value through as-is — string or dict (structured IRB)."""
if isinstance(value, dict):
return value
if isinstance(value, str):
return value
return None

def _extract_id(value: Any) -> Optional[str]:
if isinstance(value, dict):
return value.get("@id")
Expand Down Expand Up @@ -80,9 +94,9 @@ def _extract_id(value: Any) -> Optional[str]:
"human_subject": {"source_key": "humanSubjects", "fallback_source_key": "additionalProperty", "fallback_parser": from_additional_property("Human Subject")},
"human_subject_research": {"source_key": "humanSubjectResearch", "fallback_source_key": "additionalProperty", "fallback_parser": from_additional_property("Human Subject Research", "")},
"human_subject_exemptions": {"source_key": "humanSubjectExemption", "fallback_source_key": "additionalProperty", "fallback_parser": from_additional_property("Human Subjects Exemptions", "")},
"deidentified_samples": {"source_key": "deidentified", "fallback_source_key": "additionalProperty", "fallback_parser": from_additional_property("De-identified Samples", "")},
"fda_regulated": {"source_key": "fdaRegulated", "fallback_source_key": "additionalProperty", "fallback_parser": from_additional_property("FDA Regulated", "")},
"irb": {"source_key": "irb", "fallback_source_key": "additionalProperty", "fallback_parser": from_additional_property("IRB", "")},
"deidentified_samples": {"source_key": "deidentified", "parser": _bool_to_yes_no, "fallback_source_key": "additionalProperty", "fallback_parser": from_additional_property("De-identified Samples", "")},
"fda_regulated": {"source_key": "fdaRegulated", "parser": _bool_to_yes_no, "fallback_source_key": "additionalProperty", "fallback_parser": from_additional_property("FDA Regulated", "")},
"irb": {"source_key": "irb", "parser": _irb_passthrough, "fallback_source_key": "additionalProperty", "fallback_parser": from_additional_property("IRB", "")},
"irb_protocol_id": {"source_key": "irbProtocolId", "fallback_source_key": "additionalProperty", "fallback_parser": from_additional_property("IRB Protocol ID", "")},
"data_governance": {"source_key": "dataGovernanceCommittee","fallback_source_key": "additionalProperty", "fallback_parser": from_additional_property("Data Governance Committee")},
"completeness": {"source_key": "completeness", "fallback_source_key": "additionalProperty", "fallback_parser": from_additional_property("Completeness")},
Expand Down
8 changes: 4 additions & 4 deletions fairscape_models/conversion/models/FairscapeDatasheet.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from typing import Any, Dict, List, Optional, Union, Callable, Tuple
from pydantic import BaseModel, Field, ConfigDict
from pydantic import BaseModel, Field, ConfigDict, model_validator

##########################################################################
# --- Main Document Subsections ------------------------------------------
Expand Down Expand Up @@ -40,10 +40,10 @@ class OverviewSection(BaseModel):
human_subject: Optional[str] = None
human_subject_research: Optional[str] = None
human_subject_exemptions: Optional[str] = None
deidentified_samples: Optional[str] = None
fda_regulated: Optional[str] = None
deidentified_samples: Optional[Union[str, bool]] = None
fda_regulated: Optional[Union[str, bool]] = None
confidentiality_level: Optional[str] = None
irb: Optional[str] = None
irb: Optional[Union[str, Dict[str, Any]]] = None
irb_protocol_id: Optional[str] = None

ethical_review: Optional[str] = None
Expand Down
3 changes: 3 additions & 0 deletions fairscape_models/digital_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ class DigitalObject(BaseModel):
contentUrl: Optional[Union[str, List[str]]] = Field(default=None)
isPartOf: Optional[List[IdentifierValue]] = Field(default=[])
usedByComputation: Optional[List[IdentifierValue]] = Field(default=[])
md5: Optional[str] = Field(default=None, description="MD5 checksum of the digital object content")
hash: Optional[str] = Field(default=None, description="Hash of the digital object content (if not MD5)")
sha256: Optional[str] = Field(default=None, description="SHA-256 checksum of the digital object content")
fairscapeVersion: str = __version__

# PROV-O fields (auto-populated)
Expand Down
8 changes: 8 additions & 0 deletions fairscape_models/fairscape_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
MLMODEL_TYPE = "MLModel"
COMPUTATION_TYPE = "Computation"
ANNOTATION_TYPE = "Annotation"
ANNOTATED_COMPUTATION_TYPE = "AnnotatedComputation"
ANNOTATED_EVIDENCE_GRAPH_TYPE = "AnnotatedEvidenceGraph"
ROCRATE_TYPE = "ROCrate"

# TODO get from config
Expand Down Expand Up @@ -51,6 +53,10 @@
"@id": "https://w3id.org/EVI#generated",
"@type": "@id"
},
"annotates": {
"@id": "https://w3id.org/EVI#annotates",
"@type": "@id"
},
"hasDistribution": {
"@id": "https://w3id.org/EVI#hasDistribution",
"@type": "@id"
Expand All @@ -65,6 +71,8 @@ class ClassType(str, Enum):
ANNOTATION = 'Annotation'
SCHEMA = 'Schema'
EVIDENCE_GRAPH = 'EvidenceGraph'
ANNOTATED_COMPUTATION = 'AnnotatedComputation'
ANNOTATED_EVIDENCE_GRAPH = 'AnnotatedEvidenceGraph'
ROCRATE = 'ROCrate' #TODO: Add ROCrate concept to EVI ontology and publish a new version

def normalize_class_type(value: Union[str, ClassType]) -> ClassType:
Expand Down
41 changes: 38 additions & 3 deletions fairscape_models/rocrate.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,35 @@
from fairscape_models.digital_object import DigitalObject
from fairscape_models._version import __version__

class ContactPoint(BaseModel):
    """Structured contact information modelled on Schema.org ContactPoint.

    Undeclared keys are retained (``extra="allow"``), and ``metadataType``
    may be supplied by name or through its ``@type`` alias
    (``populate_by_name=True``).
    """

    model_config = ConfigDict(populate_by_name=True, extra="allow")

    metadataType: str = Field("ContactPoint", alias="@type")
    # All contact details are optional and default to None.
    contactType: Optional[str] = None
    email: Optional[str] = None
    telephone: Optional[str] = None


class PostalAddress(BaseModel):
    """Structured address information modelled on Schema.org PostalAddress.

    Undeclared keys are retained (``extra="allow"``), and ``metadataType``
    may be supplied by name or through its ``@type`` alias
    (``populate_by_name=True``).
    """

    model_config = ConfigDict(populate_by_name=True, extra="allow")

    metadataType: str = Field("PostalAddress", alias="@type")
    # Every address component is optional and defaults to None.
    streetAddress: Optional[str] = None
    addressLocality: Optional[str] = None
    addressRegion: Optional[str] = None
    postalCode: Optional[str] = None
    addressCountry: Optional[str] = None


class IRB(BaseModel):
    """Institutional Review Board with optional contact and address details.

    Undeclared keys are retained (``extra="allow"``), and ``metadataType``
    may be supplied by name or through its ``@type`` alias
    (``populate_by_name=True``).
    """

    model_config = ConfigDict(populate_by_name=True, extra="allow")

    metadataType: str = Field("IRB", alias="@type")
    # The board's name is the only required field.
    name: str
    # Nested structured records; both default to None.
    contactPoint: Optional[ContactPoint] = None
    address: Optional[PostalAddress] = None


class GenericMetadataElem(BaseModel):
"""Generic Metadata Element of an ROCrate"""
guid: str = Field(alias="@id")
Expand Down Expand Up @@ -123,16 +152,20 @@ class ROCrateMetadataElem(BaseModel):
# Compliance / ethics
ethicalReview: Optional[str] = Field(default=None)
confidentialityLevel: Optional[str] = Field(default=None)
irb: Optional[str] = Field(default=None)
irb: Optional[Union[str, IRB]] = Field(default=None)
irbProtocolId: Optional[str] = Field(default=None)
humanSubjectExemption: Optional[str] = Field(default=None)
fdaRegulated: Optional[bool] = Field(default=None)
deidentified: Optional[bool] = Field(default=None)
humanSubjects: Optional[str] = Field(alias="humanSubjects", default=None)
humanSubjectResearch: Optional[str] = Field(default=None)
dataGovernanceCommittee: Optional[str] = Field(default=None)
completeness: Optional[str] = Field(alias="completeness", default=None)
prohibitedUses: Optional[str] = Field(alias="prohibitedUses", default=None)

# Checksums
md5: Optional[str] = Field(default=None, description="MD5 checksum of the digital object content")
hash: Optional[str] = Field(default=None, description="Hash of the digital object content (if not MD5)")
sha256: Optional[str] = Field(default=None, description="SHA-256 checksum of the digital object content")


# RAI fields
rai_data_limitations: Optional[str] = Field(alias="rai:dataLimitations", default=None)
Expand All @@ -154,6 +187,8 @@ class ROCrateMetadataElem(BaseModel):
rai_data_social_impact: Optional[str] = Field(alias="rai:dataSocialImpact", default=None)
rai_annotations_per_item: Optional[str] = Field(alias="rai:annotationsPerItem", default=None)
rai_machine_annotation_tools: Optional[List[str]] = Field(alias="rai:machineAnnotationTools", default=None)
completeness: Optional[str] = Field(alias="completeness", default=None)
prohibitedUses: Optional[str] = Field(alias="prohibitedUses", default=None)

# Aggregated metrics for AI-Ready scoring (roll-up properties from sub-crates)
evi_dataset_count: Optional[int] = Field(alias="evi:datasetCount", default=None)
Expand Down
Loading
Loading