From cddc08f95baf55a03dd4212c70ffcc4158edf57a Mon Sep 17 00:00:00 2001 From: jniestroy Date: Tue, 31 Mar 2026 17:28:54 -0400 Subject: [PATCH] pydantic thing --- pyproject.toml | 4 +- src/fairscape_cli/commands/build_commands.py | 5 ++- src/fairscape_cli/data_fetcher/GenomicData.py | 6 +-- .../rocrate/datasheet_generator.py | 38 +++++++++++++++++++ .../rocrate/summary_generator.py | 35 ++++++++++++++++- .../templates/sections/subcrates.html | 2 + src/fairscape_cli/entailments/find_outputs.py | 5 ++- src/fairscape_cli/entailments/inverse.py | 5 ++- src/fairscape_cli/models/bagit.py | 3 +- src/fairscape_cli/models/rocrate.py | 29 +++++++------- src/fairscape_cli/models/schema/tabular.py | 4 +- .../tracking/provenance_tracker.py | 5 ++- src/fairscape_cli/utils/build_utils.py | 9 +++-- src/fairscape_cli/utils/serialization.py | 31 +++++++++++++++ 14 files changed, 145 insertions(+), 36 deletions(-) create mode 100644 src/fairscape_cli/utils/serialization.py diff --git a/pyproject.toml b/pyproject.toml index 33348af..e8a0bdd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "fairscape-cli" -version = "1.1.17" +version = "1.1.18" description = "A utility for packaging objects and validating metadata for FAIRSCAPE" readme = "README.md" requires-python = ">=3.8" @@ -39,7 +39,7 @@ dependencies = [ "prettytable>=3.9.0", "jsonschema>=4.20.0", "sqids>=0.4.1", - "fairscape-models>=1.0.26", + "fairscape-models>=1.0.28", "pyyaml", "h5py", "frictionless>=5.0,<6.0", diff --git a/src/fairscape_cli/commands/build_commands.py b/src/fairscape_cli/commands/build_commands.py index 3dd0039..117202d 100644 --- a/src/fairscape_cli/commands/build_commands.py +++ b/src/fairscape_cli/commands/build_commands.py @@ -28,6 +28,7 @@ ) from fairscape_models.rocrate import ROCrateV1_2, ROCrateMetadataElem +from fairscape_cli.utils.serialization import prune_none from fairscape_models.conversion.converter import 
ROCToTargetConverter from fairscape_models.conversion.mapping.croissant import MAPPING_CONFIGURATION as CROISSANT_MAPPING @@ -519,7 +520,7 @@ def generate_evidence_graph( # Write the updated metadata back to the file with open(metadata_file, 'w') as f: - json.dump(metadata, f, indent=2) + json.dump(prune_none(metadata), f, indent=2) click.echo(f"Added hasEvidenceGraph reference to {ark_id} in RO-Crate metadata") except Exception as e: @@ -758,4 +759,4 @@ def validate_merkle_command(ctx, rocrate_path: pathlib.Path, release: bool): elif s != c: click.echo(f" CHANGED: {url}") - ctx.exit(1) \ No newline at end of file + ctx.exit(1) diff --git a/src/fairscape_cli/data_fetcher/GenomicData.py b/src/fairscape_cli/data_fetcher/GenomicData.py index f50d807..c151195 100644 --- a/src/fairscape_cli/data_fetcher/GenomicData.py +++ b/src/fairscape_cli/data_fetcher/GenomicData.py @@ -103,6 +103,7 @@ class Outputs(BaseModel): from fairscape_cli.data_fetcher.bioproject_fetcher import fetch_bioproject_data from fairscape_cli.models.rocrate import GenerateROCrate, AppendCrate +from fairscape_cli.utils.serialization import prune_none from fairscape_cli.models.dataset import GenerateDataset from fairscape_cli.models.experiment import GenerateExperiment from fairscape_cli.models.instrument import GenerateInstrument @@ -342,7 +343,7 @@ def to_rocrate( root_dataset_node["hasPart"].append({"@id": entity_id}) f.seek(0) - json.dump(crate_json, f, indent=2) + json.dump(prune_none(crate_json), f, indent=2) f.truncate() @@ -360,7 +361,7 @@ def to_rocrate( if updated: f.seek(0) - json.dump(crate_json_final, f, indent=2) + json.dump(prune_none(crate_json_final), f, indent=2) f.truncate() @@ -500,4 +501,3 @@ def from_json(cls, data: dict) -> 'GenomicData': experiments=Experiments(items=internal_experiments), outputs=Outputs(items=outputs_list) ) - diff --git a/src/fairscape_cli/datasheet_builder/rocrate/datasheet_generator.py b/src/fairscape_cli/datasheet_builder/rocrate/datasheet_generator.py 
index 5e453be..7b72862 100644 --- a/src/fairscape_cli/datasheet_builder/rocrate/datasheet_generator.py +++ b/src/fairscape_cli/datasheet_builder/rocrate/datasheet_generator.py @@ -159,6 +159,8 @@ def convert_main_sections(self) -> FairscapeDatasheet: distribution = distribution_converter.convert() subcrate_items = self._process_all_subcrates() + if not subcrate_items: + subcrate_items = self._build_single_crate_composition() composition = CompositionSection(items=subcrate_items) if subcrate_items else None return FairscapeDatasheet( @@ -226,6 +228,42 @@ def _process_all_subcrates(self) -> List[SubCrateItem]: return subcrate_items + def _build_single_crate_composition(self) -> List[SubCrateItem]: + """When no subcrates exist, treat the main crate itself as a single subcrate.""" + try: + converter = ROCToTargetConverter( + source_crate=self.main_crate, + mapping_configuration=SUBCRATE_MAPPING_CONFIGURATION, + global_index=self.global_metadata_index + ) + subcrate_item = converter.convert() + subcrate_item.published = self.published + subcrate_item.preview_url = "" + + if not subcrate_item.size and self.base_dir.exists(): + try: + dir_size = get_directory_size(str(self.base_dir)) + subcrate_item.size = format_size(dir_size) + except Exception: + subcrate_item.size = "Unknown" + + main_root = self.main_crate.metadataGraph[1].model_dump() + if not subcrate_item.doi: + subcrate_item.doi = main_root.get('identifier') + if not subcrate_item.related_publications: + pubs = main_root.get('associatedPublication', []) + if pubs: + subcrate_item.related_publications = pubs if isinstance(pubs, list) else [pubs] + + self._enhance_subcrate_item(subcrate_item, self.main_crate) + + return [subcrate_item] + except Exception as e: + print(f"Error building single crate composition: {e}") + import traceback + traceback.print_exc() + return [] + def _enhance_subcrate_item(self, subcrate_item: SubCrateItem, subcrate: ROCrateV1_2): """Add statistical summary info to subcrate item if 
present.""" root_dict = subcrate.metadataGraph[1].model_dump() if len(subcrate.metadataGraph) > 1 else {} diff --git a/src/fairscape_cli/datasheet_builder/rocrate/summary_generator.py b/src/fairscape_cli/datasheet_builder/rocrate/summary_generator.py index 71ebaa0..0a90c1e 100644 --- a/src/fairscape_cli/datasheet_builder/rocrate/summary_generator.py +++ b/src/fairscape_cli/datasheet_builder/rocrate/summary_generator.py @@ -11,6 +11,7 @@ from fairscape_models.rocrate import ROCrateV1_2 from fairscape_models.conversion.mapping.AIReady import score_rocrate from fairscape_models.conversion.models.AIReady import AIReadyScore +from fairscape_cli.utils.serialization import model_dump_pruned @dataclass @@ -89,7 +90,7 @@ def extract_summary_data(self, crate: ROCrateV1_2) -> SummaryData: formats = [] formats = [f for f in formats if f and f != "unknown"] - return SummaryData( + summary = SummaryData( name=root_data.get("name", "Unnamed Dataset"), description=root_data.get("description", ""), total_size_formatted=size_str, @@ -100,6 +101,36 @@ def extract_summary_data(self, crate: ROCrateV1_2) -> SummaryData: formats=formats ) + # Fallback: compute from graph if evi:* fields are absent (single crate case) + if summary.total_entities == 0: + formats_set = set() + for item in crate.metadataGraph: + if item.guid == "ro-crate-metadata.json": + continue + item_dict = item.model_dump(by_alias=True) + item_type = item_dict.get("@type", "") + type_str = " ".join(item_type) if isinstance(item_type, list) else str(item_type) + + if "ROCrate" in type_str or "CreativeWork" in type_str: + continue + + summary.total_entities += 1 + + if "Dataset" in type_str: + summary.dataset_count += 1 + fmt = item_dict.get("fileFormat") + if fmt and fmt != "unknown": + formats_set.add(fmt) + elif "Software" in type_str or "SoftwareSourceCode" in type_str: + summary.software_count += 1 + elif "Computation" in type_str: + summary.computation_count += 1 + + if not summary.formats and formats_set: + 
summary.formats = sorted(formats_set) + + return summary + @staticmethod def _format_size(size_bytes: int) -> str: """Format bytes to human-readable size.""" @@ -160,7 +191,7 @@ def compute_aiready_score(self, crate: ROCrateV1_2) -> Tuple[AIReadyScoreData, A def save_aiready_score(self, raw_score: AIReadyScore, output_path: Path) -> None: """Save the AI-Ready score to a JSON file.""" - score_dict = raw_score.model_dump() + score_dict = model_dump_pruned(raw_score) with open(output_path, 'w') as f: json.dump(score_dict, f, indent=2) diff --git a/src/fairscape_cli/datasheet_builder/templates/sections/subcrates.html b/src/fairscape_cli/datasheet_builder/templates/sections/subcrates.html index 9700810..ccf4fde 100644 --- a/src/fairscape_cli/datasheet_builder/templates/sections/subcrates.html +++ b/src/fairscape_cli/datasheet_builder/templates/sections/subcrates.html @@ -297,9 +297,11 @@

Content Summary

+ {% if subcrate.preview_url %} + {% endif %} {% endfor %} {% else %}

No subcrates found.

diff --git a/src/fairscape_cli/entailments/find_outputs.py b/src/fairscape_cli/entailments/find_outputs.py index 903c679..f64e4c4 100644 --- a/src/fairscape_cli/entailments/find_outputs.py +++ b/src/fairscape_cli/entailments/find_outputs.py @@ -1,6 +1,7 @@ import pathlib import json from typing import List, Dict, Tuple, Set, Any +from fairscape_cli.utils.serialization import prune_none def extract_datasets_from_graph(graph: List[Dict]) -> List[Tuple[str, bool]]: """ @@ -169,7 +170,7 @@ def add_inputs_outputs_to_rocrate(rocrate_path: pathlib.Path) -> Tuple[bool, str metadata["@graph"] = graph with open(metadata_path, 'w') as f: - json.dump(metadata, f, indent=2) + json.dump(prune_none(metadata), f, indent=2) input_count = len(inputs) output_count = len(outputs) @@ -179,4 +180,4 @@ def add_inputs_outputs_to_rocrate(rocrate_path: pathlib.Path) -> Tuple[bool, str except json.JSONDecodeError as e: return False, f"Error parsing JSON: {e}" except Exception as e: - return False, f"Unexpected error: {e}" \ No newline at end of file + return False, f"Unexpected error: {e}" diff --git a/src/fairscape_cli/entailments/inverse.py b/src/fairscape_cli/entailments/inverse.py index 72960b4..9e23a5c 100644 --- a/src/fairscape_cli/entailments/inverse.py +++ b/src/fairscape_cli/entailments/inverse.py @@ -1,5 +1,6 @@ import pathlib import json +from fairscape_cli.utils.serialization import prune_none from typing import List, Tuple, Dict, Any from rdflib import Graph, URIRef from rdflib.namespace import OWL @@ -198,7 +199,7 @@ def augment_rocrate_with_inverses( if modified_count > 0: try: with open(metadata_file_path, 'w') as f: - json.dump(json_data, f, indent=2, ensure_ascii=False) + json.dump(prune_none(json_data), f, indent=2, ensure_ascii=False) print(f"RO-Crate '{metadata_file_path}' augmented with inverse properties. 
{modified_count} modifications made.") except Exception as e: print(f"Error saving augmented RO-Crate JSON to {metadata_file_path}: {e}") @@ -206,4 +207,4 @@ def augment_rocrate_with_inverses( else: print(f"No inverse properties needed to be added or RO-Crate '{metadata_file_path}' is already consistent.") - return True \ No newline at end of file + return True diff --git a/src/fairscape_cli/models/bagit.py b/src/fairscape_cli/models/bagit.py index 34b75c1..b7cf829 100644 --- a/src/fairscape_cli/models/bagit.py +++ b/src/fairscape_cli/models/bagit.py @@ -10,6 +10,7 @@ from typing import ( Optional ) +from fairscape_cli.utils.serialization import model_dump_pruned class BagIt(BaseModel): @@ -94,7 +95,7 @@ def create_bagit_metadata(self): bagit_info_path = self.bagit_path / 'bag-info.txt' with bagit_info_path.open(mode="w") as bag_info_file: - for key, value in self.model_dump(by_alias=True).items(): + for key, value in model_dump_pruned(self, by_alias=True).items(): if key != 'bagit_path' and key != 'rocrate_path': bag_info_file.write('%s: %s\n' % (key, value)) diff --git a/src/fairscape_cli/models/rocrate.py b/src/fairscape_cli/models/rocrate.py index fae0d41..18acc02 100644 --- a/src/fairscape_cli/models/rocrate.py +++ b/src/fairscape_cli/models/rocrate.py @@ -22,6 +22,7 @@ from fairscape_cli.config import NAAN, DEFAULT_CONTEXT from fairscape_cli.models.guid_utils import GenerateDatetimeSquid, clean_guid from fairscape_models.rocrate import ROCrateV1_2, ROCrateMetadataElem, ROCrateMetadataFileElem +from fairscape_cli.utils.serialization import prune_none, model_dump_pruned def GenerateROCrate( path: pathlib.Path, @@ -80,7 +81,7 @@ def GenerateROCrate( ]} ) - rocrate_dict = rocrate.model_dump(by_alias=True, exclude_none=True) + rocrate_dict = model_dump_pruned(rocrate, by_alias=True) if 'ro-crate-metadata.json' in str(path): roCrateMetadataPath = path @@ -94,7 +95,7 @@ def GenerateROCrate( with roCrateMetadataPath.open(mode="w") as metadataFile: 
json.dump(rocrate_dict, metadataFile, indent=2) - return root_dataset.model_dump(by_alias=True, exclude_none=True) + return model_dump_pruned(root_dataset, by_alias=True) class ROCrate(ROCrateMetadataElem): model_config = ConfigDict(populate_by_name=True) @@ -213,7 +214,7 @@ def create_subcrate( f.seek(0) f.truncate() - json.dump(rocrate.model_dump(by_alias=True), f, indent=2) + json.dump(model_dump_pruned(rocrate, by_alias=True), f, indent=2) return subcrate['@id'] @@ -269,7 +270,7 @@ def initCrate(self): # Write to file with ro_crate_metadata_path.open(mode="w") as metadata_file: - json.dump(rocrate_metadata, metadata_file, indent=2) + json.dump(prune_none(rocrate_metadata), metadata_file, indent=2) def registerObject(self, model: Union[Dataset, Software, Computation]): """Add metadata to the graph of an ROCrate""" @@ -279,7 +280,7 @@ def registerObject(self, model: Union[Dataset, Software, Computation]): rocrate_metadata = json.load(rocrate_metadata_file) # Add to the @graph - model_data = model.model_dump(by_alias=True, exclude_none=True) + model_data = model_dump_pruned(model, by_alias=True) rocrate_metadata['@graph'].append(model_data) # Add reference to root dataset's hasPart @@ -294,7 +295,7 @@ def registerObject(self, model: Union[Dataset, Software, Computation]): # Write back to file rocrate_metadata_file.seek(0) rocrate_metadata_file.truncate() - json.dump(rocrate_metadata, rocrate_metadata_file, indent=2) + json.dump(prune_none(rocrate_metadata), rocrate_metadata_file, indent=2) def registerDataset(self, dataset: Dataset): self.registerObject(dataset) @@ -336,7 +337,7 @@ def AppendCrate( root_dataset['hasPart'] = [] for element in elements: - element_data = element.model_dump(by_alias=True, exclude_none=True) + element_data = model_dump_pruned(element, by_alias=True) rocrate_metadata['@graph'].append(element_data) root_dataset['hasPart'].append({"@id": element_data["@id"]}) @@ -346,7 +347,7 @@ def AppendCrate( # Write back to file 
rocrate_metadata_file.seek(0) rocrate_metadata_file.truncate() - json.dump(rocrate_metadata, rocrate_metadata_file, indent=2) + json.dump(prune_none(rocrate_metadata), rocrate_metadata_file, indent=2) def CopyToROCrate(source_filepath: str, destination_filepath: str): @@ -385,7 +386,7 @@ def UpdateCrate( rocrate_metadata = json.load(rocrate_metadata_file) # Find and replace the element with matching @id - element_data = element.model_dump(by_alias=True, exclude_none=True) + element_data = model_dump_pruned(element, by_alias=True) for i, existing in enumerate(rocrate_metadata['@graph']): if existing.get('@id') == element_data['@id']: rocrate_metadata['@graph'][i] = element_data @@ -397,7 +398,7 @@ def UpdateCrate( # Write back to file rocrate_metadata_file.seek(0) rocrate_metadata_file.truncate() - json.dump(rocrate_metadata, rocrate_metadata_file, indent=2) + json.dump(prune_none(rocrate_metadata), rocrate_metadata_file, indent=2) def LinkSubcrates(parent_crate_path: pathlib.Path) -> List[str]: parent_metadata_file = parent_crate_path / 'ro-crate-metadata.json' @@ -508,7 +509,7 @@ def find_and_process_subcrates(directory: pathlib.Path, base_path: pathlib.Path) if modified: subcrate_metadata['@graph'][subcrate_root_index] = subcrate_root with subcrate_metadata_file.open('w') as f: - json.dump(subcrate_metadata, f, indent=2) + json.dump(prune_none(subcrate_metadata), f, indent=2) reference_dict = dict(subcrate_root) relative_path = (subcrate_metadata_file.relative_to(base_path)).as_posix() @@ -535,7 +536,7 @@ def find_and_process_subcrates(directory: pathlib.Path, base_path: pathlib.Path) parent_root_dataset['hasPart'].append({'@id': sub_id}) with parent_metadata_file.open('w') as f: - json.dump(parent_metadata, f, indent=2) + json.dump(prune_none(parent_metadata), f, indent=2) else: print("No valid sub-crates found to link.") @@ -881,11 +882,11 @@ def UpdateEntitiesInGraph( return False, f"RO-Crate became invalid after update operations. 
Details: {e}" with metadata_filepath.open(mode="w") as metadataFile: - json.dump(validated_crate.model_dump(by_alias=True), metadataFile, indent=2, ensure_ascii=False) + json.dump(model_dump_pruned(validated_crate, by_alias=True), metadataFile, indent=2, ensure_ascii=False) return True, f"Successfully processed entities. Matched: {matched_count}, Modified: {modified_count}." except Exception as e: import traceback print(f"DEBUG: Unexpected error in UpdateEntitiesInGraph: {traceback.format_exc()}") - return False, f"An unexpected error occurred: {type(e).__name__} - {e}" \ No newline at end of file + return False, f"An unexpected error occurred: {type(e).__name__} - {e}" diff --git a/src/fairscape_cli/models/schema/tabular.py b/src/fairscape_cli/models/schema/tabular.py index a902dd0..f34d5f4 100644 --- a/src/fairscape_cli/models/schema/tabular.py +++ b/src/fairscape_cli/models/schema/tabular.py @@ -33,6 +33,7 @@ DEFAULT_SCHEMA_TYPE, NAAN, ) +from fairscape_cli.utils.serialization import model_dump_pruned class FileType(str, Enum): CSV = "csv" @@ -514,7 +515,7 @@ def from_dict(cls, data: dict) -> 'HDF5ValidationSchema': def write_schema(schema: TabularValidationSchema, output_file: str): """Write a schema to a file""" - schema_dict = schema.to_dict() + schema_dict = model_dump_pruned(schema, by_alias=True) with open(output_file, 'w') as f: json.dump(schema_dict, f, indent=2) @@ -579,4 +580,3 @@ def ReadSchemaLocal(schemaFile: str) -> TabularValidationSchema: # load the model into tabularSchema = TabularValidationSchema.model_validate(schemaJson) return tabularSchema - diff --git a/src/fairscape_cli/tracking/provenance_tracker.py b/src/fairscape_cli/tracking/provenance_tracker.py index 590a406..067e75a 100644 --- a/src/fairscape_cli/tracking/provenance_tracker.py +++ b/src/fairscape_cli/tracking/provenance_tracker.py @@ -17,6 +17,7 @@ from .utils import collect_dataset_samples, format_samples_for_prompt from fairscape_cli.models.rocrate import GenerateROCrate +from 
fairscape_cli.utils.serialization import prune_none from datetime import datetime @@ -89,7 +90,7 @@ def _clear_graph(self): root_dataset['hasPart'] = [] with metadata_path.open('w') as f: - json.dump(crate_data, f, indent=2) + json.dump(prune_none(crate_data), f, indent=2) print(f"Cleared existing @graph entries from {metadata_path}") @@ -448,4 +449,4 @@ def track_execution( output_count=len(output_datasets), reused_count=reused_count, new_datasets=len(new_datasets) - ) \ No newline at end of file + ) diff --git a/src/fairscape_cli/utils/build_utils.py b/src/fairscape_cli/utils/build_utils.py index 7798715..a509042 100644 --- a/src/fairscape_cli/utils/build_utils.py +++ b/src/fairscape_cli/utils/build_utils.py @@ -3,6 +3,7 @@ from pathlib import Path from typing import Optional, List, Dict, Any, Tuple import click +from fairscape_cli.utils.serialization import prune_none def find_subcrates(release_directory: Path) -> List[Path]: subcrates = [] @@ -141,7 +142,7 @@ def process_evidence_graph(subcrate_path: Path, release_directory: Optional[Path } with open(metadata_file, 'w') as f: - json.dump(metadata, f, indent=2) + json.dump(prune_none(metadata), f, indent=2) return True @@ -266,7 +267,7 @@ def process_merkle_tree(crate_path: Path) -> bool: if len(graph) > 1: graph[1]['evi:merkleRootHash'] = tree['rootHash'] with open(metadata_file, 'w') as f: - json.dump(metadata, f, indent=2) + json.dump(prune_none(metadata), f, indent=2) return True except Exception as e: @@ -297,7 +298,7 @@ def process_release_merkle_tree(release_directory: Path) -> bool: if len(graph) > 1: graph[1]['evi:merkleRootHash'] = tree['rootHash'] with open(metadata_file, 'w') as f: - json.dump(metadata, f, indent=2) + json.dump(prune_none(metadata), f, indent=2) return True except Exception as e: @@ -463,4 +464,4 @@ def process_all_subcrates(release_directory: Path, published: bool = False) -> D for error in results['errors']: click.echo(f" - {error}") - return results \ No newline at end of file + 
return results
diff --git a/src/fairscape_cli/utils/serialization.py b/src/fairscape_cli/utils/serialization.py
new file mode 100644
index 0000000..341fba4
--- /dev/null
+++ b/src/fairscape_cli/utils/serialization.py
@@ -0,0 +1,39 @@
+from __future__ import annotations
+
+from typing import Any
+
+from pydantic import BaseModel
+
+
+def prune_none(value: Any) -> Any:
+    """Return a copy of ``value`` with every ``None`` entry removed, recursively.
+
+    A ``BaseModel`` is first dumped with ``by_alias=True``. Dict entries whose
+    value is ``None`` and ``None`` items of lists/tuples are dropped at every
+    depth; tuples come back as lists. Empty containers and falsy non-``None``
+    values (``0``, ``""``, ``False``) are kept. Anything else is returned
+    unchanged.
+    """
+    if isinstance(value, BaseModel):
+        value = value.model_dump(by_alias=True)
+
+    if isinstance(value, dict):
+        return {
+            key: prune_none(item)
+            for key, item in value.items()
+            if item is not None
+        }
+
+    # Lists and tuples are pruned identically and both yield a list.
+    if isinstance(value, (list, tuple)):
+        return [prune_none(item) for item in value if item is not None]
+
+    return value
+
+
+def model_dump_pruned(model: BaseModel, **kwargs: Any) -> dict[str, Any]:
+    """Dump ``model`` via ``model_dump(**kwargs)`` and strip ``None`` values.
+
+    Keyword arguments are forwarded unchanged to ``model.model_dump``.
+    """
+    return prune_none(model.model_dump(**kwargs))