"
+ )
+
+
+# ---------------------------------------------------------------------------
+# Schema rendering
+# ---------------------------------------------------------------------------
+
+
def _render_schema(schema: dict | list) -> str:
    """Render *schema* as a collapsible HTML ``<details>`` block.

    Accepts either a mapping (``{column: dtype}`` or a structural dict such
    as ``{"variables": [...], ...}``), a list of dicts (frictionless-style
    field descriptors), or a plain list of names. Empty input renders as "".

    NOTE(review): the exact markup (tags, CSS classes) was lost when this
    patch was extracted; the structure below is reconstructed from the
    surviving text ("Field" / "Type / Value" headers, the details/summary
    wording, the ``open_attr`` logic) — confirm against the original output.
    """
    if not schema:
        return ""

    if isinstance(schema, dict):
        # Tabular-style {col: dtype} or structural {"variables": [...], ...}
        rows = "".join(
            f"<tr><td>{_esc(k)}</td><td>{_esc(v)}</td></tr>"
            for k, v in schema.items()
        )
        table = (
            '<table class="projspec-schema">'
            "<thead><tr><th>Field</th><th>Type / Value</th></tr></thead>"
            f"<tbody>{rows}</tbody>"
            "</table>"
        )
        n = len(schema)
        # Auto-expand small schemas; collapse big ones behind the summary.
        open_attr = "open" if n <= 8 else ""
        return (
            f"<details {open_attr}>"
            f'<summary>Schema ({n} {"field" if n == 1 else "fields"})</summary>'
            f"{table}</details>"
        )

    if isinstance(schema, list):
        # List-of-dicts (frictionless style) or plain list
        if schema and isinstance(schema[0], dict):
            # Render each dict as a row; use union of all keys (first-seen
            # order) as columns.
            all_keys: list[str] = []
            for item in schema:
                for k in item:
                    if k not in all_keys:
                        all_keys.append(k)
            header_row = "".join(f"<th>{_esc(k)}</th>" for k in all_keys)
            body_rows = ""
            for item in schema:
                cells = "".join(
                    f"<td>{_esc(item.get(k, ''))}</td>" for k in all_keys
                )
                body_rows += f"<tr>{cells}</tr>"
            table = (
                '<table class="projspec-schema">'
                f"<thead><tr>{header_row}</tr></thead>"
                f"<tbody>{body_rows}</tbody>"
                "</table>"
            )
        else:
            items_html = "".join(f"<li>{_esc(s)}</li>" for s in schema)
            table = f"<ul>{items_html}</ul>"

        n = len(schema)
        open_attr = "open" if n <= 8 else ""
        return (
            f"<details {open_attr}>"
            f'<summary>Schema ({n} {"field" if n == 1 else "fields"})</summary>'
            f"{table}</details>"
        )

    return ""
+
+
+# ---------------------------------------------------------------------------
+# Preview builders — one function per modality family, all return HTML str
+# or None when no loader is available.
+# ---------------------------------------------------------------------------
+
#: How many rows to show in tabular previews.
#: Shared by every tabular/array preview helper so all formats truncate alike.
_PREVIEW_ROWS = 5
+
+
+def _obj_to_preview_html(obj) -> str:
+ """Return the richest HTML string available for *obj*.
+
+ Tries ``_repr_html_()`` first (pandas DataFrame, polars DataFrame, xarray
+ Dataset, …), then falls back to ``__repr__``. The result is always
+ wrapped in a ``
`` so callers can rely on valid HTML.
+ """
+ if hasattr(obj, "_repr_html_"):
+ try:
+ h = obj._repr_html_()
+ if h:
+ return f'
{h}
'
+ except Exception:
+ pass
+ return f'
{_esc(repr(obj))}
'
+
+
+def _build_preview(dr: "DataResource") -> str | None:
+ """Return an HTML preview fragment, or None if not possible."""
+ fmt = dr.format
+ modality = dr.modality
+ sample = dr.sample_path if dr.sample_path else None
+
+ if sample is None:
+ return None
+
+ if modality == "tabular":
+ return _preview_tabular(dr, sample)
+ if modality == "image":
+ return _preview_image(dr, sample)
+ if modality == "array":
+ return _preview_array(dr, sample)
+ if modality == "timeseries" and fmt in ("wav", "flac", "mp3", "ogg"):
+ return _preview_audio(dr, sample)
+ return None
+
+
+# --- tabular ---
+
+
+def _preview_tabular(dr: "DataResource", path: str) -> str | None:
+ fmt = dr.format
+ fs = dr.proj.fs
+
+ try:
+ if fmt == "parquet":
+ return _preview_parquet(fs, path)
+ if fmt == "csv":
+ return _preview_csv(fs, path)
+ if fmt in ("tsv", "psv"):
+ sep = "\t" if fmt == "tsv" else "|"
+ return _preview_csv(fs, path, sep=sep)
+ if fmt == "arrow":
+ return _preview_arrow(fs, path)
+ if fmt == "jsonlines":
+ return _preview_jsonlines(fs, path)
+ if fmt == "excel":
+ return _preview_excel(fs, path)
+ if fmt in ("sqlite", "duckdb"):
+ return _preview_sql(fs, path, fmt)
+ if fmt == "orc":
+ return _preview_orc(fs, path)
+ except Exception:
+ pass
+ return None
+
+
def _preview_parquet(fs, path: str) -> str | None:
    """Preview a parquet file by reading only its first row group.

    Tries pyarrow first, then polars; returns None when neither is
    installed. Neither path scans the whole file.
    """
    try:
        import pyarrow.parquet as pq

        with fs.open(path, "rb") as handle:
            # read_row_group reads one row group's pages, not the whole file
            head = pq.ParquetFile(handle).read_row_group(0)
            if head.num_rows > _PREVIEW_ROWS:
                head = head.slice(0, _PREVIEW_ROWS)
            # Convert to pandas so we get _repr_html_() for free
            frame = head.to_pandas()
        return _obj_to_preview_html(frame)
    except ImportError:
        pass
    try:
        # polars can read a row-count-limited slice without decoding the rest
        import polars as pl

        with fs.open(path, "rb") as handle:
            frame = pl.read_parquet(handle, n_rows=_PREVIEW_ROWS)
        return _obj_to_preview_html(frame)
    except ImportError:
        pass
    return None
+
+
def _preview_csv(fs, path: str, sep: str = ",") -> str | None:
    """Preview the first _PREVIEW_ROWS rows of a delimited text file.

    pandas' ``nrows=`` (or polars' ``n_rows=``) stops parsing after N data
    lines, keeping I/O minimal. Returns None when neither library exists.
    """
    try:
        import pandas as pd

        with fs.open(path, "r", encoding="utf-8", errors="replace") as handle:
            frame = pd.read_csv(handle, sep=sep, nrows=_PREVIEW_ROWS)
        return _obj_to_preview_html(frame)
    except ImportError:
        pass
    try:
        import polars as pl

        with fs.open(path, "rb") as handle:
            frame = pl.read_csv(handle, n_rows=_PREVIEW_ROWS, separator=sep)
        return _obj_to_preview_html(frame)
    except ImportError:
        pass
    return None
+
+
def _preview_arrow(fs, path: str) -> str | None:
    """Preview an Arrow IPC file via its first record batch only.

    Handles both the random-access *file* format and, as a fallback after a
    rewind, the sequential *stream* format. Returns None without pyarrow.
    """
    try:
        import pyarrow.ipc as ipc

        with fs.open(path, "rb") as handle:
            try:
                # IPC file format: random-access; read just batch 0
                batch = ipc.open_file(handle).get_batch(0)
            except Exception:
                handle.seek(0)
                # IPC stream format: sequential; read just the first batch
                batch = ipc.open_stream(handle).read_next_batch()
            if batch.num_rows > _PREVIEW_ROWS:
                batch = batch.slice(0, _PREVIEW_ROWS)
            frame = batch.to_pandas()
        return _obj_to_preview_html(frame)
    except ImportError:
        pass
    return None
+
+
def _preview_jsonlines(fs, path: str) -> str | None:
    """Preview the first _PREVIEW_ROWS records of a JSON-lines file."""
    try:
        import pandas as pd

        # nrows= makes pandas stop reading after N lines
        with fs.open(path, "r", encoding="utf-8", errors="replace") as handle:
            frame = pd.read_json(handle, lines=True, nrows=_PREVIEW_ROWS)
        return _obj_to_preview_html(frame)
    except ImportError:
        return None
+
+
def _preview_excel(fs, path: str) -> str | None:
    """Preview the first _PREVIEW_ROWS rows of the first sheet.

    Requires pandas plus an Excel engine (e.g. openpyxl); a missing
    library surfaces as ImportError and yields None.
    """
    try:
        import pandas as pd

        # nrows= limits how many rows pandas materialises from the sheet
        with fs.open(path, "rb") as handle:
            frame = pd.read_excel(handle, nrows=_PREVIEW_ROWS)
        return _obj_to_preview_html(frame)
    except ImportError:
        return None
+
+
+def _preview_sql(fs, path: str, fmt: str) -> str | None:
+ # SQLite/DuckDB: only works with a local path (not a remote FS)
+ try:
+ if getattr(fs, "protocol", "file") not in ("file", "local", ""):
+ return None
+ if fmt == "duckdb":
+ try:
+ import duckdb
+
+ con = duckdb.connect(path, read_only=True)
+ tables = con.execute("SHOW TABLES").fetchall()
+ if not tables:
+ return None
+ tname = tables[0][0]
+ df = con.execute(
+ f'SELECT * FROM "{tname}" LIMIT {_PREVIEW_ROWS}'
+ ).fetchdf()
+ return _obj_to_preview_html(df)
+ except ImportError:
+ pass
+ else:
+ import sqlite3
+ import pandas as pd
+
+ con = sqlite3.connect(path)
+ cur = con.cursor()
+ cur.execute("SELECT name FROM sqlite_master WHERE type='table'")
+ tables = cur.fetchall()
+ if not tables:
+ return None
+ tname = tables[0][0]
+ df = pd.read_sql(f'SELECT * FROM "{tname}" LIMIT {_PREVIEW_ROWS}', con)
+ return _obj_to_preview_html(df)
+ except Exception:
+ pass
+ return None
+
+
def _preview_orc(fs, path: str) -> str | None:
    """Preview the first _PREVIEW_ROWS rows of an ORC file via pyarrow."""
    try:
        import pyarrow.orc as orc

        with fs.open(path, "rb") as handle:
            head = orc.ORCFile(handle).read().slice(0, _PREVIEW_ROWS)
            frame = head.to_pandas()
        return _obj_to_preview_html(frame)
    except ImportError:
        return None
+
+
+# --- image ---
+
+
+def _preview_image(dr: "DataResource", path: str) -> str | None:
+ try:
+ from PIL import Image
+ import io
+
+ fs = dr.proj.fs
+ with fs.open(path, "rb") as fh:
+ raw: bytes = fh.read()
+
+ img = Image.open(io.BytesIO(raw))
+ img.thumbnail((600, 200))
+
+ buf = io.BytesIO()
+ # Save as PNG for lossless display regardless of source format
+ rgb = img.convert("RGB") if img.mode not in ("RGB", "L", "RGBA") else img
+ rgb.save(buf, format="PNG")
+ b64 = base64.b64encode(buf.getvalue()).decode("ascii")
+
+ w, h = img.size
+ schema = dr.schema if isinstance(dr.schema, dict) else {}
+ info = f"{schema.get('width', w)}×{schema.get('height', h)}"
+ if "mode" in schema:
+ info += f", mode={schema['mode']}"
+
+ return (
+ f'
'
+
+
def _preview_numpy(fs, path: str) -> str | None:
    """Describe a ``.npy`` file from its header; load data only when tiny.

    Fixes over the previous version: the header is parsed directly from the
    open stream instead of a fixed 512-byte prefix (headers for large
    structured dtypes can exceed 512 bytes), and the header parser is chosen
    from the magic version (2.0 headers allow >64 KiB and were previously
    mis-parsed as 1.0).

    The full array is loaded only when it fits a 1 MB heuristic, and only
    its first ``_PREVIEW_ROWS`` rows are rendered. Returns None on any
    failure.
    """
    try:
        import io

        import numpy as np
        import numpy.lib.format as nf

        with fs.open(path, "rb") as fh:
            version = nf.read_magic(fh)
            # Dispatch on the header version written in the magic bytes.
            if version >= (2, 0):
                shape, _, dtype = nf.read_array_header_2_0(fh)
            else:
                shape, _, dtype = nf.read_array_header_1_0(fh)

        info: dict = {"shape": str(shape), "dtype": str(dtype)}

        # Load the full array only when it's small enough (≤ 1 MB heuristic).
        try:
            total_elements = 1
            for dim in shape:
                total_elements *= dim
            if total_elements * np.dtype(dtype).itemsize <= 1_048_576:
                with fs.open(path, "rb") as fh:
                    arr = np.load(io.BytesIO(fh.read()), allow_pickle=False)
                sliced = arr[:_PREVIEW_ROWS] if arr.ndim >= 1 else arr
                info["preview"] = repr(sliced)
        except Exception:
            pass  # the data preview is optional; keep the header info

        return _array_info_html(info)
    except Exception:
        pass
    return None
+
+
def _preview_hdf5(fs, path: str) -> str | None:
    """Summarise an HDF5 file from metadata only — no array data loaded.

    Lists up to 8 top-level keys and describes the first 3 (dataset shape/
    dtype, or group member count). Returns None when h5py is unavailable.
    """
    try:
        import h5py

        with fs.open(path, "rb") as fh, h5py.File(fh, "r") as f:
            keys = list(f.keys())[:8]
            details: dict = {"top-level keys": ", ".join(keys) or "(none)"}
            for key in keys[:3]:
                node = f[key]
                if hasattr(node, "shape"):
                    details[key] = f"shape={node.shape}, dtype={node.dtype}"
                else:
                    details[key] = f"group ({len(node)} members)"
            return _array_info_html(details)
    except ImportError:
        return None
+
+
def _preview_netcdf(fs, path: str) -> str | None:
    """Render a NetCDF dataset's rich repr without decoding array data.

    NOTE(review): the repr is produced while the file handle is still open,
    since xarray's scipy engine reads lazily from the handle.
    """
    try:
        import xarray as xr

        with fs.open(path, "rb") as fh:
            # engine="scipy" opens lazily; no array payload is decoded here
            ds = xr.open_dataset(fh, engine="scipy")
            # xarray Dataset has a rich _repr_html_()
            return _obj_to_preview_html(ds)
    except ImportError:
        return None
+
+
+def _preview_zarr(dr: "DataResource") -> str | None:
+ """Use the schema cached at parse time — zero extra I/O."""
+ schema = dr.schema
+ if not schema or not isinstance(schema, dict):
+ return None
+ info = {}
+ if "arrays" in schema:
+ info["arrays"] = ", ".join(str(a) for a in schema["arrays"][:8]) or "(none)"
+ if "groups" in schema:
+ info["groups"] = ", ".join(str(g) for g in schema["groups"][:8]) or "(none)"
+ if "attrs" in schema:
+ info["attrs"] = str(dict(list(schema["attrs"].items())[:4]))
+ return _array_info_html(info) if info else None
+
+
+# --- audio ---
+
+
def _preview_audio(dr: "DataResource", path: str) -> str | None:
    """Describe an audio file from its header only — no samples loaded.

    Returns None when the soundfile library is unavailable.
    """
    try:
        import soundfile as sf

        with dr.proj.fs.open(path, "rb") as fh:
            meta = sf.info(fh)
        details = {
            "sample rate": f"{meta.samplerate} Hz",
            "channels": str(meta.channels),
            "duration": f"{meta.frames / meta.samplerate:.2f} s",
            "format": meta.format,
            "subtype": meta.subtype,
        }
        return _array_info_html(details)
    except ImportError:
        return None
diff --git a/src/projspec/content/environment.py b/src/projspec/content/environment.py
index 1b727a4..e3fe674 100644
--- a/src/projspec/content/environment.py
+++ b/src/projspec/content/environment.py
@@ -79,11 +79,14 @@ def match(self) -> bool:
def parse(self) -> None:
import yaml
+ from projspec.artifact.python_env import CondaEnv
- u = self.proj.basenames.get(
- "environment.yaml", self.proj.basenames.get("environment.yml")
+ u = (
+ "environment.yaml"
+ if "environment.yaml" in self.proj.basenames
+ else "environment.yml"
)
- deps = yaml.safe_load(self.proj.fs.open(u, "rt"))
+ deps = yaml.safe_load(self.proj.get_file(u, text=True))
# TODO: split out pip deps
self.contents["environment"] = Environment(
stack=Stack.CONDA,
@@ -92,3 +95,6 @@ def parse(self) -> None:
channels=deps.get("channels"),
proj=self.proj,
)
+ self.artifacts["conda_env"] = CondaEnv(
+ proj=self.proj, fn=u, cmd=["conda", "env", "create", "-f", u]
+ )
diff --git a/src/projspec/proj/__init__.py b/src/projspec/proj/__init__.py
index b52d2da..929fb17 100644
--- a/src/projspec/proj/__init__.py
+++ b/src/projspec/proj/__init__.py
@@ -7,6 +7,7 @@
from projspec.proj.briefcase import Briefcase
from projspec.proj.conda_package import CondaRecipe, RattlerRecipe
from projspec.proj.conda_project import CondaProject
+from projspec.proj.data_dir import Data
from projspec.proj.datapackage import DataPackage, DVCRepo
from projspec.proj.documentation import RTD, MDBook
from projspec.proj.git import GitRepo
@@ -36,6 +37,7 @@
"Zenodo",
"CondaRecipe",
"CondaProject",
+ "Data",
"Golang",
"GitRepo",
"HelmChart",
diff --git a/src/projspec/proj/base.py b/src/projspec/proj/base.py
index 2d26c26..d519885 100644
--- a/src/projspec/proj/base.py
+++ b/src/projspec/proj/base.py
@@ -287,7 +287,7 @@ def pyproject(self):
def all_artifacts(self, names: str | None = None) -> list:
"""A flat list of all the artifact objects nested in this project."""
- arts = list(self.artifacts.values())
+ arts = list()
for spec in self.specs.values():
arts.extend(flatten(spec.artifacts))
for child in self.children.values():
diff --git a/src/projspec/proj/data_dir.py b/src/projspec/proj/data_dir.py
new file mode 100644
index 0000000..405fa60
--- /dev/null
+++ b/src/projspec/proj/data_dir.py
@@ -0,0 +1,757 @@
+"""ProjectSpec for bare data directories.
+
+Matches directories whose contents are predominantly data files (by extension or
+by a recognised on-disk layout such as Hive partitioning, Apache Iceberg, Delta
+Lake, or Zarr), with no requirement for any declarative metadata file.
+"""
+
+from __future__ import annotations
+
+import os
+import re
+from posixpath import basename as _basename
+
+from projspec.proj import ProjectSpec, ParseFailed
+from projspec.utils import AttrDict
+
+# ---------------------------------------------------------------------------
+# Extension → (canonical format name, modality)
+#
+# Modality vocabulary from intake's `structure` tags + napari's layer types:
+# "tabular" — row/column data
+# "array" — N-dimensional arrays
+# "image" — 2-D/3-D images (raster)
+# "timeseries" — time-indexed signals
+# "geospatial" — vector or raster geodata
+# "model" — ML model weights / configs
+# "nested" — hierarchical / JSON-like
+# "document" — human-readable documents
+# "video" — video streams
+# "archive" — compressed bundles
+#
+# .json is excluded — too common in non-data contexts (configs, manifests).
+# ---------------------------------------------------------------------------
#: Maps a lowercase file extension (with leading dot) to its canonical
#: (format name, modality) pair. Consumers must lowercase extensions before
#: lookup (see _fmt_from_path).
_EXT_TO_FORMAT: dict[str, tuple[str, str]] = {
    # Tabular / columnar -------------------------------------------------------
    ".csv": ("csv", "tabular"),
    ".tsv": ("tsv", "tabular"),
    ".psv": ("psv", "tabular"),
    ".parquet": ("parquet", "tabular"),
    ".parq": ("parquet", "tabular"),
    ".pq": ("parquet", "tabular"),
    ".arrow": ("arrow", "tabular"),
    ".ipc": ("arrow", "tabular"),
    ".feather": ("arrow", "tabular"),  # Feather v1/v2 (magic: FEA1 / ARROW1)
    ".orc": ("orc", "tabular"),
    ".avro": ("avro", "tabular"),
    ".xls": ("excel", "tabular"),
    ".xlsx": ("excel", "tabular"),
    ".xlsm": ("excel", "tabular"),
    ".xlsb": ("excel", "tabular"),
    ".jsonl": ("jsonlines", "tabular"),
    ".ndjson": ("jsonlines", "tabular"),
    ".db": ("sqlite", "tabular"),  # DuckDB / SQLite (disambiguated by magic)
    ".sqlite": ("sqlite", "tabular"),
    ".sqlitedb": ("sqlite", "tabular"),
    ".duckdb": ("duckdb", "tabular"),
    # Array / scientific -------------------------------------------------------
    ".npy": ("numpy", "array"),
    ".npz": ("numpy", "array"),
    ".hdf5": ("hdf5", "array"),
    ".hdf": ("hdf5", "array"),
    ".h5": ("hdf5", "array"),
    ".h4": ("hdf5", "array"),
    ".he5": ("hdf5", "array"),
    ".nc": ("netcdf", "array"),
    ".nc3": ("netcdf", "array"),
    ".nc4": ("netcdf", "array"),
    ".mat": ("matlab", "array"),
    ".fits": ("fits", "array"),
    ".grib": ("grib", "timeseries"),
    ".grb": ("grib", "timeseries"),
    ".grib2": ("grib", "timeseries"),
    ".grb2": ("grib", "timeseries"),
    ".asdf": ("asdf", "array"),
    ".zarr": ("zarr", "array"),
    # Image / biomedical imaging -----------------------------------------------
    ".png": ("png", "image"),
    ".jpg": ("jpeg", "image"),
    ".jpeg": ("jpeg", "image"),
    ".tif": ("tiff", "image"),  # also geotiff — ambiguous; image wins
    ".tiff": ("tiff", "image"),
    ".cog": ("tiff", "geospatial"),  # Cloud-Optimised GeoTIFF
    ".bmp": ("bmp", "image"),
    ".gif": ("gif", "image"),
    ".webp": ("webp", "image"),
    ".dcm": ("dicom", "image"),
    ".dicom": ("dicom", "image"),
    ".nii": ("nifti", "image"),
    ".nrrd": ("nrrd", "image"),
    ".nhdr": ("nrrd", "image"),
    ".mha": ("metaimage", "image"),
    ".mhd": ("metaimage", "image"),
    ".svs": ("svs", "image"),  # Aperio whole-slide image
    ".ndpi": ("ndpi", "image"),  # Hamamatsu whole-slide image
    ".scn": ("scn", "image"),  # Leica whole-slide image
    ".lsm": ("lsm", "image"),  # Zeiss confocal
    ".exr": ("exr", "image"),  # OpenEXR HDR
    ".qptiff": ("qptiff", "image"),  # PerkinElmer whole-slide
    # Geospatial ---------------------------------------------------------------
    ".shp": ("shapefile", "geospatial"),
    ".shx": ("shapefile", "geospatial"),
    ".dbf": ("shapefile", "geospatial"),
    ".geojson": ("geojson", "geospatial"),
    ".gpkg": ("geopackage", "geospatial"),
    ".fgb": ("flatgeobuf", "geospatial"),
    ".kml": ("kml", "geospatial"),
    ".pmtiles": ("pmtiles", "geospatial"),
    # Audio --------------------------------------------------------------------
    ".wav": ("wav", "timeseries"),
    ".flac": ("flac", "timeseries"),
    ".mp3": ("mp3", "timeseries"),
    ".ogg": ("ogg", "timeseries"),
    # Video --------------------------------------------------------------------
    ".mp4": ("mp4", "video"),
    ".avi": ("avi", "video"),
    ".mov": ("mov", "video"),
    ".mkv": ("mkv", "video"),
    ".webm": ("webm", "video"),
    # ML model weights ---------------------------------------------------------
    ".safetensors": ("safetensors", "model"),
    ".gguf": ("gguf", "model"),
    ".pt": ("pytorch", "model"),
    ".pth": ("pytorch", "model"),
    ".onnx": ("onnx", "model"),
    # NOTE(review): ".tfrecord" is the other common TFRecord extension —
    # confirm whether it should be listed alongside ".tfrec".
    ".tfrec": ("tfrecord", "model"),
    # Archive / bundle ---------------------------------------------------------
    ".pkl": ("pickle", "archive"),
    ".bin": ("binary", "archive"),
}

#: All recognised data extensions, as a frozenset for O(1) membership tests.
_DATA_EXTENSIONS: frozenset[str] = frozenset(_EXT_TO_FORMAT)
+
+# ---------------------------------------------------------------------------
+# Magic-byte signatures (format, modality, offset, bytes_pattern).
+#
+# Each entry: (format_str, modality_str, offset, pattern)
+# offset = int → match at that fixed byte offset
+# offset = None → scan anywhere in the first 1 KiB (re.search)
+#
+# Ordered from most-specific to least-specific (longer / more-offset patterns
+# first so they shadow shorter ones that match the same header).
+# ---------------------------------------------------------------------------
#: Magic-byte signatures, checked in order by _identify_by_magic.
#: Each entry is (format, modality, offset, pattern); offset None means
#: "scan anywhere in the first 1 KiB", an int means "match at that offset".
_MAGIC: list[tuple[str, str, int | None, bytes]] = [
    # Fixed-offset signatures
    ("dicom", "image", 128, b"DICM"),  # DICOM preamble
    ("nifti", "image", 344, b"ni1\x00"),  # NIfTI-1
    ("nifti", "image", 344, b"n+1\x00"),  # NIfTI-1 single file
    ("duckdb", "tabular", 8, b"DUCK"),
    ("safetensors", "model", 8, b"{"),  # SafeTensors JSON header
    ("wav", "timeseries", 8, b"WAVE"),  # RIFF…WAVE
    # Offset-0 signatures
    ("parquet", "tabular", 0, b"PAR1"),
    ("hdf5", "array", 0, b"\x89HDF"),
    ("netcdf", "array", 0, b"CDF\x01"),  # NetCDF classic
    ("netcdf", "array", 0, b"CDF\x02"),  # NetCDF-64bit
    ("orc", "tabular", 0, b"ORC"),
    ("avro", "tabular", 0, b"Obj\x01"),
    # NOTE(review): "ARROW1" is the Arrow IPC *file* format magic; the
    # stream format has no leading magic — confirm the comment's intent.
    ("arrow", "tabular", 0, b"ARROW1"),  # IPC stream
    ("arrow", "tabular", 0, b"FEA1"),  # Feather v1
    ("numpy", "array", 0, b"\x93NUMPY"),
    ("matlab", "array", 0, b"MATLAB"),
    ("fits", "array", 0, b"SIMPLE"),
    ("grib", "timeseries", 0, b"GRIB"),
    ("asdf", "array", 0, b"#ASDF"),
    ("flatgeobuf", "geospatial", 0, b"fgb"),
    ("gguf", "model", 0, b"GGUF"),
    ("png", "image", 0, b"\x89PNG"),
    ("jpeg", "image", 0, b"\xff\xd8\xff"),
    ("tiff", "image", 0, b"II*\x00"),  # little-endian TIFF
    ("tiff", "image", 0, b"MM\x00*"),  # big-endian TIFF
    ("sqlite", "tabular", 0, b"SQLite format"),
    ("shapefile", "geospatial", 0, b"\x00\x00\x27\x0a"),
    ("pmtiles", "geospatial", 0, b"PMTiles"),
]

# Regex that matches Hive-style partition directory names (e.g. "year=2024").
_HIVE_DIR_RE = re.compile(r"^[^=]+=.+$")
+
+
+# ---------------------------------------------------------------------------
+# Schema extraction helpers — all imports inside try/except ImportError so
+# that missing optional libraries never block parsing.
+# ---------------------------------------------------------------------------
+
+
def _read_schema(path: str, fmt: str, fs) -> dict | list:
    """Return a best-effort schema dict/list for *path*, or {} on any failure.

    Parameters
    ----------
    path:
        Location of the data file, opened through *fs*.
    fmt:
        Canonical format name (a value from ``_EXT_TO_FORMAT``).
    fs:
        fsspec-like filesystem exposing ``open()``.

    Every third-party reader is imported lazily inside its own branch; a
    missing library or any read error yields ``{}`` — schema extraction must
    never abort project parsing.
    """
    try:
        if fmt == "parquet":
            try:
                import pyarrow.parquet as pq

                # Only footer metadata is read; no row groups are decoded.
                with fs.open(path, "rb") as fh:
                    pf = pq.ParquetFile(fh)
                    return {field.name: str(field.type) for field in pf.schema_arrow}
            except ImportError:
                pass

        elif fmt == "arrow":
            try:
                import pyarrow.ipc as ipc

                with fs.open(path, "rb") as fh:
                    reader = ipc.open_file(fh)
                    return {field.name: str(field.type) for field in reader.schema}
            except ImportError:
                pass

        elif fmt == "hdf5":
            try:
                import h5py

                with fs.open(path, "rb") as fh:
                    with h5py.File(fh, "r") as ds:
                        return {
                            "variables": list(ds.keys()),
                            "attrs": dict(ds.attrs),
                        }
            except ImportError:
                pass

        elif fmt == "netcdf":
            try:
                import netCDF4 as nc  # type: ignore[import]

                # NOTE(review): fh.read() pulls the whole file into memory —
                # fine for small files, confirm acceptable for large stores.
                with fs.open(path, "rb") as fh:
                    ds = nc.Dataset("in-mem", memory=fh.read())
                    return {
                        "variables": list(ds.variables.keys()),
                        "dims": {k: len(v) for k, v in ds.dimensions.items()},
                    }
            except ImportError:
                # Fall back to xarray's lazy scipy engine when netCDF4 is absent.
                try:
                    import xarray as xr  # type: ignore[import]

                    with fs.open(path, "rb") as fh:
                        ds = xr.open_dataset(fh, engine="scipy")
                        # NOTE(review): Dataset.dims warns on recent xarray;
                        # ds.sizes is the forward-compatible spelling.
                        return {
                            "variables": list(ds.data_vars),
                            "dims": dict(ds.dims),
                        }
                except ImportError:
                    pass

        elif fmt in ("jpeg", "png", "bmp", "gif", "webp", "tiff"):
            try:
                from PIL import Image  # type: ignore[import]

                with fs.open(path, "rb") as fh:
                    img = Image.open(fh)
                    # Force full decode while the handle is still open.
                    img.load()
                    mode = img.mode
                    channels = len(img.getbands())
                    return {
                        "width": img.width,
                        "height": img.height,
                        "channels": channels,
                        "mode": mode,
                    }
            except ImportError:
                pass

        elif fmt in ("wav", "flac", "mp3", "ogg"):
            try:
                import soundfile as sf  # type: ignore[import]

                # Header-only read: no sample data is decoded.
                with fs.open(path, "rb") as fh:
                    info = sf.info(fh)
                    return {
                        "sample_rate": info.samplerate,
                        "channels": info.channels,
                        "frames": info.frames,
                    }
            except ImportError:
                pass

    except Exception:  # — never let schema extraction abort parsing
        pass

    return {}
+
+
+# ---------------------------------------------------------------------------
+# Helpers that work on the already-loaded filelist / basenames
+# ---------------------------------------------------------------------------
+
+
+def _filelist_dirs(filelist: list[dict]) -> list[dict]:
+ """Return only directory entries from a filelist."""
+ return [e for e in filelist if e.get("type", "") == "directory"]
+
+
+def _filelist_files(filelist: list[dict]) -> list[dict]:
+ """Return only file entries from a filelist."""
+ return [e for e in filelist if e.get("type", "") != "directory"]
+
+
def _fmt_from_path(path: str) -> tuple[str, str] | None:
    """Map *path*'s extension to (format, modality); None when unrecognised."""
    _, ext = os.path.splitext(path)
    return _EXT_TO_FORMAT.get(ext.lower())
+
+
def _identify_by_magic(path: str, fs) -> tuple[str, str] | None:
    """Return (format, modality) by probing *path*'s header bytes, or None.

    Reads up to 1 KiB. Entries in _MAGIC with an integer offset are matched
    at that fixed position; entries with offset None are searched anywhere
    in the header. _MAGIC's ordering (most-specific first) decides ties.
    """
    try:
        with fs.open(path, "rb") as fh:
            head = fh.read(1024)
    except Exception:
        return None

    for fmt, modality, offset, pattern in _MAGIC:
        if offset is None:
            matched = re.search(re.escape(pattern), head) is not None
        else:
            matched = head[offset : offset + len(pattern)] == pattern
        if matched:
            return fmt, modality
    return None
+
+
# Token that may vary across files in a series: digits, dashes, underscores, dots.
# Alphabetic variation (e.g. "users" vs "orders") disqualifies collation.
# Anchored ^...$: the *entire* variable middle segment extracted by
# _group_by_naming_series must be numeric/date-like.
_SERIES_VAR_RE = re.compile(r"^[\d\-_.]+$")
+
+
+def _common_affix(stems: list[str]) -> tuple[str, str]:
+ """Return the longest (prefix, suffix) shared by every stem in *stems*."""
+ if not stems:
+ return "", ""
+ prefix = os.path.commonprefix(stems)
+ # Reverse each stem to find common suffix via commonprefix trick
+ rev = [s[::-1] for s in stems]
+ suffix = os.path.commonprefix(rev)[::-1]
+ # Ensure prefix and suffix don't overlap (can happen with a single-char stem)
+ if len(prefix) + len(suffix) > min(len(s) for s in stems):
+ suffix = ""
+ return prefix, suffix
+
+
def _group_by_naming_series(entries: list[dict]) -> list[list[dict]]:
    """Partition same-format file *entries* into naming-series groups.

    Files collate into a single series when their basename stems share a
    non-trivial prefix or suffix and differ only in a segment made of
    digits, dashes, underscores, or dots (a counter or date component).
    A single file is trivially its own series; otherwise each file becomes
    a separate group.
    """
    if not entries:
        return []
    if len(entries) == 1:
        return [entries]

    stems = [os.path.splitext(_basename(entry["name"]))[0] for entry in entries]

    prefix, suffix = _common_affix(stems)
    plen, slen = len(prefix), len(suffix)

    # The variable middle of each stem, once the shared affixes are removed.
    middles = [
        stem[plen : len(stem) - slen] if slen else stem[plen:] for stem in stems
    ]

    # One series requires (1) a non-trivial shared prefix OR suffix, and
    # (2) every middle segment being numeric/date-like (or empty).
    affix_present = plen >= 1 or slen >= 1
    numeric_middles = all(mid == "" or _SERIES_VAR_RE.match(mid) for mid in middles)
    if affix_present and numeric_middles:
        return [entries]

    # Fallback: every file stands alone as its own resource.
    return [[entry] for entry in entries]
+
+
+# ---------------------------------------------------------------------------
+# Data spec
+# ---------------------------------------------------------------------------
+
+# Sentinel files / directories whose presence indicates a non-data project
+# type is also present in this directory. When any of these are found,
+# Data.parse() applies the byte-majority test instead of parsing
+# unconditionally.
+#
+# Notably absent: datapackage.json, catalog.yaml/yml, .dvc/ — those belong
+# to projspec.proj.datapackage and are treated as compatible companions.
#: Basenames whose presence marks a directory as some other project type;
#: see the module comment above for why datapackage/catalog files are absent.
_NON_DATA_SENTINELS: frozenset[str] = frozenset(
    {
        # Python
        "pyproject.toml",
        "setup.py",
        "setup.cfg",
        "hatch.toml",
        # Rust
        "Cargo.toml",
        # JavaScript / Node
        "package.json",
        # Go
        "go.mod",
        # Container / infra
        "Dockerfile",
        "docker-compose.yml",
        "docker-compose.yaml",
        # Helm
        "Chart.yaml",
        # Ruby / Java / .NET
        "Gemfile",
        "pom.xml",
        "build.gradle",
        # NOTE(review): unlike the other literal basenames this entry is a
        # glob pattern — confirm every consumer of this set handles patterns.
        "*.csproj",
        # R
        "DESCRIPTION",
        # Conda
        "environment.yml",
        "environment.yaml",
        "meta.yaml",
        # Pixi
        "pixi.toml",
        # Mkdocs / Sphinx / RTD
        "mkdocs.yml",
        "mkdocs.yaml",
        "conf.py",
        ".readthedocs.yaml",
        ".readthedocs.yml",
        # Scripts / notebooks that imply code-first dirs
        "Makefile",
    }
)
+
+
+class Data(ProjectSpec):
+ """A directory whose primary contents are data files.
+
+ Matches on any of:
+ - At least one file with an unambiguous data extension (CSV, Parquet, Arrow,
+ HDF5, images, audio, etc.) — without requiring a metadata sidecar.
+ - A recognised directory layout: Hive partitioning (``key=value/`` subdirs),
+ Apache Iceberg (``metadata/`` directory), Delta Lake (``_delta_log/``), or
+ a Zarr store (``.zattrs`` / ``.zgroup`` at the root).
+
+ Parsing behaviour
+ -----------------
+ If no non-datapackage project signals are present in the directory the spec
+ parses unconditionally. If sentinel files that indicate another project type
+ (``pyproject.toml``, ``Cargo.toml``, ``package.json``, …) are found, parsing
+ succeeds only when the majority of bytes in the root file listing belong to
+ recognised data files; otherwise ``ParseFailed`` is raised so that the
+ directory is not double-counted as both a code project and a data project.
+ """
+
+ spec_doc = "https://opencode.ai/docs" # placeholder — no single upstream spec
+
+ # ------------------------------------------------------------------
+ # match()
+ # ------------------------------------------------------------------
+
+ def match(self) -> bool:
+ # Fast path: structural layout signals (no file-content inspection needed)
+ if self._detect_layout():
+ return True
+ # Slow path: any top-level file with an unambiguous data extension
+ return any(
+ os.path.splitext(name)[1].lower() in _DATA_EXTENSIONS
+ for name in self.proj.basenames
+ )
+
+ # ------------------------------------------------------------------
+ # parse()
+ # ------------------------------------------------------------------
+
    def parse(self) -> None:
        """Build the ``data_resource`` content entry for this directory.

        Picks a parsing strategy from the detected layout (hive / iceberg /
        delta / zarr / flat) and stores either a single resource or an
        AttrDict of resources keyed by sanitised path.

        Raises
        ------
        ParseFailed
            When another project type's sentinels dominate the directory,
            or when no recognisable data files are found.
        """
        # NOTE(review): DataResource is not referenced in this body —
        # confirm the import is needed (helpers below construct resources).
        from projspec.content.data import (
            DataResource,
        )  # local import keeps startup fast

        # If non-datapackage project sentinels are present, only keep this
        # spec when data files account for the majority of bytes at the root.
        if self._has_non_data_sentinels():
            if not self._data_bytes_majority():
                raise ParseFailed(
                    "Non-data project sentinels found and data files are not "
                    "the majority of bytes — skipping Data spec"
                )

        layout = self._detect_layout()
        resources: list

        if layout in ("hive", "iceberg", "delta"):
            resources = self._parse_layout_dirs(layout)
            # Delta/Iceberg also commonly store data files at the root level
            # alongside the log/metadata directory; collect those too.
            if layout in ("iceberg", "delta"):
                root_resources = self._parse_flat()
                resources = resources + root_resources
        elif layout in ("zarr_store", "tiledarray"):
            resources = [self._parse_zarr_root()]
        else:
            resources = self._parse_flat()

        if not resources:
            raise ParseFailed("No recognisable data files found")

        if len(resources) == 1:
            self._contents["data_resource"] = resources[0]
        else:
            # _safe_key is defined elsewhere in this module (beyond this view).
            self._contents["data_resource"] = AttrDict(
                {_safe_key(r.path): r for r in resources}
            )
+
+ # ------------------------------------------------------------------
+ # Sentinel / byte-majority helpers
+ # ------------------------------------------------------------------
+
+ def _has_non_data_sentinels(self) -> bool:
+ """Return True if any non-datapackage project sentinel is present."""
+ basenames = self.proj.basenames
+ return any(name in _NON_DATA_SENTINELS for name in basenames)
+
+ def _data_bytes_majority(self) -> bool:
+ """Return True if data files account for >50 % of root-listing bytes.
+
+ Files with unknown / zero size are excluded from both totals so they
+ do not unfairly skew the ratio.
+ """
+ total_bytes = 0
+ data_bytes = 0
+ for entry in self.proj.filelist:
+ size = entry.get("size") or 0
+ if size <= 0:
+ continue
+ total_bytes += size
+ ext = os.path.splitext(entry["name"].rsplit("/", 1)[-1])[1].lower()
+ if ext in _DATA_EXTENSIONS:
+ data_bytes += size
+ if total_bytes == 0:
+ return False
+ return data_bytes > total_bytes / 2
+
+ # ------------------------------------------------------------------
+ # Layout detection
+ # ------------------------------------------------------------------
+
+ def _detect_layout(self) -> str:
+ """Return a layout string, or '' if none of the known layouts match.
+
+ Uses the `contains` sentinel approach from intake: certain well-known
+ files/directories at the root identify a directory as a logical dataset.
+ """
+ basenames = self.proj.basenames
+ # Zarr store: .zattrs, .zgroup, or zarr.json at the root
+ # (zarr.json is the Zarr v3 sentinel; .zattrs/.zgroup are v2)
+ if any(s in basenames for s in (".zattrs", ".zgroup", "zarr.json")):
+ return "zarr_store"
+ dir_names = {_basename(e["name"]) for e in _filelist_dirs(self.proj.filelist)}
+ # Delta Lake
+ if "_delta_log" in dir_names:
+ return "delta"
+ # TileDB array directory
+ if "__meta" in dir_names and "__schema" in dir_names:
+ return "tiledarray"
+ # Apache Iceberg: metadata/ directory present
+ if "metadata" in dir_names:
+ return "iceberg"
+ # Partitioned Parquet: _metadata sentinel file at root (written by Spark/Dask)
+ if "_metadata" in basenames:
+ return "iceberg"
+ # Hive: any top-level subdirectory whose name matches key=value
+ if any(_HIVE_DIR_RE.match(d) for d in dir_names):
+ return "hive"
+ return ""
+
+ # ------------------------------------------------------------------
+ # Parsing helpers
+ # ------------------------------------------------------------------
+
+ def _resource_from_entries(
+ self, entries: list[dict], fmt: str, modality: str, layout: str
+ ):
+ """Build a DataResource from a list of same-format file entries.
+
+ The ``path`` field is set to:
+
+ - Single file: the bare basename, e.g. ``"data.csv"``.
+ - Multi-file series: a glob pattern, e.g. ``"part*.csv"``, built from
+ the shared prefix/suffix of the basenames.
+ """
+ from projspec.content.data import DataResource
+
+ full_paths = [e["name"] for e in entries]
+ total_size = sum(e.get("size", 0) or 0 for e in entries)
+ sample_path = full_paths[0] if full_paths else ""
+ schema = _read_schema(sample_path, fmt, self.proj.fs) if sample_path else {}
+
+ ext = os.path.splitext(_basename(full_paths[0]))[1] if full_paths else ""
+
+ if len(entries) == 1:
+ path = _basename(full_paths[0]) or fmt
+ else:
+ stems = [os.path.splitext(_basename(p))[0] for p in full_paths]
+ prefix, suffix = _common_affix(stems)
+ stem_pattern = (prefix.rstrip("-_.") or fmt) + "*" + suffix
+ path = stem_pattern + ext
+
+ return DataResource(
+ proj=self.proj,
+ path=path,
+ format=fmt,
+ modality=modality,
+ layout=layout,
+ file_count=len(entries),
+ total_size=total_size,
+ schema=schema,
+ sample_path=sample_path,
+ )
+
+ def _parse_flat(self) -> list:
+ """Group top-level files by format and naming series.
+
+ Files of the same format are only collated into a single DataResource
+ when they share a consistent naming schema — i.e. their stems differ
+ only in a numeric or date-like segment (e.g. ``part0.csv``,
+ ``part1.csv`` or ``2024-02.tiff``, ``2024-03.tiff``). Files whose
+ stems vary in alphabetic content (e.g. ``users.csv``, ``orders.csv``)
+ each become their own DataResource.
+ """
+ from projspec.content.data import (
+ DataResource,
+ ) # (used via _resource_from_entries)
+
+ # First bucket by (fmt, modality)
+ fmt_groups: dict[tuple[str, str], list[dict]] = {}
+ for entry in _filelist_files(self.proj.filelist):
+ fmt_info = _fmt_from_path(entry["name"])
+ if fmt_info is None:
+ continue
+ fmt_groups.setdefault(fmt_info, []).append(entry)
+
+ resources = []
+ for (fmt, modality), entries in fmt_groups.items():
+ # Split each format-group into naming series
+ for series in _group_by_naming_series(entries):
+ resources.append(
+ self._resource_from_entries(series, fmt, modality, "flat")
+ )
+ return resources
+
+ def _parse_layout_dirs(self, layout: str) -> list:
+ """One DataResource per top-level subdirectory (partition / table root).
+
+ Within each subdirectory the dominant format is determined, then files
+ are checked for a consistent naming series before collating.
+ """
+ dir_entries = _filelist_dirs(self.proj.filelist)
+ resources = []
+ for dir_entry in dir_entries:
+ dir_path = dir_entry["name"]
+ dir_name = _basename(dir_path)
+ # Skip hidden/internal dirs for iceberg/delta
+ if layout in ("iceberg", "delta") and dir_name.startswith(
+ ("metadata", "_delta_log", "_")
+ ):
+ continue
+ # Enumerate files one level inside this subdirectory
+ try:
+ sub_filelist = self.proj.fs.ls(dir_path, detail=True)
+ except Exception:
+ continue
+
+ sub_files = _filelist_files(sub_filelist)
+ # Determine dominant (fmt, modality) by file count
+ fmt_counts: dict[tuple[str, str], int] = {}
+ for e in sub_files:
+ fmt_info = _fmt_from_path(e["name"])
+ if fmt_info:
+ fmt_counts[fmt_info] = fmt_counts.get(fmt_info, 0) + 1
+ if not fmt_counts:
+ continue
+ dominant = max(fmt_counts, key=lambda k: fmt_counts[k])
+ dominant_fmt, dominant_modality = dominant
+ dominant_files = [
+ e for e in sub_files if _fmt_from_path(e["name"]) == dominant
+ ]
+ resource = self._resource_from_entries(
+ dominant_files, dominant_fmt, dominant_modality, layout
+ )
+ # Override path with the directory basename + trailing slash
+ # (partition dirs are already logically grouped by the directory)
+ resource.path = dir_name + "/"
+ resources.append(resource)
+ return resources
+
+ def _parse_zarr_root(self):
+ """Describe the whole directory as a single array-store resource.
+
+ Used for Zarr stores and TileDB arrays — both are directory-as-dataset
+ layouts with no individual data files at the root.
+ """
+ from projspec.content.data import DataResource
+
+ url = self.proj.url
+ layout = self._detect_layout()
+ # TileDB directories are not Zarr; distinguish the format accordingly
+ if layout == "tiledarray":
+ fmt, modality = "tiledb", "array"
+ schema: dict | list = {}
+ else:
+ fmt, modality = "zarr", "array"
+ schema = {}
+ try:
+ import zarr # type: ignore[import]
+
+ store = zarr.open(url, mode="r")
+ schema = {
+ "arrays": list(store.array_keys()),
+ "groups": list(store.group_keys()),
+ "attrs": dict(store.attrs),
+ }
+ except (ImportError, Exception):
+ pass
+
+ total_size = sum(
+ e.get("size", 0) or 0 for e in _filelist_files(self.proj.filelist)
+ )
+ return DataResource(
+ proj=self.proj,
+ path=(_basename(url) or fmt) + "/",
+ format=fmt,
+ modality=modality,
+ layout=layout,
+ file_count=len(_filelist_files(self.proj.filelist)),
+ total_size=total_size,
+ schema=schema,
+ sample_path="",
+ )
+
+
+# ---------------------------------------------------------------------------
+# Utilities
+# ---------------------------------------------------------------------------
+
+
+def _safe_key(name: str) -> str:
+ """Convert an arbitrary name to a valid Python identifier for AttrDict keys."""
+ key = re.sub(r"[^0-9a-zA-Z_]", "_", name)
+ if key and key[0].isdigit():
+ key = "_" + key
+ return key or "_unnamed"
diff --git a/tests/test_data_html.py b/tests/test_data_html.py
new file mode 100644
index 0000000..2d6e6ea
--- /dev/null
+++ b/tests/test_data_html.py
@@ -0,0 +1,449 @@
+"""Tests for projspec.content.data_html — repr_text and repr_html.
+
+These tests use a mock DataResource to avoid needing real data files on disk
+for basic formatting checks, then run format-specific loader tests when the
+required optional libraries are available.
+"""
+
+from __future__ import annotations
+
+import io
+import os
+import tempfile
+from unittest.mock import MagicMock
+
+import pytest
+
+import projspec
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _make_dr(
+ path="mytable.parquet",
+ fmt="parquet",
+ modality="tabular",
+ layout="flat",
+ file_count=3,
+ total_size=1024 * 512,
+ schema=None,
+ sample_path="",
+ metadata=None,
+):
+ """Build a DataResource backed by a real Project (the repo root) but with
+ controlled field values."""
+ from projspec.content.data import DataResource
+
+ mock_proj = MagicMock(spec=projspec.Project)
+ # Use a real local filesystem via fsspec
+ import fsspec
+
+ mock_proj.fs = fsspec.filesystem("file")
+ mock_proj.url = "/tmp"
+
+ return DataResource(
+ proj=mock_proj,
+ path=path,
+ format=fmt,
+ modality=modality,
+ layout=layout,
+ file_count=file_count,
+ total_size=total_size,
+ schema=schema or {},
+ sample_path=sample_path,
+ metadata=metadata or {},
+ )
+
+
+# ---------------------------------------------------------------------------
+# repr_text tests
+# ---------------------------------------------------------------------------
+
+
+class TestReprText:
+ def test_basic_fields_present(self):
+ dr = _make_dr()
+ text = repr(dr)
+ assert "mytable.parquet" in text
+ assert "parquet" in text
+ assert "tabular" in text
+ assert "files=3" in text
+
+ def test_size_formatting(self):
+ dr = _make_dr(total_size=1024)
+ text = repr(dr)
+ assert "KB" in text or "B" in text
+
+ def test_size_zero(self):
+ dr = _make_dr(total_size=0)
+ text = repr(dr)
+ assert "unknown" in text
+
+ def test_schema_hint_dict(self):
+ dr = _make_dr(schema={"col_a": "int64", "col_b": "float32", "col_c": "str"})
+ text = repr(dr)
+ assert "col_a" in text
+
+ def test_schema_hint_many_fields(self):
+ schema = {f"col_{i}": "int64" for i in range(10)}
+ dr = _make_dr(schema=schema)
+ text = repr(dr)
+ assert "+7 more" in text
+
+ def test_schema_hint_list(self):
+ dr = _make_dr(schema=[{"name": "a"}, {"name": "b"}])
+ text = repr(dr)
+ assert "2 fields" in text
+
+ def test_non_flat_layout_shown(self):
+ dr = _make_dr(layout="hive")
+ text = repr(dr)
+ assert "hive" in text
+
+ def test_flat_layout_hidden(self):
+ dr = _make_dr(layout="flat")
+ text = repr(dr)
+ assert "layout" not in text
+
+ def test_no_modality(self):
+ dr = _make_dr(modality="")
+ text = repr(dr)
+ assert "modality" not in text
+
+ def test_single_line(self):
+ dr = _make_dr()
+ text = repr(dr)
+ assert "\n" not in text
+
+ def test_path_shown(self):
+ """repr_text must show the path field, not a separate name."""
+ dr = _make_dr(path="part*.csv")
+ text = repr(dr)
+ assert "part*.csv" in text
+
+ def test_dir_path_shown(self):
+ dr = _make_dr(path="year=2024/")
+ text = repr(dr)
+ assert "year=2024/" in text
+
+
+# ---------------------------------------------------------------------------
+# repr_html tests
+# ---------------------------------------------------------------------------
+
+
+class TestReprHtml:
+ def test_returns_string(self):
+ dr = _make_dr()
+ html = dr._repr_html_()
+ assert isinstance(html, str)
+ assert len(html) > 0
+
+ def test_contains_path(self):
+ dr = _make_dr(path="my_dataset.parquet")
+ html = dr._repr_html_()
+ assert "my_dataset.parquet" in html
+
+ def test_contains_glob_path(self):
+ dr = _make_dr(path="part*.parquet")
+ html = dr._repr_html_()
+ assert "part*.parquet" in html
+
+ def test_contains_dir_path(self):
+ dr = _make_dr(path="year=2024/")
+ html = dr._repr_html_()
+ assert "year=2024/" in html
+
+ def test_contains_format_badge(self):
+ dr = _make_dr(fmt="parquet")
+ html = dr._repr_html_()
+ assert "parquet" in html
+
+ def test_contains_modality_badge(self):
+ dr = _make_dr(modality="tabular")
+ html = dr._repr_html_()
+ assert "tabular" in html
+
+ def test_contains_file_count(self):
+ dr = _make_dr(file_count=7)
+ html = dr._repr_html_()
+ assert "7" in html
+
+ def test_contains_size(self):
+ dr = _make_dr(total_size=2048)
+ html = dr._repr_html_()
+ assert "KB" in html or "B" in html
+
+ def test_schema_dict_rendered(self):
+ dr = _make_dr(schema={"id": "int64", "name": "string"})
+ html = dr._repr_html_()
+ assert "id" in html
+ assert "int64" in html
+
+ def test_schema_list_of_dicts_rendered(self):
+ dr = _make_dr(
+ schema=[
+ {"name": "id", "type": "integer"},
+ {"name": "val", "type": "number"},
+ ]
+ )
+ html = dr._repr_html_()
+ assert "id" in html
+ assert "integer" in html
+
+ def test_schema_empty_no_details(self):
+ dr = _make_dr(schema={})
+ html = dr._repr_html_()
+ assert "Schema" not in html
+
+ def test_no_preview_section_without_sample_path(self):
+ dr = _make_dr(sample_path="")
+ html = dr._repr_html_()
+ assert "Preview" not in html
+
+ def test_layout_badge_shown_for_hive(self):
+ dr = _make_dr(layout="hive")
+ html = dr._repr_html_()
+ assert "hive" in html
+
+ def test_layout_badge_hidden_for_flat(self):
+ dr = _make_dr(layout="flat")
+ html = dr._repr_html_()
+ assert 'ps-badge-gray">flat<' not in html
+
+ def test_html_structure(self):
+ dr = _make_dr()
+ html = dr._repr_html_()
+ assert "ps-data-card" in html
+ assert "ps-data-card-header" in html
+ assert "ps-data-meta" in html
+
+ def test_icon_present_for_known_modality(self):
+ dr = _make_dr(modality="image")
+ html = dr._repr_html_()
+ # Image icon is 🖼 (🖼)
+ assert "🖼" in html
+
+ def test_icon_fallback_for_unknown_modality(self):
+ dr = _make_dr(modality="")
+ html = dr._repr_html_()
+ # Fallback icon 🗂
+ assert "🗂" in html
+
+ def test_large_schema_collapsed(self):
+ schema = {f"col_{i}": "int64" for i in range(20)}
+ dr = _make_dr(schema=schema)
+ html = dr._repr_html_()
+ # details element should NOT have open attribute when >8 fields
+        assert (
+            '<details class="ps-schema" open>' not in html
+        )
+
+ def test_small_schema_open(self):
+ schema = {f"col_{i}": "int64" for i in range(4)}
+ dr = _make_dr(schema=schema)
+ html = dr._repr_html_()
+ assert " with a dataframe class
+ assert "dataframe" in html or "ps-df-wrap" in html
+
+ def test_csv_preview_row_limit(self, tmp_path):
+ """Only _PREVIEW_ROWS rows of data should appear, not all 50."""
+ pytest.importorskip("pandas")
+ import pandas as pd
+
+ path = str(tmp_path / "big.csv")
+ pd.DataFrame({"v": range(50)}).to_csv(path, index=False)
+ dr = self._dr_for_file(path, "csv", "tabular")
+ html = dr._repr_html_()
+ # Extract just the preview section so CSS text doesn't interfere
+        preview_start = html.find('<details class="ps-preview"')
+        assert preview_start != -1, "no preview section found"
+        preview_html = html[preview_start:]
+        # The last row value (49) should not appear as a table cell
+        assert "<td>49</td>" not in preview_html
+
+ def test_parquet_preview(self, tmp_path):
+ pytest.importorskip("pyarrow")
+ import pyarrow as pa
+ import pyarrow.parquet as pq
+
+ path = str(tmp_path / "data.parquet")
+ table = pa.table({"a": [1, 2, 3], "b": ["x", "y", "z"]})
+ pq.write_table(table, path)
+ dr = self._dr_for_file(path, "parquet", "tabular")
+ html = dr._repr_html_()
+ assert "Preview" in html
+ assert "
1 MB threshold
+ np.save(path, np.zeros((512, 512), dtype="float64"))
+ dr = self._dr_for_file(path, "numpy", "array")
+ html = dr._repr_html_()
+ assert "(512, 512)" in html # shape shown
+ assert "float64" in html # dtype shown
+ # The data slice key ("preview") should NOT appear in the info table;
+ # check the table cell content rather than the CSS class names
+ assert ">preview<" not in html # no
preview
row
+
+
+# ---------------------------------------------------------------------------
+# fmt_size helper
+# ---------------------------------------------------------------------------
+
+
+def test_fmt_size():
+ from projspec.content.data_html import _fmt_size
+
+ assert _fmt_size(0) == "unknown"
+ assert _fmt_size(512) == "512 B"
+ assert "KB" in _fmt_size(2048)
+ assert "MB" in _fmt_size(2 * 1024 * 1024)
+ assert "GB" in _fmt_size(3 * 1024**3)
diff --git a/tests/test_data_project.py b/tests/test_data_project.py
new file mode 100644
index 0000000..9f71ff0
--- /dev/null
+++ b/tests/test_data_project.py
@@ -0,0 +1,362 @@
+import json
+import os
+
+import pytest
+
+import projspec
+from projspec.content.data import DataResource
+from projspec.utils import from_dict
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _data_project(tmp_path):
+ """Return a projspec.Project rooted at *tmp_path* (no walk needed)."""
+ return projspec.Project(str(tmp_path))
+
+
+# ---------------------------------------------------------------------------
+# Detection tests
+# ---------------------------------------------------------------------------
+
+
+class TestDataDetection:
+ def test_csv_detected(self, tmp_path):
+ (tmp_path / "data.csv").write_text("x,y\n1,2\n3,4\n")
+ proj = _data_project(tmp_path)
+ assert "data" in proj.specs
+
+ def test_parquet_detected(self, tmp_path):
+ pytest.importorskip("pyarrow")
+ import pyarrow as pa
+ import pyarrow.parquet as pq
+
+ pq.write_table(pa.table({"a": [1, 2]}), str(tmp_path / "t.parquet"))
+ proj = _data_project(tmp_path)
+ assert "data" in proj.specs
+
+ def test_no_data_files_not_detected(self, tmp_path):
+ (tmp_path / "README.md").write_text("hello")
+ (tmp_path / "config.json").write_text("{}")
+ proj = _data_project(tmp_path)
+ assert "data" not in proj.specs
+
+
+# ---------------------------------------------------------------------------
+# Parse / DataResource field tests
+# ---------------------------------------------------------------------------
+
+
+class TestDataParse:
+ def test_single_csv_resource(self, tmp_path):
+ (tmp_path / "sales.csv").write_text("col1,col2\n1,a\n2,b\n")
+ proj = _data_project(tmp_path)
+ dr = proj.specs["data"].contents["data_resource"]
+ assert isinstance(dr, DataResource)
+ assert dr.path == "sales.csv"
+ assert dr.format == "csv"
+ assert dr.modality == "tabular"
+ assert dr.file_count == 1
+
+ def test_series_collated_to_glob_path(self, tmp_path):
+ """part0.csv + part1.csv → path == 'part*.csv'"""
+ for i in range(3):
+ (tmp_path / f"part{i}.csv").write_text("x\n1\n")
+ proj = _data_project(tmp_path)
+ dr = proj.specs["data"].contents["data_resource"]
+ assert isinstance(dr, DataResource)
+ assert dr.path == "part*.csv"
+ assert dr.file_count == 3
+
+ def test_distinct_csv_files_separate_resources(self, tmp_path):
+ """users.csv and orders.csv differ alphabetically → two resources."""
+ (tmp_path / "users.csv").write_text("id\n1\n")
+ (tmp_path / "orders.csv").write_text("id\n1\n")
+ proj = _data_project(tmp_path)
+ dr_map = proj.specs["data"].contents["data_resource"]
+ # Two separate DataResource objects, keyed in an AttrDict
+ assert len(dr_map) == 2
+ paths = {dr_map[k].path for k in dr_map}
+ assert "users.csv" in paths
+ assert "orders.csv" in paths
+
+ def test_sample_path_is_full_path(self, tmp_path):
+ csv = tmp_path / "data.csv"
+ csv.write_text("x\n1\n")
+ proj = _data_project(tmp_path)
+ dr = proj.specs["data"].contents["data_resource"]
+ assert dr.sample_path == str(csv)
+
+ def test_total_size_nonzero(self, tmp_path):
+ content = "x,y\n" + "\n".join(f"{i},{i}" for i in range(20))
+ (tmp_path / "nums.csv").write_text(content)
+ proj = _data_project(tmp_path)
+ dr = proj.specs["data"].contents["data_resource"]
+ assert dr.total_size > 0
+
+
+# ---------------------------------------------------------------------------
+# Serialisation: to_dict
+# ---------------------------------------------------------------------------
+
+
+class TestDataResourceToDict:
+ def _make_dr(self, tmp_path):
+ (tmp_path / "items.csv").write_text("id,val\n1,a\n2,b\n")
+ proj = _data_project(tmp_path)
+ return proj.specs["data"].contents["data_resource"]
+
+ def test_compact_omits_klass(self, tmp_path):
+ dr = self._make_dr(tmp_path)
+ d = dr.to_dict(compact=True)
+ assert "klass" not in d
+
+ def test_compact_omits_html(self, tmp_path):
+ """compact=True is for human/console output — _html must be absent."""
+ dr = self._make_dr(tmp_path)
+ d = dr.to_dict(compact=True)
+ assert "_html" not in d
+
+
+# ---------------------------------------------------------------------------
+# Serialisation: from_dict round-trip
+# ---------------------------------------------------------------------------
+
+
+class TestDataResourceRoundTrip:
+ def _roundtrip(self, dr):
+ """Serialise to JSON and rehydrate, returning the new DataResource."""
+ d = dr.to_dict(compact=False)
+ js = json.dumps(d)
+ d2 = json.loads(js)
+ return from_dict(d2, proj=dr.proj)
+
+ def _make_dr(self, tmp_path):
+ (tmp_path / "orders.csv").write_text("order_id,amount\n1,99\n2,42\n")
+ proj = _data_project(tmp_path)
+ return proj.specs["data"].contents["data_resource"]
+
+ def test_roundtrip_returns_dataresource(self, tmp_path):
+ dr2 = self._roundtrip(self._make_dr(tmp_path))
+ assert isinstance(dr2, DataResource)
+
+ def test_roundtrip_preserves_path(self, tmp_path):
+ dr2 = self._roundtrip(self._make_dr(tmp_path))
+ assert dr2.path == "orders.csv"
+
+ def test_roundtrip_preserves_format(self, tmp_path):
+ dr2 = self._roundtrip(self._make_dr(tmp_path))
+ assert dr2.format == "csv"
+
+ def test_roundtrip_preserves_modality(self, tmp_path):
+ dr2 = self._roundtrip(self._make_dr(tmp_path))
+ assert dr2.modality == "tabular"
+
+ def test_roundtrip_preserves_file_count(self, tmp_path):
+ dr2 = self._roundtrip(self._make_dr(tmp_path))
+ assert dr2.file_count == 1
+
+ def test_roundtrip_preserves_total_size(self, tmp_path):
+ dr = self._make_dr(tmp_path)
+ dr2 = self._roundtrip(dr)
+ assert dr2.total_size == dr.total_size
+
+ def test_roundtrip_preserves_schema(self, tmp_path):
+ pytest.importorskip("pyarrow")
+ import pyarrow as pa, pyarrow.parquet as pq
+
+ pq.write_table(
+ pa.table({"col_a": [1, 2, 3], "col_b": ["x", "y", "z"]}),
+ str(tmp_path / "data.parquet"),
+ )
+ proj = _data_project(tmp_path)
+ dr = proj.specs["data"].contents["data_resource"]
+ dr2 = self._roundtrip(dr)
+ assert dr2.schema == dr.schema
+
+ def test_roundtrip_html_matches_original(self, tmp_path):
+ """_repr_html_() on the rehydrated object must equal the original render."""
+ dr = self._make_dr(tmp_path)
+ html_original = dr._repr_html_()
+ dr2 = self._roundtrip(dr)
+ assert dr2._repr_html_() == html_original
+
+ def test_roundtrip_html_cached_without_rerender(self, tmp_path):
+ """After from_dict the HTML is already in _html — no re-render occurs."""
+ dr = self._make_dr(tmp_path)
+ html_original = dr._repr_html_()
+ d = dr.to_dict(compact=False)
+ d2 = json.loads(json.dumps(d))
+ dr2 = from_dict(d2, proj=dr.proj)
+
+ # Confirm _html is set directly on the instance (not via lazy render)
+ assert (
+ "_html" in dr2.__dict__
+ ), "_html should be in instance __dict__ after from_dict"
+ assert dr2.__dict__["_html"] == html_original
+
+ def test_roundtrip_html_survives_missing_sample_path(self, tmp_path):
+ """After rehydration, _repr_html_() must work even if sample_path
+ no longer resolves (e.g. moved to a different machine)."""
+ dr = self._make_dr(tmp_path)
+ # Trigger render with a real file, then remove the file
+ html_original = dr._repr_html_()
+ os.remove(dr.sample_path)
+
+ dr2 = self._roundtrip(dr)
+ # sample_path is gone — but HTML was cached in the dict
+ assert dr2._repr_html_() == html_original
+
+
+# ---------------------------------------------------------------------------
+# Conditional parse: sentinel / byte-majority logic
+# ---------------------------------------------------------------------------
+
+
+class TestDataConditionalParse:
+ """Tests for the 'other project types present' guard in Data.parse()."""
+
+ # -- helpers --
+
+ def _big_csv(self, path, rows=500):
+ """Write a CSV large enough to dominate byte counts."""
+ content = "id,value\n" + "\n".join(f"{i},{i * 2}" for i in range(rows))
+ path.write_text(content)
+
+ # -- pure data directories (no sentinels) --
+
+ def test_pure_data_dir_no_sentinel(self, tmp_path):
+ """No sentinel → Data always parsed regardless of byte ratios."""
+ (tmp_path / "data.csv").write_text("x\n1\n")
+ proj = _data_project(tmp_path)
+ assert "data" in proj.specs
+
+ def test_datapackage_companion_not_a_sentinel(self, tmp_path):
+ """datapackage.json is a compatible companion — not a sentinel."""
+ self._big_csv(tmp_path / "data.csv")
+ (tmp_path / "datapackage.json").write_text('{"resources": []}')
+ proj = _data_project(tmp_path)
+ assert "data" in proj.specs
+
+ def test_dvc_companion_not_a_sentinel(self, tmp_path):
+ """catalog.yaml (IntakeCatalog / DVCRepo companion) is not a sentinel."""
+ self._big_csv(tmp_path / "data.csv")
+ (tmp_path / "catalog.yaml").write_text("sources: {}")
+ proj = _data_project(tmp_path)
+ assert "data" in proj.specs
+
+ # -- mixed dirs where data dominates --
+
+ def test_sentinel_present_data_majority(self, tmp_path):
+ """Sentinel present but data files are majority of bytes → Data parsed."""
+ self._big_csv(tmp_path / "data.csv") # large data file
+ (tmp_path / "pyproject.toml").write_text(
+ "[project]\nname='x'\n"
+ ) # tiny sentinel
+ proj = _data_project(tmp_path)
+ assert "data" in proj.specs
+
+ def test_sentinel_present_data_majority_parquet(self, tmp_path):
+ pytest.importorskip("pyarrow")
+ import pyarrow as pa, pyarrow.parquet as pq
+
+ pq.write_table(
+ pa.table({"x": list(range(1000)), "y": list(range(1000))}),
+ str(tmp_path / "data.parquet"),
+ )
+ (tmp_path / "Cargo.toml").write_text('[package]\nname="x"\n')
+ proj = _data_project(tmp_path)
+ assert "data" in proj.specs
+
+ # -- mixed dirs where non-data dominates --
+
+ def test_sentinel_present_code_majority(self, tmp_path):
+ """Sentinel present and code files dominate → Data spec suppressed."""
+ # Large Python source file
+ (tmp_path / "main.py").write_text("x = 1\n" * 5000)
+ # Tiny CSV
+ (tmp_path / "tiny.csv").write_text("a,b\n1,2\n")
+ (tmp_path / "pyproject.toml").write_text("[project]\nname='x'\n")
+ proj = _data_project(tmp_path)
+ assert "data" not in proj.specs
+
+ def test_sentinel_present_equal_split_not_majority(self, tmp_path):
+ """Exactly 50/50 bytes is not a majority — Data suppressed."""
+ payload = "x" * 1000
+ (tmp_path / "code.py").write_text(payload)
+ (tmp_path / "data.csv").write_text(payload)
+ (tmp_path / "pyproject.toml").write_text("[project]\nname='x'\n")
+ proj = _data_project(tmp_path)
+ assert "data" not in proj.specs
+
+ # -- helpers / unit tests for the private methods --
+
+ def test_has_non_data_sentinels_true(self, tmp_path):
+ from projspec.proj.data_dir import Data
+
+ (tmp_path / "data.csv").write_text("x\n1\n")
+ (tmp_path / "pyproject.toml").write_text("")
+ proj = projspec.Project.__new__(projspec.Project)
+ import fsspec
+
+ proj.fs = fsspec.filesystem("file")
+ proj.url = str(tmp_path)
+ proj.__dict__["basenames"] = {
+ e["name"].rsplit("/", 1)[-1]: e["name"]
+ for e in proj.fs.ls(str(tmp_path), detail=True)
+ }
+ proj.__dict__["filelist"] = proj.fs.ls(str(tmp_path), detail=True)
+ inst = Data.__new__(Data)
+ inst.proj = proj
+ assert inst._has_non_data_sentinels() is True
+
+ def test_has_non_data_sentinels_false(self, tmp_path):
+ from projspec.proj.data_dir import Data
+
+ (tmp_path / "data.csv").write_text("x\n1\n")
+ proj = projspec.Project.__new__(projspec.Project)
+ import fsspec
+
+ proj.fs = fsspec.filesystem("file")
+ proj.url = str(tmp_path)
+ proj.__dict__["basenames"] = {
+ e["name"].rsplit("/", 1)[-1]: e["name"]
+ for e in proj.fs.ls(str(tmp_path), detail=True)
+ }
+ proj.__dict__["filelist"] = proj.fs.ls(str(tmp_path), detail=True)
+ inst = Data.__new__(Data)
+ inst.proj = proj
+ assert inst._has_non_data_sentinels() is False
+
+ def test_data_bytes_majority_true(self, tmp_path):
+ from projspec.proj.data_dir import Data
+
+ self._big_csv(tmp_path / "data.csv")
+ (tmp_path / "small.py").write_text("x = 1\n")
+ proj = projspec.Project.__new__(projspec.Project)
+ import fsspec
+
+ proj.fs = fsspec.filesystem("file")
+ proj.url = str(tmp_path)
+ proj.__dict__["filelist"] = proj.fs.ls(str(tmp_path), detail=True)
+ inst = Data.__new__(Data)
+ inst.proj = proj
+ assert inst._data_bytes_majority() is True
+
+ def test_data_bytes_majority_false(self, tmp_path):
+ from projspec.proj.data_dir import Data
+
+ (tmp_path / "main.py").write_text("x = 1\n" * 5000)
+ (tmp_path / "tiny.csv").write_text("a\n1\n")
+ proj = projspec.Project.__new__(projspec.Project)
+ import fsspec
+
+ proj.fs = fsspec.filesystem("file")
+ proj.url = str(tmp_path)
+ proj.__dict__["filelist"] = proj.fs.ls(str(tmp_path), detail=True)
+ inst = Data.__new__(Data)
+ inst.proj = proj
+ assert inst._data_bytes_majority() is False
diff --git a/vsextension/src/extension.ts b/vsextension/src/extension.ts
index 13ead47..12b3476 100644
--- a/vsextension/src/extension.ts
+++ b/vsextension/src/extension.ts
@@ -406,7 +406,7 @@ function getDetailsWebviewContent(projectBasename: string, projectUrl: string, p
const infoData = getInfoData();
// Keys that are internal implementation details and add no user-facing value
- const SKIP_KEYS = new Set(['klass', 'proc', 'storage_options', 'children', 'url']);
+ const SKIP_KEYS = new Set(['klass', 'proc', 'storage_options', 'children', 'url', '_html']);
// Classify what type of colour-coding a node should get based on where it sits in the tree
type NodeRole = 'spec' | 'content' | 'artifact' | 'field' | 'none';
@@ -422,6 +422,8 @@ function getDetailsWebviewContent(projectBasename: string, projectUrl: string, p
// For info popups:
infoData?: string | null;
itemType?: string;
+ // Pre-rendered HTML from a content object's _html field
+ htmlContent?: string;
}
function escapeHtml(s: string): string {
@@ -572,6 +574,9 @@ function getDetailsWebviewContent(projectBasename: string, projectUrl: string, p
children: children.length > 0 ? children : undefined,
infoData: nodeInfoData,
itemType: role !== 'none' && role !== 'field' ? role : undefined,
+ htmlContent: (role === 'content' && value && typeof value === 'object' && !Array.isArray(value) && typeof (value as any)._html === 'string')
+ ? (value as any)._html
+ : undefined,
});
}
@@ -620,6 +625,43 @@ function getDetailsWebviewContent(projectBasename: string, projectUrl: string, p
? `