diff --git a/qtapp/main.py b/qtapp/main.py index 3e4cd23..4c3fbeb 100644 --- a/qtapp/main.py +++ b/qtapp/main.py @@ -272,7 +272,9 @@ def _show_project_details(self, project_url: str, highlight_key: str = ""): if proj is None: return basename = project_url.split("/")[-1] or project_url - html = get_details_html(basename, project_url, proj.to_dict(), highlight_key) + html = get_details_html( + basename, project_url, proj.to_dict(compact=False), highlight_key + ) self.detail.setHtml(html) def _on_detail_message(self, msg: dict): @@ -335,7 +337,9 @@ def __init__(self, parent=None): def refresh(self, scroll_to: str | None = None): """Re-render the library HTML panel.""" - data = {url: proj.to_dict() for url, proj in library.entries.items()} + data = { + url: proj.to_dict(compact=False) for url, proj in library.entries.items() + } info_data = class_infos() spec_names = list(info_data.get("specs", {}).keys()) html = get_library_html(data, spec_names, scroll_to_project_url=scroll_to) diff --git a/qtapp/views.py b/qtapp/views.py index dbe4812..1b85b76 100644 --- a/qtapp/views.py +++ b/qtapp/views.py @@ -163,6 +163,16 @@ def _get_info_data() -> dict: border-radius: 50%; animation: spin 0.7s linear infinite; opacity: 0.8; } @keyframes spin { to { transform: rotate(360deg); } } + + .html-preview { + display: block; + width: 100%; + border: none; + margin-top: 4px; + margin-left: 20px; + min-height: 40px; + max-height: 600px; + } """ _INFO_POPUP_JS = """ @@ -815,7 +825,7 @@ def get_library_html( # Details panel # --------------------------------------------------------------------------- -_SKIP_KEYS = {"klass", "proc", "storage_options", "children", "url"} +_SKIP_KEYS = {"klass", "proc", "storage_options", "children", "url", "_html"} def _build_tooltip(doc, link): @@ -1002,6 +1012,13 @@ def scalar_label(v): "children": children or None, "infoData": node_info_data, "itemType": role if role not in ("none", "field") else None, + "htmlContent": ( + value["_html"] + if role == 
"content" + and isinstance(value, dict) + and isinstance(value.get("_html"), str) + else None + ), } ) @@ -1017,6 +1034,52 @@ def _is_leaf_artifact(node: dict) -> bool: return not any(c.get("role") == "artifact" for c in children) +# Dark console-green stylesheet injected into every html-preview srcdoc. +_HTML_PREVIEW_CSS = ( + "" +) + + def _render_detail_node(node: dict, depth: int) -> str: has_children = bool(node.get("children")) can_make = _is_leaf_artifact(node) @@ -1067,6 +1130,19 @@ def _render_detail_node(node: dict, depth: int) -> str: f'' ) + html_content = node.get("htmlContent") + if html_content: + srcdoc = ( + (_HTML_PREVIEW_CSS + html_content) + .replace("&", "&") + .replace('"', """) + .replace("<", "<") + .replace(">", ">") + ) + html_preview = f'' + else: + html_preview = "" + make_btn = ( f'' if can_make @@ -1085,6 +1161,7 @@ def _render_detail_node(node: dict, depth: int) -> str: {make_btn} {info_btn} + {html_preview} {children_html} """ diff --git a/src/projspec/__main__.py b/src/projspec/__main__.py index ad9eb04..b84fa07 100755 --- a/src/projspec/__main__.py +++ b/src/projspec/__main__.py @@ -89,16 +89,30 @@ def version(): help="List of spec types to ignore (comma-separated list in camel or snake case)", ) @click.option( - "--json-out", is_flag=True, default=False, help="JSON output, for projects only" + "--json-out", + is_flag=True, + default=False, + help="JSON output, for projects only", ) @click.option( - "--html-out", is_flag=True, default=False, help="HTML output, for projects only" + "--html-out", + is_flag=True, + default=False, + help="HTML output, for projects only", ) @click.option("--walk", is_flag=True, help="To descend into all child directories") @click.option("--summary", is_flag=True, help="Show abbreviated output") @click.option("--library", is_flag=True, help="Add to library") def scan( - path, storage_options, types, xtypes, json_out, html_out, walk, summary, library + path, + storage_options, + types, + xtypes, + 
json_out, + html_out, + walk, + summary, + library, ): """Scan the given path for projects, and display @@ -109,13 +123,17 @@ def scan( else: types = types.split(",") proj = projspec.Project( - path, storage_options=storage_options, types=types, xtypes=xtypes, walk=walk + path, + storage_options=storage_options, + types=types, + xtypes=xtypes, + walk=walk, ) if summary: print(proj.text_summary()) else: if json_out: - print(json.dumps(proj.to_dict(compact=True))) + print(json.dumps(proj.to_dict(compact=False))) elif html_out: print(proj._repr_html_()) else: @@ -199,14 +217,21 @@ def library(): @library.command("list") @click.option( - "--json-out", is_flag=True, default=False, help="JSON output, for projects only" + "--json-out", + is_flag=True, + default=False, + help="JSON output, for projects only", ) def list(json_out): from projspec.library import ProjectLibrary library = ProjectLibrary() if json_out: - print(json.dumps({k: v.to_dict() for k, v in library.entries.items()})) + print( + json.dumps( + {k: v.to_dict(compact=False) for k, v in library.entries.items()} + ) + ) else: for url in sorted(library.entries): proj = library.entries[url] diff --git a/src/projspec/content/data.py b/src/projspec/content/data.py index d67ff44..c7eddf7 100644 --- a/src/projspec/content/data.py +++ b/src/projspec/content/data.py @@ -1,4 +1,5 @@ """Contents specifying datasets""" + from dataclasses import dataclass, field from projspec.content import BaseContent @@ -33,3 +34,74 @@ class IntakeSource(BaseContent): # TODO: add better fields: args, driver/reader, metadata, description name: str + + +@dataclass +class DataResource(BaseContent): + """A data resource found inside a data-only directory. + + Describes one logical dataset — which may be a flat collection of files, a + Hive-partitioned tree, an Iceberg/Delta table, a Zarr store, or any other + recognised on-disk layout. 
+ + The ``path`` field is a human-readable basename that identifies the resource: + + - Single file: ``"data.csv"`` + - Multi-file series: ``"part*.parquet"`` (glob-style, common prefix + ``*`` + ext) + - Directory-as-dataset (Hive partition, Zarr store, …): ``"year=2024/"`` + + The ``modality`` field classifies the broad nature of the data using the + vocabulary established by intake's ``structure`` tags and napari's layer + type system: + + - ``"tabular"`` — row/column data (CSV, Parquet, ORC, Excel, …) + - ``"array"`` — N-dimensional arrays (NumPy, HDF5, NetCDF, Zarr, …) + - ``"image"`` — 2-D/3-D images (PNG, JPEG, TIFF, DICOM, NIfTI, …) + - ``"timeseries"`` — time-indexed signals (WAV, GRIB, …) + - ``"geospatial"`` — vector/raster geodata (Shapefile, GeoJSON, GeoTIFF, …) + - ``"model"`` — ML model weights (GGUF, SafeTensors, PyTorch, …) + - ``"nested"`` — hierarchical / JSON-like (Avro, YAML, XML, …) + - ``"document"`` — human-readable documents (PDF, DOCX, …) + - ``"video"`` — video streams (MP4, AVI, …) + - ``"archive"`` — compressed bundles (ZIP, tar.gz, …) + - ``""`` — unknown / mixed + + The ``schema`` field is format-specific: + + - Tabular (Parquet, Arrow, CSV, …): ``{column_name: dtype_str, …}`` + - Image / array: ``{"width": int, "height": int, "channels": int, "mode": str}`` + - Audio: ``{"sample_rate": int, "channels": int, "frames": int}`` + - HDF5 / Zarr / NetCDF: ``{"variables": [...], "dims": {...}, "attrs": {...}}`` + - Unknown / library not available: ``{}`` + """ + + path: str # basename (or glob pattern / dir/ ) identifying this resource + format: str # canonical format string, e.g. "parquet", "csv", "png", "hdf5" + modality: str = "" # broad data nature; see docstring for vocabulary + layout: str = "" # "flat"|"hive"|"iceberg"|"delta"|"zarr_store"|"tiledarray"|"" + file_count: int = 0 + total_size: int = 0 # bytes; 0 when unknown (e.g. 
remote FS without size info) + schema: dict | list = field(default_factory=dict) + # full path to one representative file, for use by preview loaders + sample_path: str = "" + metadata: dict = field(default_factory=dict) # catch-all extras + _html = None + + def __repr__(self) -> str: + from projspec.content.data_html import repr_text + + return repr_text(self) + + def _repr_html_(self) -> str: + """Jupyter rich display — returns cached HTML, rendering on first call.""" + if self._html is None: + from projspec.content.data_html import repr_html + + self._html = repr_html(self) + return self._html + + def to_dict(self, compact=False): + d = super().to_dict(compact=compact) + if not compact: + d["_html"] = self._repr_html_() + return d diff --git a/src/projspec/content/data_html.py b/src/projspec/content/data_html.py new file mode 100644 index 0000000..530fb60 --- /dev/null +++ b/src/projspec/content/data_html.py @@ -0,0 +1,632 @@ +"""Text and HTML representations for DataResource. + +``repr_text`` — plain-text one-liner for ``__repr__``. +``repr_html`` — rich HTML card for Jupyter's ``_repr_html_`` protocol. + +The HTML card has two sections: + +1. **Metadata table** — name, format, modality, layout, file count, total size, + schema (collapsed by default when it has many entries). + +2. **Preview** (optional) — a lightweight peek at the actual data using + whichever optional library is available for the format. The section is + silently omitted when no suitable loader can be imported. + +All loader imports are guarded with ``try/except ImportError`` so that the +representation degrades gracefully when optional dependencies are absent. 
+""" + +from __future__ import annotations + +import base64 +import html as _html +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from projspec.content.data import DataResource + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +_MODALITY_ICON: dict[str, str] = { + "tabular": "📊", # 📊 + "image": "🖼", # 🖼 + "array": "🧮", # 🧮 + "timeseries": "📈", # 📈 + "geospatial": "🌍", # 🌍 + "model": "🧠", # 🧠 + "nested": "📂", # 📂 + "document": "📄", # 📄 + "video": "🎬", # 🎬 + "archive": "📦", # 📦 + "": "🗂", # 🗂 +} + + +def _fmt_size(n: int) -> str: + """Human-readable byte count.""" + if n <= 0: + return "unknown" + for unit in ("B", "KB", "MB", "GB", "TB"): + if n < 1024: + return f"{n:.1f} {unit}" if unit != "B" else f"{n} B" + n /= 1024 # type: ignore[assignment] + return f"{n:.1f} PB" + + +def _esc(s: object) -> str: + return _html.escape(str(s)) + + +# --------------------------------------------------------------------------- +# Plain-text repr +# --------------------------------------------------------------------------- + + +def repr_text(dr: "DataResource") -> str: + """One-line text representation of a DataResource.""" + size = _fmt_size(dr.total_size) + schema_hint = "" + if isinstance(dr.schema, dict) and dr.schema: + keys = list(dr.schema)[:3] + extra = f", +{len(dr.schema) - 3} more" if len(dr.schema) > 3 else "" + schema_hint = f" [{', '.join(str(k) for k in keys)}{extra}]" + elif isinstance(dr.schema, list) and dr.schema: + schema_hint = f" [{len(dr.schema)} fields]" + + parts = [ + f"DataResource({dr.path!r}", + f"format={dr.format!r}", + ] + if dr.modality: + parts.append(f"modality={dr.modality!r}") + if dr.layout and dr.layout not in ("flat", ""): + parts.append(f"layout={dr.layout!r}") + parts.append(f"files={dr.file_count}") + parts.append(f"size={size}") + if schema_hint: + parts.append(f"schema={schema_hint.strip()}") + return ", 
".join(parts) + ")" + + +# --------------------------------------------------------------------------- +# HTML repr +# --------------------------------------------------------------------------- + +# No inline styles — class names are present for external styling by the +# host environment (Jupyter, VS Code webview, etc.). +_CARD_CSS = "" + + +def repr_html(dr: "DataResource") -> str: + """Rich HTML card representation of a DataResource.""" + icon = _MODALITY_ICON.get(dr.modality, _MODALITY_ICON[""]) + size_str = _fmt_size(dr.total_size) + + # ---- header ---- + modality_badge = ( + f'{_esc(dr.modality)}' if dr.modality else "" + ) + format_badge = f'{_esc(dr.format)}' + layout_badge = ( + f'{_esc(dr.layout)}' + if dr.layout and dr.layout not in ("flat", "") + else "" + ) + + header = ( + f'
' + f'{icon}' + f'{_esc(dr.path)}' + f"{modality_badge}{format_badge}{layout_badge}" + f"
" + ) + + # ---- metadata table ---- + meta_rows = [ + ("Files", str(dr.file_count)), + ("Total size", size_str), + ] + + meta_html_rows = "".join( + f"{_esc(k)}{v}" for k, v in meta_rows + ) + schema_html = _render_schema(dr.schema) + + meta_section = ( + f'
' + f"{meta_html_rows}
" + f"{schema_html}" + f"
" + ) + + # ---- preview ---- + preview_html = _build_preview(dr) + preview_section = "" + if preview_html: + preview_section = ( + f'
' + f'
Preview
' + f"{preview_html}" + f"
" + ) + + return ( + _CARD_CSS + + f'
' + + header + + meta_section + + preview_section + + "
" + ) + + +# --------------------------------------------------------------------------- +# Schema rendering +# --------------------------------------------------------------------------- + + +def _render_schema(schema: dict | list) -> str: + """Render schema as a collapsible HTML block.""" + if not schema: + return "" + + if isinstance(schema, dict): + # Tabular-style {col: dtype} or structural {"variables": [...], ...} + rows = "" + for k, v in schema.items(): + rows += f"{_esc(k)}{_esc(v)}" + table = ( + f'' + f"" + f"{rows}" + f"
FieldType / Value
" + ) + n = len(schema) + open_attr = "open" if n <= 8 else "" + return ( + f'
' + f'Schema ({n} {"field" if n == 1 else "fields"})' + f"{table}
" + ) + + if isinstance(schema, list): + # List-of-dicts (frictionless style) or plain list + if schema and isinstance(schema[0], dict): + # Render each dict as a row; use union of all keys as columns + all_keys: list[str] = [] + for item in schema: + for k in item: + if k not in all_keys: + all_keys.append(k) + header_row = "".join(f"{_esc(k)}" for k in all_keys) + body_rows = "" + for item in schema: + cells = "".join(f"{_esc(item.get(k, ''))}" for k in all_keys) + body_rows += f"{cells}" + table = ( + f'' + f"{header_row}{body_rows}
" + ) + else: + items_html = "".join(f"
  • {_esc(s)}
  • " for s in schema) + table = f"" + + n = len(schema) + open_attr = "open" if n <= 8 else "" + return ( + f'
    ' + f'Schema ({n} {"field" if n == 1 else "fields"})' + f"{table}
    " + ) + + return "" + + +# --------------------------------------------------------------------------- +# Preview builders — one function per modality family, all return HTML str +# or None when no loader is available. +# --------------------------------------------------------------------------- + +#: How many rows to show in tabular previews. +_PREVIEW_ROWS = 5 + + +def _obj_to_preview_html(obj) -> str: + """Return the richest HTML string available for *obj*. + + Tries ``_repr_html_()`` first (pandas DataFrame, polars DataFrame, xarray + Dataset, …), then falls back to ``__repr__``. The result is always + wrapped in a ``
    `` so callers can rely on valid HTML. + """ + if hasattr(obj, "_repr_html_"): + try: + h = obj._repr_html_() + if h: + return f'
    {h}
    ' + except Exception: + pass + return f'
    {_esc(repr(obj))}
    ' + + +def _build_preview(dr: "DataResource") -> str | None: + """Return an HTML preview fragment, or None if not possible.""" + fmt = dr.format + modality = dr.modality + sample = dr.sample_path if dr.sample_path else None + + if sample is None: + return None + + if modality == "tabular": + return _preview_tabular(dr, sample) + if modality == "image": + return _preview_image(dr, sample) + if modality == "array": + return _preview_array(dr, sample) + if modality == "timeseries" and fmt in ("wav", "flac", "mp3", "ogg"): + return _preview_audio(dr, sample) + return None + + +# --- tabular --- + + +def _preview_tabular(dr: "DataResource", path: str) -> str | None: + fmt = dr.format + fs = dr.proj.fs + + try: + if fmt == "parquet": + return _preview_parquet(fs, path) + if fmt == "csv": + return _preview_csv(fs, path) + if fmt in ("tsv", "psv"): + sep = "\t" if fmt == "tsv" else "|" + return _preview_csv(fs, path, sep=sep) + if fmt == "arrow": + return _preview_arrow(fs, path) + if fmt == "jsonlines": + return _preview_jsonlines(fs, path) + if fmt == "excel": + return _preview_excel(fs, path) + if fmt in ("sqlite", "duckdb"): + return _preview_sql(fs, path, fmt) + if fmt == "orc": + return _preview_orc(fs, path) + except Exception: + pass + return None + + +def _preview_parquet(fs, path: str) -> str | None: + """Read only the first row group (or N rows from it) — no full file scan.""" + try: + import pyarrow.parquet as pq + + with fs.open(path, "rb") as fh: + pf = pq.ParquetFile(fh) + # read_row_group reads one row group's pages, not the whole file + batch = pf.read_row_group(0) + if batch.num_rows > _PREVIEW_ROWS: + batch = batch.slice(0, _PREVIEW_ROWS) + # Convert to pandas so we get _repr_html_() for free + df = batch.to_pandas() + return _obj_to_preview_html(df) + except ImportError: + pass + try: + # polars can read a row-count-limited slice without decoding the rest + import polars as pl + + with fs.open(path, "rb") as fh: + df = pl.read_parquet(fh, 
n_rows=_PREVIEW_ROWS) + return _obj_to_preview_html(df) + except ImportError: + pass + return None + + +def _preview_csv(fs, path: str, sep: str = ",") -> str | None: + # pandas nrows= stops parsing after N data lines — minimal I/O + try: + import pandas as pd + + with fs.open(path, "r", encoding="utf-8", errors="replace") as fh: + df = pd.read_csv(fh, sep=sep, nrows=_PREVIEW_ROWS) + return _obj_to_preview_html(df) + except ImportError: + pass + try: + import polars as pl + + with fs.open(path, "rb") as fh: + df = pl.read_csv(fh, n_rows=_PREVIEW_ROWS, separator=sep) + return _obj_to_preview_html(df) + except ImportError: + pass + return None + + +def _preview_arrow(fs, path: str) -> str | None: + """Read only the first record batch — no full file deserialisation.""" + try: + import pyarrow.ipc as ipc + + with fs.open(path, "rb") as fh: + try: + # IPC file format: random-access; read just batch 0 + reader = ipc.open_file(fh) + batch = reader.get_batch(0) + except Exception: + fh.seek(0) + # IPC stream format: sequential; read just the first batch + reader = ipc.open_stream(fh) + batch = reader.read_next_batch() + if batch.num_rows > _PREVIEW_ROWS: + batch = batch.slice(0, _PREVIEW_ROWS) + df = batch.to_pandas() + return _obj_to_preview_html(df) + except ImportError: + pass + return None + + +def _preview_jsonlines(fs, path: str) -> str | None: + # pandas nrows= stops reading after N lines + try: + import pandas as pd + + with fs.open(path, "r", encoding="utf-8", errors="replace") as fh: + df = pd.read_json(fh, lines=True, nrows=_PREVIEW_ROWS) + return _obj_to_preview_html(df) + except ImportError: + pass + return None + + +def _preview_excel(fs, path: str) -> str | None: + # nrows= limits rows read from the sheet + try: + import pandas as pd + + with fs.open(path, "rb") as fh: + df = pd.read_excel(fh, nrows=_PREVIEW_ROWS) + return _obj_to_preview_html(df) + except ImportError: + pass + return None + + +def _preview_sql(fs, path: str, fmt: str) -> str | None: + # 
SQLite/DuckDB: only works with a local path (not a remote FS) + try: + if getattr(fs, "protocol", "file") not in ("file", "local", ""): + return None + if fmt == "duckdb": + try: + import duckdb + + con = duckdb.connect(path, read_only=True) + tables = con.execute("SHOW TABLES").fetchall() + if not tables: + return None + tname = tables[0][0] + df = con.execute( + f'SELECT * FROM "{tname}" LIMIT {_PREVIEW_ROWS}' + ).fetchdf() + return _obj_to_preview_html(df) + except ImportError: + pass + else: + import sqlite3 + import pandas as pd + + con = sqlite3.connect(path) + cur = con.cursor() + cur.execute("SELECT name FROM sqlite_master WHERE type='table'") + tables = cur.fetchall() + if not tables: + return None + tname = tables[0][0] + df = pd.read_sql(f'SELECT * FROM "{tname}" LIMIT {_PREVIEW_ROWS}', con) + return _obj_to_preview_html(df) + except Exception: + pass + return None + + +def _preview_orc(fs, path: str) -> str | None: + try: + import pyarrow.orc as orc + + with fs.open(path, "rb") as fh: + table = orc.ORCFile(fh).read().slice(0, _PREVIEW_ROWS) + df = table.to_pandas() + return _obj_to_preview_html(df) + except ImportError: + pass + return None + + +# --- image --- + + +def _preview_image(dr: "DataResource", path: str) -> str | None: + try: + from PIL import Image + import io + + fs = dr.proj.fs + with fs.open(path, "rb") as fh: + raw: bytes = fh.read() + + img = Image.open(io.BytesIO(raw)) + img.thumbnail((600, 200)) + + buf = io.BytesIO() + # Save as PNG for lossless display regardless of source format + rgb = img.convert("RGB") if img.mode not in ("RGB", "L", "RGBA") else img + rgb.save(buf, format="PNG") + b64 = base64.b64encode(buf.getvalue()).decode("ascii") + + w, h = img.size + schema = dr.schema if isinstance(dr.schema, dict) else {} + info = f"{schema.get('width', w)}×{schema.get('height', h)}" + if "mode" in schema: + info += f", mode={schema['mode']}" + + return ( + f'
    ' + f'
    {_esc(info)}
    ' + ) + except ImportError: + pass + except Exception: + pass + return None + + +# --- array --- + + +def _preview_array(dr: "DataResource", path: str) -> str | None: + fmt = dr.format + fs = dr.proj.fs + + if fmt == "numpy": + return _preview_numpy(fs, path) + if fmt == "hdf5": + return _preview_hdf5(fs, path) + if fmt == "netcdf": + return _preview_netcdf(fs, path) + if fmt == "zarr": + return _preview_zarr(dr) + return None + + +def _array_info_html(info: dict) -> str: + rows = "".join( + f"{_esc(k)}{_esc(v)}" + for k, v in info.items() + ) + return f'{rows}
    ' + + +def _preview_numpy(fs, path: str) -> str | None: + """Read only the .npy header to get shape/dtype, then load a minimal slice.""" + try: + import numpy as np + import numpy.lib.format as nf + import io + + with fs.open(path, "rb") as fh: + raw_header = fh.read(512) # header is always ≤ 512 bytes + + buf = io.BytesIO(raw_header) + nf.read_magic(buf) + # read_array_header_1_0 is the stable API across numpy versions; + # newer numpy also exposes read_array_header — try both. + try: + shape, _, dtype = nf.read_array_header_1_0(buf) + except AttributeError: + shape, _, dtype = nf.read_array_header(buf) # type: ignore[attr-defined] + + info: dict = {"shape": str(shape), "dtype": str(dtype)} + + # Load the full array only when it's small enough (≤ 1 MB heuristic) + # or when we can cheaply slice the first N rows. + try: + total_elements = 1 + for s in shape: + total_elements *= s + item_size = np.dtype(dtype).itemsize + if total_elements * item_size <= 1_048_576: + with fs.open(path, "rb") as fh: + arr = np.load(io.BytesIO(fh.read()), allow_pickle=False) + sliced = arr[:_PREVIEW_ROWS] if arr.ndim >= 1 else arr + info["preview"] = repr(sliced) + except Exception: + pass + + return _array_info_html(info) + except Exception: + pass + return None + + +def _preview_hdf5(fs, path: str) -> str | None: + """Open the HDF5 file and read only metadata — no array data loaded.""" + try: + import h5py + + with fs.open(path, "rb") as fh: + with h5py.File(fh, "r") as f: + keys = list(f.keys())[:8] + info: dict = {"top-level keys": ", ".join(keys) or "(none)"} + for k in keys[:3]: + obj = f[k] + if hasattr(obj, "shape"): + info[k] = f"shape={obj.shape}, dtype={obj.dtype}" + else: + info[k] = f"group ({len(obj)} members)" + return _array_info_html(info) + except ImportError: + pass + return None + + +def _preview_netcdf(fs, path: str) -> str | None: + """Open the dataset lazily (no data loaded) and render its repr.""" + try: + import xarray as xr + + with fs.open(path, "rb") as 
fh: + # engine="scipy" reads lazily; no array data is decoded here + ds = xr.open_dataset(fh, engine="scipy") + # xarray Dataset has a rich _repr_html_() + return _obj_to_preview_html(ds) + except ImportError: + pass + return None + + +def _preview_zarr(dr: "DataResource") -> str | None: + """Use the schema cached at parse time — zero extra I/O.""" + schema = dr.schema + if not schema or not isinstance(schema, dict): + return None + info = {} + if "arrays" in schema: + info["arrays"] = ", ".join(str(a) for a in schema["arrays"][:8]) or "(none)" + if "groups" in schema: + info["groups"] = ", ".join(str(g) for g in schema["groups"][:8]) or "(none)" + if "attrs" in schema: + info["attrs"] = str(dict(list(schema["attrs"].items())[:4])) + return _array_info_html(info) if info else None + + +# --- audio --- + + +def _preview_audio(dr: "DataResource", path: str) -> str | None: + """Read only the audio file header — no sample data loaded.""" + try: + import soundfile as sf + + fs = dr.proj.fs + with fs.open(path, "rb") as fh: + info = sf.info(fh) + details = { + "sample rate": f"{info.samplerate} Hz", + "channels": str(info.channels), + "duration": f"{info.frames / info.samplerate:.2f} s", + "format": info.format, + "subtype": info.subtype, + } + return _array_info_html(details) + except ImportError: + pass + return None diff --git a/src/projspec/content/environment.py b/src/projspec/content/environment.py index 1b727a4..e3fe674 100644 --- a/src/projspec/content/environment.py +++ b/src/projspec/content/environment.py @@ -79,11 +79,14 @@ def match(self) -> bool: def parse(self) -> None: import yaml + from projspec.artifact.python_env import CondaEnv - u = self.proj.basenames.get( - "environment.yaml", self.proj.basenames.get("environment.yml") + u = ( + "environment.yaml" + if "environment.yaml" in self.proj.basenames + else "environment.yml" ) - deps = yaml.safe_load(self.proj.fs.open(u, "rt")) + deps = yaml.safe_load(self.proj.get_file(u, text=True)) # TODO: split out 
pip deps self.contents["environment"] = Environment( stack=Stack.CONDA, @@ -92,3 +95,6 @@ def parse(self) -> None: channels=deps.get("channels"), proj=self.proj, ) + self.artifacts["conda_env"] = CondaEnv( + proj=self.proj, fn=u, cmd=["conda", "env", "create", "-f", u] + ) diff --git a/src/projspec/proj/__init__.py b/src/projspec/proj/__init__.py index b52d2da..929fb17 100644 --- a/src/projspec/proj/__init__.py +++ b/src/projspec/proj/__init__.py @@ -7,6 +7,7 @@ from projspec.proj.briefcase import Briefcase from projspec.proj.conda_package import CondaRecipe, RattlerRecipe from projspec.proj.conda_project import CondaProject +from projspec.proj.data_dir import Data from projspec.proj.datapackage import DataPackage, DVCRepo from projspec.proj.documentation import RTD, MDBook from projspec.proj.git import GitRepo @@ -36,6 +37,7 @@ "Zenodo", "CondaRecipe", "CondaProject", + "Data", "Golang", "GitRepo", "HelmChart", diff --git a/src/projspec/proj/base.py b/src/projspec/proj/base.py index 2d26c26..d519885 100644 --- a/src/projspec/proj/base.py +++ b/src/projspec/proj/base.py @@ -287,7 +287,7 @@ def pyproject(self): def all_artifacts(self, names: str | None = None) -> list: """A flat list of all the artifact objects nested in this project.""" - arts = list(self.artifacts.values()) + arts = list() for spec in self.specs.values(): arts.extend(flatten(spec.artifacts)) for child in self.children.values(): diff --git a/src/projspec/proj/data_dir.py b/src/projspec/proj/data_dir.py new file mode 100644 index 0000000..405fa60 --- /dev/null +++ b/src/projspec/proj/data_dir.py @@ -0,0 +1,757 @@ +"""ProjectSpec for bare data directories. + +Matches directories whose contents are predominantly data files (by extension or +by a recognised on-disk layout such as Hive partitioning, Apache Iceberg, Delta +Lake, or Zarr), with no requirement for any declarative metadata file. 
+""" + +from __future__ import annotations + +import os +import re +from posixpath import basename as _basename + +from projspec.proj import ProjectSpec, ParseFailed +from projspec.utils import AttrDict + +# --------------------------------------------------------------------------- +# Extension → (canonical format name, modality) +# +# Modality vocabulary from intake's `structure` tags + napari's layer types: +# "tabular" — row/column data +# "array" — N-dimensional arrays +# "image" — 2-D/3-D images (raster) +# "timeseries" — time-indexed signals +# "geospatial" — vector or raster geodata +# "model" — ML model weights / configs +# "nested" — hierarchical / JSON-like +# "document" — human-readable documents +# "video" — video streams +# "archive" — compressed bundles +# +# .json is excluded — too common in non-data contexts (configs, manifests). +# --------------------------------------------------------------------------- +_EXT_TO_FORMAT: dict[str, tuple[str, str]] = { + # Tabular / columnar ------------------------------------------------------- + ".csv": ("csv", "tabular"), + ".tsv": ("tsv", "tabular"), + ".psv": ("psv", "tabular"), + ".parquet": ("parquet", "tabular"), + ".parq": ("parquet", "tabular"), + ".pq": ("parquet", "tabular"), + ".arrow": ("arrow", "tabular"), + ".ipc": ("arrow", "tabular"), + ".feather": ("arrow", "tabular"), # Feather v1/v2 (magic: FEA1 / ARROW1) + ".orc": ("orc", "tabular"), + ".avro": ("avro", "tabular"), + ".xls": ("excel", "tabular"), + ".xlsx": ("excel", "tabular"), + ".xlsm": ("excel", "tabular"), + ".xlsb": ("excel", "tabular"), + ".jsonl": ("jsonlines", "tabular"), + ".ndjson": ("jsonlines", "tabular"), + ".db": ("sqlite", "tabular"), # DuckDB / SQLite (disambiguated by magic) + ".sqlite": ("sqlite", "tabular"), + ".sqlitedb": ("sqlite", "tabular"), + ".duckdb": ("duckdb", "tabular"), + # Array / scientific ------------------------------------------------------- + ".npy": ("numpy", "array"), + ".npz": ("numpy", "array"), + 
".hdf5": ("hdf5", "array"), + ".hdf": ("hdf5", "array"), + ".h5": ("hdf5", "array"), + ".h4": ("hdf5", "array"), + ".he5": ("hdf5", "array"), + ".nc": ("netcdf", "array"), + ".nc3": ("netcdf", "array"), + ".nc4": ("netcdf", "array"), + ".mat": ("matlab", "array"), + ".fits": ("fits", "array"), + ".grib": ("grib", "timeseries"), + ".grb": ("grib", "timeseries"), + ".grib2": ("grib", "timeseries"), + ".grb2": ("grib", "timeseries"), + ".asdf": ("asdf", "array"), + ".zarr": ("zarr", "array"), + # Image / biomedical imaging ----------------------------------------------- + ".png": ("png", "image"), + ".jpg": ("jpeg", "image"), + ".jpeg": ("jpeg", "image"), + ".tif": ("tiff", "image"), # also geotiff — ambiguous; image wins + ".tiff": ("tiff", "image"), + ".cog": ("tiff", "geospatial"), # Cloud-Optimised GeoTIFF + ".bmp": ("bmp", "image"), + ".gif": ("gif", "image"), + ".webp": ("webp", "image"), + ".dcm": ("dicom", "image"), + ".dicom": ("dicom", "image"), + ".nii": ("nifti", "image"), + ".nrrd": ("nrrd", "image"), + ".nhdr": ("nrrd", "image"), + ".mha": ("metaimage", "image"), + ".mhd": ("metaimage", "image"), + ".svs": ("svs", "image"), # Aperio whole-slide image + ".ndpi": ("ndpi", "image"), # Hamamatsu whole-slide image + ".scn": ("scn", "image"), # Leica whole-slide image + ".lsm": ("lsm", "image"), # Zeiss confocal + ".exr": ("exr", "image"), # OpenEXR HDR + ".qptiff": ("qptiff", "image"), # PerkinElmer whole-slide + # Geospatial --------------------------------------------------------------- + ".shp": ("shapefile", "geospatial"), + ".shx": ("shapefile", "geospatial"), + ".dbf": ("shapefile", "geospatial"), + ".geojson": ("geojson", "geospatial"), + ".gpkg": ("geopackage", "geospatial"), + ".fgb": ("flatgeobuf", "geospatial"), + ".kml": ("kml", "geospatial"), + ".pmtiles": ("pmtiles", "geospatial"), + # Audio -------------------------------------------------------------------- + ".wav": ("wav", "timeseries"), + ".flac": ("flac", "timeseries"), + ".mp3": ("mp3", 
"timeseries"), + ".ogg": ("ogg", "timeseries"), + # Video -------------------------------------------------------------------- + ".mp4": ("mp4", "video"), + ".avi": ("avi", "video"), + ".mov": ("mov", "video"), + ".mkv": ("mkv", "video"), + ".webm": ("webm", "video"), + # ML model weights --------------------------------------------------------- + ".safetensors": ("safetensors", "model"), + ".gguf": ("gguf", "model"), + ".pt": ("pytorch", "model"), + ".pth": ("pytorch", "model"), + ".onnx": ("onnx", "model"), + ".tfrec": ("tfrecord", "model"), + # Archive / bundle --------------------------------------------------------- + ".pkl": ("pickle", "archive"), + ".bin": ("binary", "archive"), +} + +_DATA_EXTENSIONS: frozenset[str] = frozenset(_EXT_TO_FORMAT) + +# --------------------------------------------------------------------------- +# Magic-byte signatures (format, modality, offset, bytes_pattern). +# +# Each entry: (format_str, modality_str, offset, pattern) +# offset = int → match at that fixed byte offset +# offset = None → scan anywhere in the first 1 KiB (re.search) +# +# Ordered from most-specific to least-specific (longer / more-offset patterns +# first so they shadow shorter ones that match the same header). 
+# --------------------------------------------------------------------------- +_MAGIC: list[tuple[str, str, int | None, bytes]] = [ + # Fixed-offset signatures + ("dicom", "image", 128, b"DICM"), # DICOM preamble + ("nifti", "image", 344, b"ni1\x00"), # NIfTI-1 + ("nifti", "image", 344, b"n+1\x00"), # NIfTI-1 single file + ("duckdb", "tabular", 8, b"DUCK"), + ("safetensors", "model", 8, b"{"), # SafeTensors JSON header + ("wav", "timeseries", 8, b"WAVE"), # RIFF…WAVE + # Offset-0 signatures + ("parquet", "tabular", 0, b"PAR1"), + ("hdf5", "array", 0, b"\x89HDF"), + ("netcdf", "array", 0, b"CDF\x01"), # NetCDF classic + ("netcdf", "array", 0, b"CDF\x02"), # NetCDF-64bit + ("orc", "tabular", 0, b"ORC"), + ("avro", "tabular", 0, b"Obj\x01"), + ("arrow", "tabular", 0, b"ARROW1"), # IPC stream + ("arrow", "tabular", 0, b"FEA1"), # Feather v1 + ("numpy", "array", 0, b"\x93NUMPY"), + ("matlab", "array", 0, b"MATLAB"), + ("fits", "array", 0, b"SIMPLE"), + ("grib", "timeseries", 0, b"GRIB"), + ("asdf", "array", 0, b"#ASDF"), + ("flatgeobuf", "geospatial", 0, b"fgb"), + ("gguf", "model", 0, b"GGUF"), + ("png", "image", 0, b"\x89PNG"), + ("jpeg", "image", 0, b"\xff\xd8\xff"), + ("tiff", "image", 0, b"II*\x00"), # little-endian TIFF + ("tiff", "image", 0, b"MM\x00*"), # big-endian TIFF + ("sqlite", "tabular", 0, b"SQLite format"), + ("shapefile", "geospatial", 0, b"\x00\x00\x27\x0a"), + ("pmtiles", "geospatial", 0, b"PMTiles"), +] + +# Regex that matches Hive-style partition directory names (e.g. "year=2024"). +_HIVE_DIR_RE = re.compile(r"^[^=]+=.+$") + + +# --------------------------------------------------------------------------- +# Schema extraction helpers — all imports inside try/except ImportError so +# that missing optional libraries never block parsing. 
+# --------------------------------------------------------------------------- + + +def _read_schema(path: str, fmt: str, fs) -> dict | list: + """Return a best-effort schema dict/list for *path*, or {} on any failure.""" + try: + if fmt == "parquet": + try: + import pyarrow.parquet as pq + + with fs.open(path, "rb") as fh: + pf = pq.ParquetFile(fh) + return {field.name: str(field.type) for field in pf.schema_arrow} + except ImportError: + pass + + elif fmt == "arrow": + try: + import pyarrow.ipc as ipc + + with fs.open(path, "rb") as fh: + reader = ipc.open_file(fh) + return {field.name: str(field.type) for field in reader.schema} + except ImportError: + pass + + elif fmt == "hdf5": + try: + import h5py + + with fs.open(path, "rb") as fh: + with h5py.File(fh, "r") as ds: + return { + "variables": list(ds.keys()), + "attrs": dict(ds.attrs), + } + except ImportError: + pass + + elif fmt == "netcdf": + try: + import netCDF4 as nc # type: ignore[import] + + with fs.open(path, "rb") as fh: + ds = nc.Dataset("in-mem", memory=fh.read()) + return { + "variables": list(ds.variables.keys()), + "dims": {k: len(v) for k, v in ds.dimensions.items()}, + } + except ImportError: + try: + import xarray as xr # type: ignore[import] + + with fs.open(path, "rb") as fh: + ds = xr.open_dataset(fh, engine="scipy") + return { + "variables": list(ds.data_vars), + "dims": dict(ds.dims), + } + except ImportError: + pass + + elif fmt in ("jpeg", "png", "bmp", "gif", "webp", "tiff"): + try: + from PIL import Image # type: ignore[import] + + with fs.open(path, "rb") as fh: + img = Image.open(fh) + img.load() + mode = img.mode + channels = len(img.getbands()) + return { + "width": img.width, + "height": img.height, + "channels": channels, + "mode": mode, + } + except ImportError: + pass + + elif fmt in ("wav", "flac", "mp3", "ogg"): + try: + import soundfile as sf # type: ignore[import] + + with fs.open(path, "rb") as fh: + info = sf.info(fh) + return { + "sample_rate": info.samplerate, + 
"channels": info.channels, + "frames": info.frames, + } + except ImportError: + pass + + except Exception: # — never let schema extraction abort parsing + pass + + return {} + + +# --------------------------------------------------------------------------- +# Helpers that work on the already-loaded filelist / basenames +# --------------------------------------------------------------------------- + + +def _filelist_dirs(filelist: list[dict]) -> list[dict]: + """Return only directory entries from a filelist.""" + return [e for e in filelist if e.get("type", "") == "directory"] + + +def _filelist_files(filelist: list[dict]) -> list[dict]: + """Return only file entries from a filelist.""" + return [e for e in filelist if e.get("type", "") != "directory"] + + +def _fmt_from_path(path: str) -> tuple[str, str] | None: + """Return (format, modality) for *path* by extension, or None if unknown.""" + ext = os.path.splitext(path)[1].lower() + return _EXT_TO_FORMAT.get(ext) + + +def _identify_by_magic(path: str, fs) -> tuple[str, str] | None: + """Return (format, modality) by probing *path*'s header bytes, or None. + + Reads up to 1 KiB. Checks fixed-offset patterns first (longer offsets + first, to avoid short patterns shadowing longer ones), then scans for + anywhere-patterns via re.search. + """ + try: + with fs.open(path, "rb") as fh: + head = fh.read(1024) + except Exception: + return None + + for fmt, modality, offset, pattern in _MAGIC: + if offset is None: + if re.search(re.escape(pattern), head): + return fmt, modality + else: + if head[offset : offset + len(pattern)] == pattern: + return fmt, modality + return None + + +# Token that may vary across files in a series: digits, dashes, underscores, dots. +# Alphabetic variation (e.g. "users" vs "orders") disqualifies collation. 
+_SERIES_VAR_RE = re.compile(r"^[\d\-_.]+$") + + +def _common_affix(stems: list[str]) -> tuple[str, str]: + """Return the longest (prefix, suffix) shared by every stem in *stems*.""" + if not stems: + return "", "" + prefix = os.path.commonprefix(stems) + # Reverse each stem to find common suffix via commonprefix trick + rev = [s[::-1] for s in stems] + suffix = os.path.commonprefix(rev)[::-1] + # Ensure prefix and suffix don't overlap (can happen with a single-char stem) + if len(prefix) + len(suffix) > min(len(s) for s in stems): + suffix = "" + return prefix, suffix + + +def _group_by_naming_series(entries: list[dict]) -> list[list[dict]]: + """Partition *entries* (same-format file list) into naming-series groups. + + Two or more files belong to the same series when their basenames (stems) + differ only in a contiguous segment that consists solely of digits, dashes, + underscores, or dots — i.e. a numeric counter or a date component. + + A single file is always its own series (trivially consistent). + + Returns a list of groups, each group being a non-empty list of entries that + share a common naming pattern. + """ + if len(entries) <= 1: + return [entries] if entries else [] + + # Compute stems once + stems = [os.path.splitext(_basename(e["name"]))[0] for e in entries] + + prefix, suffix = _common_affix(stems) + plen, slen = len(prefix), len(suffix) + + # Extract the variable middle segment for each stem + variables = [] + for stem in stems: + mid = stem[plen : len(stem) - slen if slen else len(stem)] + variables.append(mid) + + # All files form one series if: + # 1. There is a non-trivial shared prefix OR suffix (at least 1 char), AND + # 2. 
Every variable segment is numeric/date-like (no alphabetic chars) + has_affix = plen >= 1 or slen >= 1 + all_numeric_var = all(_SERIES_VAR_RE.match(v) or v == "" for v in variables) + + if has_affix and all_numeric_var: + return [entries] + + # Otherwise fall back: each file is its own "series" (separate resource) + return [[e] for e in entries] + + +# --------------------------------------------------------------------------- +# Data spec +# --------------------------------------------------------------------------- + +# Sentinel files / directories whose presence indicates a non-data project +# type is also present in this directory. When any of these are found, +# Data.parse() applies the byte-majority test instead of parsing +# unconditionally. +# +# Notably absent: datapackage.json, catalog.yaml/yml, .dvc/ — those belong +# to projspec.proj.datapackage and are treated as compatible companions. +_NON_DATA_SENTINELS: frozenset[str] = frozenset( + { + # Python + "pyproject.toml", + "setup.py", + "setup.cfg", + "hatch.toml", + # Rust + "Cargo.toml", + # JavaScript / Node + "package.json", + # Go + "go.mod", + # Container / infra + "Dockerfile", + "docker-compose.yml", + "docker-compose.yaml", + # Helm + "Chart.yaml", + # Ruby / Java / .NET + "Gemfile", + "pom.xml", + "build.gradle", + "*.csproj", + # R + "DESCRIPTION", + # Conda + "environment.yml", + "environment.yaml", + "meta.yaml", + # Pixi + "pixi.toml", + # Mkdocs / Sphinx / RTD + "mkdocs.yml", + "mkdocs.yaml", + "conf.py", + ".readthedocs.yaml", + ".readthedocs.yml", + # Scripts / notebooks that imply code-first dirs + "Makefile", + } +) + + +class Data(ProjectSpec): + """A directory whose primary contents are data files. + + Matches on any of: + - At least one file with an unambiguous data extension (CSV, Parquet, Arrow, + HDF5, images, audio, etc.) — without requiring a metadata sidecar. 
+ - A recognised directory layout: Hive partitioning (``key=value/`` subdirs), + Apache Iceberg (``metadata/`` directory), Delta Lake (``_delta_log/``), or + a Zarr store (``.zattrs`` / ``.zgroup`` at the root). + + Parsing behaviour + ----------------- + If no non-datapackage project signals are present in the directory the spec + parses unconditionally. If sentinel files that indicate another project type + (``pyproject.toml``, ``Cargo.toml``, ``package.json``, …) are found, parsing + succeeds only when the majority of bytes in the root file listing belong to + recognised data files; otherwise ``ParseFailed`` is raised so that the + directory is not double-counted as both a code project and a data project. + """ + + spec_doc = "https://opencode.ai/docs" # placeholder — no single upstream spec + + # ------------------------------------------------------------------ + # match() + # ------------------------------------------------------------------ + + def match(self) -> bool: + # Fast path: structural layout signals (no file-content inspection needed) + if self._detect_layout(): + return True + # Slow path: any top-level file with an unambiguous data extension + return any( + os.path.splitext(name)[1].lower() in _DATA_EXTENSIONS + for name in self.proj.basenames + ) + + # ------------------------------------------------------------------ + # parse() + # ------------------------------------------------------------------ + + def parse(self) -> None: + from projspec.content.data import ( + DataResource, + ) # local import keeps startup fast + + # If non-datapackage project sentinels are present, only keep this + # spec when data files account for the majority of bytes at the root. 
+ if self._has_non_data_sentinels(): + if not self._data_bytes_majority(): + raise ParseFailed( + "Non-data project sentinels found and data files are not " + "the majority of bytes — skipping Data spec" + ) + + layout = self._detect_layout() + resources: list + + if layout in ("hive", "iceberg", "delta"): + resources = self._parse_layout_dirs(layout) + # Delta/Iceberg also commonly store data files at the root level + # alongside the log/metadata directory; collect those too. + if layout in ("iceberg", "delta"): + root_resources = self._parse_flat() + resources = resources + root_resources + elif layout in ("zarr_store", "tiledarray"): + resources = [self._parse_zarr_root()] + else: + resources = self._parse_flat() + + if not resources: + raise ParseFailed("No recognisable data files found") + + if len(resources) == 1: + self._contents["data_resource"] = resources[0] + else: + self._contents["data_resource"] = AttrDict( + {_safe_key(r.path): r for r in resources} + ) + + # ------------------------------------------------------------------ + # Sentinel / byte-majority helpers + # ------------------------------------------------------------------ + + def _has_non_data_sentinels(self) -> bool: + """Return True if any non-datapackage project sentinel is present.""" + basenames = self.proj.basenames + return any(name in _NON_DATA_SENTINELS for name in basenames) + + def _data_bytes_majority(self) -> bool: + """Return True if data files account for >50 % of root-listing bytes. + + Files with unknown / zero size are excluded from both totals so they + do not unfairly skew the ratio. 
+ """ + total_bytes = 0 + data_bytes = 0 + for entry in self.proj.filelist: + size = entry.get("size") or 0 + if size <= 0: + continue + total_bytes += size + ext = os.path.splitext(entry["name"].rsplit("/", 1)[-1])[1].lower() + if ext in _DATA_EXTENSIONS: + data_bytes += size + if total_bytes == 0: + return False + return data_bytes > total_bytes / 2 + + # ------------------------------------------------------------------ + # Layout detection + # ------------------------------------------------------------------ + + def _detect_layout(self) -> str: + """Return a layout string, or '' if none of the known layouts match. + + Uses the `contains` sentinel approach from intake: certain well-known + files/directories at the root identify a directory as a logical dataset. + """ + basenames = self.proj.basenames + # Zarr store: .zattrs, .zgroup, or zarr.json at the root + # (zarr.json is the Zarr v3 sentinel; .zattrs/.zgroup are v2) + if any(s in basenames for s in (".zattrs", ".zgroup", "zarr.json")): + return "zarr_store" + dir_names = {_basename(e["name"]) for e in _filelist_dirs(self.proj.filelist)} + # Delta Lake + if "_delta_log" in dir_names: + return "delta" + # TileDB array directory + if "__meta" in dir_names and "__schema" in dir_names: + return "tiledarray" + # Apache Iceberg: metadata/ directory present + if "metadata" in dir_names: + return "iceberg" + # Partitioned Parquet: _metadata sentinel file at root (written by Spark/Dask) + if "_metadata" in basenames: + return "iceberg" + # Hive: any top-level subdirectory whose name matches key=value + if any(_HIVE_DIR_RE.match(d) for d in dir_names): + return "hive" + return "" + + # ------------------------------------------------------------------ + # Parsing helpers + # ------------------------------------------------------------------ + + def _resource_from_entries( + self, entries: list[dict], fmt: str, modality: str, layout: str + ): + """Build a DataResource from a list of same-format file entries. 
+ + The ``path`` field is set to: + + - Single file: the bare basename, e.g. ``"data.csv"``. + - Multi-file series: a glob pattern, e.g. ``"part*.csv"``, built from + the shared prefix/suffix of the basenames. + """ + from projspec.content.data import DataResource + + full_paths = [e["name"] for e in entries] + total_size = sum(e.get("size", 0) or 0 for e in entries) + sample_path = full_paths[0] if full_paths else "" + schema = _read_schema(sample_path, fmt, self.proj.fs) if sample_path else {} + + ext = os.path.splitext(_basename(full_paths[0]))[1] if full_paths else "" + + if len(entries) == 1: + path = _basename(full_paths[0]) or fmt + else: + stems = [os.path.splitext(_basename(p))[0] for p in full_paths] + prefix, suffix = _common_affix(stems) + stem_pattern = (prefix.rstrip("-_.") or fmt) + "*" + suffix + path = stem_pattern + ext + + return DataResource( + proj=self.proj, + path=path, + format=fmt, + modality=modality, + layout=layout, + file_count=len(entries), + total_size=total_size, + schema=schema, + sample_path=sample_path, + ) + + def _parse_flat(self) -> list: + """Group top-level files by format and naming series. + + Files of the same format are only collated into a single DataResource + when they share a consistent naming schema — i.e. their stems differ + only in a numeric or date-like segment (e.g. ``part0.csv``, + ``part1.csv`` or ``2024-02.tiff``, ``2024-03.tiff``). Files whose + stems vary in alphabetic content (e.g. ``users.csv``, ``orders.csv``) + each become their own DataResource. 
+ """ + from projspec.content.data import ( + DataResource, + ) # (used via _resource_from_entries) + + # First bucket by (fmt, modality) + fmt_groups: dict[tuple[str, str], list[dict]] = {} + for entry in _filelist_files(self.proj.filelist): + fmt_info = _fmt_from_path(entry["name"]) + if fmt_info is None: + continue + fmt_groups.setdefault(fmt_info, []).append(entry) + + resources = [] + for (fmt, modality), entries in fmt_groups.items(): + # Split each format-group into naming series + for series in _group_by_naming_series(entries): + resources.append( + self._resource_from_entries(series, fmt, modality, "flat") + ) + return resources + + def _parse_layout_dirs(self, layout: str) -> list: + """One DataResource per top-level subdirectory (partition / table root). + + Within each subdirectory the dominant format is determined, then files + are checked for a consistent naming series before collating. + """ + dir_entries = _filelist_dirs(self.proj.filelist) + resources = [] + for dir_entry in dir_entries: + dir_path = dir_entry["name"] + dir_name = _basename(dir_path) + # Skip hidden/internal dirs for iceberg/delta + if layout in ("iceberg", "delta") and dir_name.startswith( + ("metadata", "_delta_log", "_") + ): + continue + # Enumerate files one level inside this subdirectory + try: + sub_filelist = self.proj.fs.ls(dir_path, detail=True) + except Exception: + continue + + sub_files = _filelist_files(sub_filelist) + # Determine dominant (fmt, modality) by file count + fmt_counts: dict[tuple[str, str], int] = {} + for e in sub_files: + fmt_info = _fmt_from_path(e["name"]) + if fmt_info: + fmt_counts[fmt_info] = fmt_counts.get(fmt_info, 0) + 1 + if not fmt_counts: + continue + dominant = max(fmt_counts, key=lambda k: fmt_counts[k]) + dominant_fmt, dominant_modality = dominant + dominant_files = [ + e for e in sub_files if _fmt_from_path(e["name"]) == dominant + ] + resource = self._resource_from_entries( + dominant_files, dominant_fmt, dominant_modality, layout + ) + 
# Override path with the directory basename + trailing slash + # (partition dirs are already logically grouped by the directory) + resource.path = dir_name + "/" + resources.append(resource) + return resources + + def _parse_zarr_root(self): + """Describe the whole directory as a single array-store resource. + + Used for Zarr stores and TileDB arrays — both are directory-as-dataset + layouts with no individual data files at the root. + """ + from projspec.content.data import DataResource + + url = self.proj.url + layout = self._detect_layout() + # TileDB directories are not Zarr; distinguish the format accordingly + if layout == "tiledarray": + fmt, modality = "tiledb", "array" + schema: dict | list = {} + else: + fmt, modality = "zarr", "array" + schema = {} + try: + import zarr # type: ignore[import] + + store = zarr.open(url, mode="r") + schema = { + "arrays": list(store.array_keys()), + "groups": list(store.group_keys()), + "attrs": dict(store.attrs), + } + except (ImportError, Exception): + pass + + total_size = sum( + e.get("size", 0) or 0 for e in _filelist_files(self.proj.filelist) + ) + return DataResource( + proj=self.proj, + path=(_basename(url) or fmt) + "/", + format=fmt, + modality=modality, + layout=layout, + file_count=len(_filelist_files(self.proj.filelist)), + total_size=total_size, + schema=schema, + sample_path="", + ) + + +# --------------------------------------------------------------------------- +# Utilities +# --------------------------------------------------------------------------- + + +def _safe_key(name: str) -> str: + """Convert an arbitrary name to a valid Python identifier for AttrDict keys.""" + key = re.sub(r"[^0-9a-zA-Z_]", "_", name) + if key and key[0].isdigit(): + key = "_" + key + return key or "_unnamed" diff --git a/tests/test_data_html.py b/tests/test_data_html.py new file mode 100644 index 0000000..2d6e6ea --- /dev/null +++ b/tests/test_data_html.py @@ -0,0 +1,449 @@ +"""Tests for projspec.content.data_html — repr_text 
and repr_html. + +These tests use a mock DataResource to avoid needing real data files on disk +for basic formatting checks, then run format-specific loader tests when the +required optional libraries are available. +""" + +from __future__ import annotations + +import io +import os +import tempfile +from unittest.mock import MagicMock + +import pytest + +import projspec + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_dr( + path="mytable.parquet", + fmt="parquet", + modality="tabular", + layout="flat", + file_count=3, + total_size=1024 * 512, + schema=None, + sample_path="", + metadata=None, +): + """Build a DataResource backed by a real Project (the repo root) but with + controlled field values.""" + from projspec.content.data import DataResource + + mock_proj = MagicMock(spec=projspec.Project) + # Use a real local filesystem via fsspec + import fsspec + + mock_proj.fs = fsspec.filesystem("file") + mock_proj.url = "/tmp" + + return DataResource( + proj=mock_proj, + path=path, + format=fmt, + modality=modality, + layout=layout, + file_count=file_count, + total_size=total_size, + schema=schema or {}, + sample_path=sample_path, + metadata=metadata or {}, + ) + + +# --------------------------------------------------------------------------- +# repr_text tests +# --------------------------------------------------------------------------- + + +class TestReprText: + def test_basic_fields_present(self): + dr = _make_dr() + text = repr(dr) + assert "mytable.parquet" in text + assert "parquet" in text + assert "tabular" in text + assert "files=3" in text + + def test_size_formatting(self): + dr = _make_dr(total_size=1024) + text = repr(dr) + assert "KB" in text or "B" in text + + def test_size_zero(self): + dr = _make_dr(total_size=0) + text = repr(dr) + assert "unknown" in text + + def test_schema_hint_dict(self): + dr = 
_make_dr(schema={"col_a": "int64", "col_b": "float32", "col_c": "str"}) + text = repr(dr) + assert "col_a" in text + + def test_schema_hint_many_fields(self): + schema = {f"col_{i}": "int64" for i in range(10)} + dr = _make_dr(schema=schema) + text = repr(dr) + assert "+7 more" in text + + def test_schema_hint_list(self): + dr = _make_dr(schema=[{"name": "a"}, {"name": "b"}]) + text = repr(dr) + assert "2 fields" in text + + def test_non_flat_layout_shown(self): + dr = _make_dr(layout="hive") + text = repr(dr) + assert "hive" in text + + def test_flat_layout_hidden(self): + dr = _make_dr(layout="flat") + text = repr(dr) + assert "layout" not in text + + def test_no_modality(self): + dr = _make_dr(modality="") + text = repr(dr) + assert "modality" not in text + + def test_single_line(self): + dr = _make_dr() + text = repr(dr) + assert "\n" not in text + + def test_path_shown(self): + """repr_text must show the path field, not a separate name.""" + dr = _make_dr(path="part*.csv") + text = repr(dr) + assert "part*.csv" in text + + def test_dir_path_shown(self): + dr = _make_dr(path="year=2024/") + text = repr(dr) + assert "year=2024/" in text + + +# --------------------------------------------------------------------------- +# repr_html tests +# --------------------------------------------------------------------------- + + +class TestReprHtml: + def test_returns_string(self): + dr = _make_dr() + html = dr._repr_html_() + assert isinstance(html, str) + assert len(html) > 0 + + def test_contains_path(self): + dr = _make_dr(path="my_dataset.parquet") + html = dr._repr_html_() + assert "my_dataset.parquet" in html + + def test_contains_glob_path(self): + dr = _make_dr(path="part*.parquet") + html = dr._repr_html_() + assert "part*.parquet" in html + + def test_contains_dir_path(self): + dr = _make_dr(path="year=2024/") + html = dr._repr_html_() + assert "year=2024/" in html + + def test_contains_format_badge(self): + dr = _make_dr(fmt="parquet") + html = dr._repr_html_() 
+ assert "parquet" in html + + def test_contains_modality_badge(self): + dr = _make_dr(modality="tabular") + html = dr._repr_html_() + assert "tabular" in html + + def test_contains_file_count(self): + dr = _make_dr(file_count=7) + html = dr._repr_html_() + assert "7" in html + + def test_contains_size(self): + dr = _make_dr(total_size=2048) + html = dr._repr_html_() + assert "KB" in html or "B" in html + + def test_schema_dict_rendered(self): + dr = _make_dr(schema={"id": "int64", "name": "string"}) + html = dr._repr_html_() + assert "id" in html + assert "int64" in html + + def test_schema_list_of_dicts_rendered(self): + dr = _make_dr( + schema=[ + {"name": "id", "type": "integer"}, + {"name": "val", "type": "number"}, + ] + ) + html = dr._repr_html_() + assert "id" in html + assert "integer" in html + + def test_schema_empty_no_details(self): + dr = _make_dr(schema={}) + html = dr._repr_html_() + assert "Schema" not in html + + def test_no_preview_section_without_sample_path(self): + dr = _make_dr(sample_path="") + html = dr._repr_html_() + assert "Preview" not in html + + def test_layout_badge_shown_for_hive(self): + dr = _make_dr(layout="hive") + html = dr._repr_html_() + assert "hive" in html + + def test_layout_badge_hidden_for_flat(self): + dr = _make_dr(layout="flat") + html = dr._repr_html_() + assert 'ps-badge-gray">flat<' not in html + + def test_html_structure(self): + dr = _make_dr() + html = dr._repr_html_() + assert "ps-data-card" in html + assert "ps-data-card-header" in html + assert "ps-data-meta" in html + + def test_icon_present_for_known_modality(self): + dr = _make_dr(modality="image") + html = dr._repr_html_() + # Image icon is 🖼 (🖼) + assert "🖼" in html + + def test_icon_fallback_for_unknown_modality(self): + dr = _make_dr(modality="") + html = dr._repr_html_() + # Fallback icon 🗂 + assert "🗂" in html + + def test_large_schema_collapsed(self): + schema = {f"col_{i}": "int64" for i in range(20)} + dr = _make_dr(schema=schema) + html = 
dr._repr_html_() + # details element should NOT have open attribute when >8 fields + assert ( + "
    ' in html + ) + + def test_small_schema_open(self): + schema = {f"col_{i}": "int64" for i in range(4)} + dr = _make_dr(schema=schema) + html = dr._repr_html_() + assert "
    with a dataframe class + assert "dataframe" in html or "ps-df-wrap" in html + + def test_csv_preview_row_limit(self, tmp_path): + """Only _PREVIEW_ROWS rows of data should appear, not all 50.""" + pytest.importorskip("pandas") + import pandas as pd + + path = str(tmp_path / "big.csv") + pd.DataFrame({"v": range(50)}).to_csv(path, index=False) + dr = self._dr_for_file(path, "csv", "tabular") + html = dr._repr_html_() + # Extract just the preview section so CSS text doesn't interfere + preview_start = html.find('
    ') + assert preview_start != -1, "no preview section found" + preview_html = html[preview_start:] + # The last row value (49) should not appear as a table cell + assert "49" not in preview_html + + def test_parquet_preview(self, tmp_path): + pytest.importorskip("pyarrow") + import pyarrow as pa + import pyarrow.parquet as pq + + path = str(tmp_path / "data.parquet") + table = pa.table({"a": [1, 2, 3], "b": ["x", "y", "z"]}) + pq.write_table(table, path) + dr = self._dr_for_file(path, "parquet", "tabular") + html = dr._repr_html_() + assert "Preview" in html + assert " 1 MB threshold + np.save(path, np.zeros((512, 512), dtype="float64")) + dr = self._dr_for_file(path, "numpy", "array") + html = dr._repr_html_() + assert "(512, 512)" in html # shape shown + assert "float64" in html # dtype shown + # The data slice key ("preview") should NOT appear in the info table; + # check the table cell content rather than the CSS class names + assert ">preview<" not in html # no preview row + + +# --------------------------------------------------------------------------- +# fmt_size helper +# --------------------------------------------------------------------------- + + +def test_fmt_size(): + from projspec.content.data_html import _fmt_size + + assert _fmt_size(0) == "unknown" + assert _fmt_size(512) == "512 B" + assert "KB" in _fmt_size(2048) + assert "MB" in _fmt_size(2 * 1024 * 1024) + assert "GB" in _fmt_size(3 * 1024**3) diff --git a/tests/test_data_project.py b/tests/test_data_project.py new file mode 100644 index 0000000..9f71ff0 --- /dev/null +++ b/tests/test_data_project.py @@ -0,0 +1,362 @@ +import json +import os + +import pytest + +import projspec +from projspec.content.data import DataResource +from projspec.utils import from_dict + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _data_project(tmp_path): + """Return a 
projspec.Project rooted at *tmp_path* (no walk needed).""" + return projspec.Project(str(tmp_path)) + + +# --------------------------------------------------------------------------- +# Detection tests +# --------------------------------------------------------------------------- + + +class TestDataDetection: + def test_csv_detected(self, tmp_path): + (tmp_path / "data.csv").write_text("x,y\n1,2\n3,4\n") + proj = _data_project(tmp_path) + assert "data" in proj.specs + + def test_parquet_detected(self, tmp_path): + pytest.importorskip("pyarrow") + import pyarrow as pa + import pyarrow.parquet as pq + + pq.write_table(pa.table({"a": [1, 2]}), str(tmp_path / "t.parquet")) + proj = _data_project(tmp_path) + assert "data" in proj.specs + + def test_no_data_files_not_detected(self, tmp_path): + (tmp_path / "README.md").write_text("hello") + (tmp_path / "config.json").write_text("{}") + proj = _data_project(tmp_path) + assert "data" not in proj.specs + + +# --------------------------------------------------------------------------- +# Parse / DataResource field tests +# --------------------------------------------------------------------------- + + +class TestDataParse: + def test_single_csv_resource(self, tmp_path): + (tmp_path / "sales.csv").write_text("col1,col2\n1,a\n2,b\n") + proj = _data_project(tmp_path) + dr = proj.specs["data"].contents["data_resource"] + assert isinstance(dr, DataResource) + assert dr.path == "sales.csv" + assert dr.format == "csv" + assert dr.modality == "tabular" + assert dr.file_count == 1 + + def test_series_collated_to_glob_path(self, tmp_path): + """part0.csv + part1.csv → path == 'part*.csv'""" + for i in range(3): + (tmp_path / f"part{i}.csv").write_text("x\n1\n") + proj = _data_project(tmp_path) + dr = proj.specs["data"].contents["data_resource"] + assert isinstance(dr, DataResource) + assert dr.path == "part*.csv" + assert dr.file_count == 3 + + def test_distinct_csv_files_separate_resources(self, tmp_path): + """users.csv and 
orders.csv differ alphabetically → two resources.""" + (tmp_path / "users.csv").write_text("id\n1\n") + (tmp_path / "orders.csv").write_text("id\n1\n") + proj = _data_project(tmp_path) + dr_map = proj.specs["data"].contents["data_resource"] + # Two separate DataResource objects, keyed in an AttrDict + assert len(dr_map) == 2 + paths = {dr_map[k].path for k in dr_map} + assert "users.csv" in paths + assert "orders.csv" in paths + + def test_sample_path_is_full_path(self, tmp_path): + csv = tmp_path / "data.csv" + csv.write_text("x\n1\n") + proj = _data_project(tmp_path) + dr = proj.specs["data"].contents["data_resource"] + assert dr.sample_path == str(csv) + + def test_total_size_nonzero(self, tmp_path): + content = "x,y\n" + "\n".join(f"{i},{i}" for i in range(20)) + (tmp_path / "nums.csv").write_text(content) + proj = _data_project(tmp_path) + dr = proj.specs["data"].contents["data_resource"] + assert dr.total_size > 0 + + +# --------------------------------------------------------------------------- +# Serialisation: to_dict +# --------------------------------------------------------------------------- + + +class TestDataResourceToDict: + def _make_dr(self, tmp_path): + (tmp_path / "items.csv").write_text("id,val\n1,a\n2,b\n") + proj = _data_project(tmp_path) + return proj.specs["data"].contents["data_resource"] + + def test_compact_omits_klass(self, tmp_path): + dr = self._make_dr(tmp_path) + d = dr.to_dict(compact=True) + assert "klass" not in d + + def test_compact_omits_html(self, tmp_path): + """compact=True is for human/console output — _html must be absent.""" + dr = self._make_dr(tmp_path) + d = dr.to_dict(compact=True) + assert "_html" not in d + + +# --------------------------------------------------------------------------- +# Serialisation: from_dict round-trip +# --------------------------------------------------------------------------- + + +class TestDataResourceRoundTrip: + def _roundtrip(self, dr): + """Serialise to JSON and rehydrate, returning 
class TestDataResourceRoundTrip:
    """Round-trip tests: ``to_dict(compact=False)`` -> JSON text -> ``from_dict``."""

    def _roundtrip(self, dr):
        """Serialise to JSON and rehydrate, returning the new DataResource."""
        d = dr.to_dict(compact=False)
        # Go through real JSON text so a non-serialisable value fails here
        # rather than being passed along by reference.
        js = json.dumps(d)
        d2 = json.loads(js)
        return from_dict(d2, proj=dr.proj)

    def _make_dr(self, tmp_path):
        """Write ``orders.csv`` into *tmp_path* and return the ``data_resource`` entry."""
        (tmp_path / "orders.csv").write_text("order_id,amount\n1,99\n2,42\n")
        proj = _data_project(tmp_path)
        return proj.specs["data"].contents["data_resource"]

    def test_roundtrip_returns_dataresource(self, tmp_path):
        """The rehydrated object is a ``DataResource`` instance."""
        dr2 = self._roundtrip(self._make_dr(tmp_path))
        assert isinstance(dr2, DataResource)

    def test_roundtrip_preserves_path(self, tmp_path):
        """``path`` survives the round trip."""
        dr2 = self._roundtrip(self._make_dr(tmp_path))
        assert dr2.path == "orders.csv"

    def test_roundtrip_preserves_format(self, tmp_path):
        """``format`` survives the round trip."""
        dr2 = self._roundtrip(self._make_dr(tmp_path))
        assert dr2.format == "csv"

    def test_roundtrip_preserves_modality(self, tmp_path):
        """``modality`` survives the round trip."""
        dr2 = self._roundtrip(self._make_dr(tmp_path))
        assert dr2.modality == "tabular"

    def test_roundtrip_preserves_file_count(self, tmp_path):
        """``file_count`` survives the round trip."""
        dr2 = self._roundtrip(self._make_dr(tmp_path))
        assert dr2.file_count == 1

    def test_roundtrip_preserves_total_size(self, tmp_path):
        """``total_size`` on the rehydrated object equals the original's."""
        dr = self._make_dr(tmp_path)
        dr2 = self._roundtrip(dr)
        assert dr2.total_size == dr.total_size

    def test_roundtrip_preserves_schema(self, tmp_path):
        """``schema`` survives the round trip for a parquet file (skipped without pyarrow)."""
        pytest.importorskip("pyarrow")
        import pyarrow as pa
        import pyarrow.parquet as pq

        pq.write_table(
            pa.table({"col_a": [1, 2, 3], "col_b": ["x", "y", "z"]}),
            str(tmp_path / "data.parquet"),
        )
        proj = _data_project(tmp_path)
        dr = proj.specs["data"].contents["data_resource"]
        dr2 = self._roundtrip(dr)
        assert dr2.schema == dr.schema

    def test_roundtrip_html_matches_original(self, tmp_path):
        """_repr_html_() on the rehydrated object must equal the original render."""
        dr = self._make_dr(tmp_path)
        html_original = dr._repr_html_()
        dr2 = self._roundtrip(dr)
        assert dr2._repr_html_() == html_original

    def test_roundtrip_html_cached_without_rerender(self, tmp_path):
        """After from_dict the HTML is already in _html — no re-render occurs."""
        dr = self._make_dr(tmp_path)
        html_original = dr._repr_html_()
        # Use the shared helper rather than repeating the dump/load/from_dict steps.
        dr2 = self._roundtrip(dr)

        # Confirm _html is set directly on the instance (not via lazy render)
        assert (
            "_html" in dr2.__dict__
        ), "_html should be in instance __dict__ after from_dict"
        assert dr2.__dict__["_html"] == html_original

    def test_roundtrip_html_survives_missing_sample_path(self, tmp_path):
        """After rehydration, _repr_html_() must work even if sample_path
        no longer resolves (e.g. moved to a different machine)."""
        dr = self._make_dr(tmp_path)
        # Trigger render with a real file, then remove the file
        html_original = dr._repr_html_()
        os.remove(dr.sample_path)

        dr2 = self._roundtrip(dr)
        # sample_path is gone — but HTML was cached in the dict
        assert dr2._repr_html_() == html_original
class TestDataConditionalParse:
    """Tests for the 'other project types present' guard in Data.parse()."""

    # -- helpers --

    def _big_csv(self, path, rows=500):
        """Write a CSV large enough to dominate byte counts."""
        content = "id,value\n" + "\n".join(f"{i},{i * 2}" for i in range(rows))
        path.write_text(content)

    def _bare_data_spec(self, tmp_path, with_basenames=False):
        """Return a ``Data`` instance bound to a hand-assembled ``Project``.

        Both objects are created with ``__new__`` so no ``__init__`` runs;
        only the attributes set below exist on them. ``filelist`` (and
        ``basenames`` when *with_basenames* is true) are written straight
        into the project's ``__dict__``.
        NOTE(review): presumably these shadow cached properties on Project —
        confirm against the Project class.
        """
        from projspec.proj.data_dir import Data
        import fsspec

        proj = projspec.Project.__new__(projspec.Project)
        proj.fs = fsspec.filesystem("file")
        proj.url = str(tmp_path)
        # One detailed listing feeds both attributes.
        listing = proj.fs.ls(str(tmp_path), detail=True)
        if with_basenames:
            proj.__dict__["basenames"] = {
                e["name"].rsplit("/", 1)[-1]: e["name"] for e in listing
            }
        proj.__dict__["filelist"] = listing
        inst = Data.__new__(Data)
        inst.proj = proj
        return inst

    # -- pure data directories (no sentinels) --

    def test_pure_data_dir_no_sentinel(self, tmp_path):
        """No sentinel → Data always parsed regardless of byte ratios."""
        (tmp_path / "data.csv").write_text("x\n1\n")
        proj = _data_project(tmp_path)
        assert "data" in proj.specs

    def test_datapackage_companion_not_a_sentinel(self, tmp_path):
        """datapackage.json is a compatible companion — not a sentinel."""
        self._big_csv(tmp_path / "data.csv")
        (tmp_path / "datapackage.json").write_text('{"resources": []}')
        proj = _data_project(tmp_path)
        assert "data" in proj.specs

    def test_dvc_companion_not_a_sentinel(self, tmp_path):
        """catalog.yaml (IntakeCatalog / DVCRepo companion) is not a sentinel."""
        self._big_csv(tmp_path / "data.csv")
        (tmp_path / "catalog.yaml").write_text("sources: {}")
        proj = _data_project(tmp_path)
        assert "data" in proj.specs

    # -- mixed dirs where data dominates --

    def test_sentinel_present_data_majority(self, tmp_path):
        """Sentinel present but data files are majority of bytes → Data parsed."""
        self._big_csv(tmp_path / "data.csv")  # large data file
        (tmp_path / "pyproject.toml").write_text(
            "[project]\nname='x'\n"
        )  # tiny sentinel
        proj = _data_project(tmp_path)
        assert "data" in proj.specs

    def test_sentinel_present_data_majority_parquet(self, tmp_path):
        """Same as the CSV case, with a parquet file and a Cargo.toml sentinel."""
        pytest.importorskip("pyarrow")
        import pyarrow as pa
        import pyarrow.parquet as pq

        pq.write_table(
            pa.table({"x": list(range(1000)), "y": list(range(1000))}),
            str(tmp_path / "data.parquet"),
        )
        (tmp_path / "Cargo.toml").write_text('[package]\nname="x"\n')
        proj = _data_project(tmp_path)
        assert "data" in proj.specs

    # -- mixed dirs where non-data dominates --

    def test_sentinel_present_code_majority(self, tmp_path):
        """Sentinel present and code files dominate → Data spec suppressed."""
        # Large Python source file
        (tmp_path / "main.py").write_text("x = 1\n" * 5000)
        # Tiny CSV
        (tmp_path / "tiny.csv").write_text("a,b\n1,2\n")
        (tmp_path / "pyproject.toml").write_text("[project]\nname='x'\n")
        proj = _data_project(tmp_path)
        assert "data" not in proj.specs

    def test_sentinel_present_equal_split_not_majority(self, tmp_path):
        """An even code/data byte split is not a majority — Data suppressed."""
        payload = "x" * 1000
        (tmp_path / "code.py").write_text(payload)
        (tmp_path / "data.csv").write_text(payload)
        # NOTE(review): if the sentinel's own bytes count as non-data, the
        # split is slightly below 50% data rather than exactly 50/50 — confirm
        # against Data._data_bytes_majority.
        (tmp_path / "pyproject.toml").write_text("[project]\nname='x'\n")
        proj = _data_project(tmp_path)
        assert "data" not in proj.specs

    # -- helpers / unit tests for the private methods --

    def test_has_non_data_sentinels_true(self, tmp_path):
        """A pyproject.toml next to the CSV is reported as a sentinel."""
        (tmp_path / "data.csv").write_text("x\n1\n")
        (tmp_path / "pyproject.toml").write_text("")
        inst = self._bare_data_spec(tmp_path, with_basenames=True)
        assert inst._has_non_data_sentinels() is True

    def test_has_non_data_sentinels_false(self, tmp_path):
        """A directory holding only a CSV has no sentinel."""
        (tmp_path / "data.csv").write_text("x\n1\n")
        inst = self._bare_data_spec(tmp_path, with_basenames=True)
        assert inst._has_non_data_sentinels() is False

    def test_data_bytes_majority_true(self, tmp_path):
        """A large CSV beside a tiny .py file is a data majority."""
        self._big_csv(tmp_path / "data.csv")
        (tmp_path / "small.py").write_text("x = 1\n")
        inst = self._bare_data_spec(tmp_path)
        assert inst._data_bytes_majority() is True

    def test_data_bytes_majority_false(self, tmp_path):
        """A large .py file beside a tiny CSV is not a data majority."""
        (tmp_path / "main.py").write_text("x = 1\n" * 5000)
        (tmp_path / "tiny.csv").write_text("a\n1\n")
        inst = self._bare_data_spec(tmp_path)
        assert inst._data_bytes_majority() is False
      ${node.children!.map(c => renderDetailNode(c, depth + 1)).join('')}
    ` : ''; + const htmlPreview = node.htmlContent + ? (() => { + const css = ``; + const srcdoc = (css + node.htmlContent) + .replace(/&/g, '&').replace(/"/g, '"').replace(//g, '>'); + return ``; + })() + : ''; + return `
  • @@ -627,6 +669,7 @@ function getDetailsWebviewContent(projectBasename: string, projectUrl: string, p ${canMake ? `` : ''} ${hasInfoPopup ? `` : ''}
    + ${htmlPreview} ${childrenHtml}
  • `; } @@ -726,6 +769,17 @@ function getDetailsWebviewContent(projectBasename: string, projectUrl: string, p .artifact-node { color: #ce9178; } .field-node { color: var(--vscode-foreground); } + .html-preview { + display: block; + width: 100%; + border: none; + margin-top: 4px; + margin-left: 20px; + /* height is set by the resize observer in JS */ + min-height: 40px; + max-height: 600px; + } + .info-button { width: 20px; height: 20px; border-radius: 50%; background: var(--vscode-button-background);