"
+ )
+
+
+# ---------------------------------------------------------------------------
+# Schema rendering
+# ---------------------------------------------------------------------------
+
+
def _render_schema(schema: dict | list) -> str:
    """Render *schema* as a collapsible HTML ``<details>`` block.

    Accepts either a mapping (``{column: dtype}`` or a structural dict such
    as ``{"variables": [...], ...}``), a list of dicts (frictionless-style
    field descriptors), or a plain list of names. Empty input renders as "".

    NOTE(review): the exact markup (tags, CSS classes) was lost when this
    patch was extracted; the structure below is reconstructed from the
    surviving text ("Field" / "Type / Value" headers, the details/summary
    wording, the ``open_attr`` logic) — confirm against the original output.
    """
    if not schema:
        return ""

    if isinstance(schema, dict):
        # Tabular-style {col: dtype} or structural {"variables": [...], ...}
        rows = "".join(
            f"<tr><td>{_esc(k)}</td><td>{_esc(v)}</td></tr>"
            for k, v in schema.items()
        )
        table = (
            '<table class="projspec-schema">'
            "<thead><tr><th>Field</th><th>Type / Value</th></tr></thead>"
            f"<tbody>{rows}</tbody>"
            "</table>"
        )
        n = len(schema)
        # Auto-expand small schemas; collapse big ones behind the summary.
        open_attr = "open" if n <= 8 else ""
        return (
            f"<details {open_attr}>"
            f'<summary>Schema ({n} {"field" if n == 1 else "fields"})</summary>'
            f"{table}</details>"
        )

    if isinstance(schema, list):
        # List-of-dicts (frictionless style) or plain list
        if schema and isinstance(schema[0], dict):
            # Render each dict as a row; use union of all keys (first-seen
            # order) as columns.
            all_keys: list[str] = []
            for item in schema:
                for k in item:
                    if k not in all_keys:
                        all_keys.append(k)
            header_row = "".join(f"<th>{_esc(k)}</th>" for k in all_keys)
            body_rows = ""
            for item in schema:
                cells = "".join(
                    f"<td>{_esc(item.get(k, ''))}</td>" for k in all_keys
                )
                body_rows += f"<tr>{cells}</tr>"
            table = (
                '<table class="projspec-schema">'
                f"<thead><tr>{header_row}</tr></thead>"
                f"<tbody>{body_rows}</tbody>"
                "</table>"
            )
        else:
            items_html = "".join(f"<li>{_esc(s)}</li>" for s in schema)
            table = f"<ul>{items_html}</ul>"

        n = len(schema)
        open_attr = "open" if n <= 8 else ""
        return (
            f"<details {open_attr}>"
            f'<summary>Schema ({n} {"field" if n == 1 else "fields"})</summary>'
            f"{table}</details>"
        )

    return ""
+
+
+# ---------------------------------------------------------------------------
+# Preview builders — one function per modality family, all return HTML str
+# or None when no loader is available.
+# ---------------------------------------------------------------------------
+
#: How many rows to show in tabular previews.
#: Shared by every tabular/array preview helper so all formats truncate alike.
_PREVIEW_ROWS = 5
+
+
+def _obj_to_preview_html(obj) -> str:
+ """Return the richest HTML string available for *obj*.
+
+ Tries ``_repr_html_()`` first (pandas DataFrame, polars DataFrame, xarray
+ Dataset, …), then falls back to ``__repr__``. The result is always
+ wrapped in a ``
`` so callers can rely on valid HTML.
+ """
+ if hasattr(obj, "_repr_html_"):
+ try:
+ h = obj._repr_html_()
+ if h:
+ return f'
{h}
'
+ except Exception:
+ pass
+ return f'
{_esc(repr(obj))}
'
+
+
+def _build_preview(dr: "DataResource") -> str | None:
+ """Return an HTML preview fragment, or None if not possible."""
+ fmt = dr.format
+ modality = dr.modality
+ sample = dr.sample_path if dr.sample_path else None
+
+ if sample is None:
+ return None
+
+ if modality == "tabular":
+ return _preview_tabular(dr, sample)
+ if modality == "image":
+ return _preview_image(dr, sample)
+ if modality == "array":
+ return _preview_array(dr, sample)
+ if modality == "timeseries" and fmt in ("wav", "flac", "mp3", "ogg"):
+ return _preview_audio(dr, sample)
+ return None
+
+
+# --- tabular ---
+
+
+def _preview_tabular(dr: "DataResource", path: str) -> str | None:
+ fmt = dr.format
+ fs = dr.proj.fs
+
+ try:
+ if fmt == "parquet":
+ return _preview_parquet(fs, path)
+ if fmt == "csv":
+ return _preview_csv(fs, path)
+ if fmt in ("tsv", "psv"):
+ sep = "\t" if fmt == "tsv" else "|"
+ return _preview_csv(fs, path, sep=sep)
+ if fmt == "arrow":
+ return _preview_arrow(fs, path)
+ if fmt == "jsonlines":
+ return _preview_jsonlines(fs, path)
+ if fmt == "excel":
+ return _preview_excel(fs, path)
+ if fmt in ("sqlite", "duckdb"):
+ return _preview_sql(fs, path, fmt)
+ if fmt == "orc":
+ return _preview_orc(fs, path)
+ except Exception:
+ pass
+ return None
+
+
def _preview_parquet(fs, path: str) -> str | None:
    """Preview a parquet file by reading only its first row group.

    Tries pyarrow first, then polars; returns None when neither is
    installed. Neither path scans the whole file.
    """
    try:
        import pyarrow.parquet as pq

        with fs.open(path, "rb") as handle:
            # read_row_group reads one row group's pages, not the whole file
            head = pq.ParquetFile(handle).read_row_group(0)
            if head.num_rows > _PREVIEW_ROWS:
                head = head.slice(0, _PREVIEW_ROWS)
            # Convert to pandas so we get _repr_html_() for free
            frame = head.to_pandas()
        return _obj_to_preview_html(frame)
    except ImportError:
        pass
    try:
        # polars can read a row-count-limited slice without decoding the rest
        import polars as pl

        with fs.open(path, "rb") as handle:
            frame = pl.read_parquet(handle, n_rows=_PREVIEW_ROWS)
        return _obj_to_preview_html(frame)
    except ImportError:
        pass
    return None
+
+
def _preview_csv(fs, path: str, sep: str = ",") -> str | None:
    """Preview the first _PREVIEW_ROWS rows of a delimited text file.

    pandas' ``nrows=`` (or polars' ``n_rows=``) stops parsing after N data
    lines, keeping I/O minimal. Returns None when neither library exists.
    """
    try:
        import pandas as pd

        with fs.open(path, "r", encoding="utf-8", errors="replace") as handle:
            frame = pd.read_csv(handle, sep=sep, nrows=_PREVIEW_ROWS)
        return _obj_to_preview_html(frame)
    except ImportError:
        pass
    try:
        import polars as pl

        with fs.open(path, "rb") as handle:
            frame = pl.read_csv(handle, n_rows=_PREVIEW_ROWS, separator=sep)
        return _obj_to_preview_html(frame)
    except ImportError:
        pass
    return None
+
+
def _preview_arrow(fs, path: str) -> str | None:
    """Preview an Arrow IPC file via its first record batch only.

    Handles both the random-access *file* format and, as a fallback after a
    rewind, the sequential *stream* format. Returns None without pyarrow.
    """
    try:
        import pyarrow.ipc as ipc

        with fs.open(path, "rb") as handle:
            try:
                # IPC file format: random-access; read just batch 0
                batch = ipc.open_file(handle).get_batch(0)
            except Exception:
                handle.seek(0)
                # IPC stream format: sequential; read just the first batch
                batch = ipc.open_stream(handle).read_next_batch()
            if batch.num_rows > _PREVIEW_ROWS:
                batch = batch.slice(0, _PREVIEW_ROWS)
            frame = batch.to_pandas()
        return _obj_to_preview_html(frame)
    except ImportError:
        pass
    return None
+
+
def _preview_jsonlines(fs, path: str) -> str | None:
    """Preview the first _PREVIEW_ROWS records of a JSON-lines file."""
    try:
        import pandas as pd

        # nrows= makes pandas stop reading after N lines
        with fs.open(path, "r", encoding="utf-8", errors="replace") as handle:
            frame = pd.read_json(handle, lines=True, nrows=_PREVIEW_ROWS)
        return _obj_to_preview_html(frame)
    except ImportError:
        return None
+
+
def _preview_excel(fs, path: str) -> str | None:
    """Preview the first _PREVIEW_ROWS rows of the first sheet.

    Requires pandas plus an Excel engine (e.g. openpyxl); a missing
    library surfaces as ImportError and yields None.
    """
    try:
        import pandas as pd

        # nrows= limits how many rows pandas materialises from the sheet
        with fs.open(path, "rb") as handle:
            frame = pd.read_excel(handle, nrows=_PREVIEW_ROWS)
        return _obj_to_preview_html(frame)
    except ImportError:
        return None
+
+
+def _preview_sql(fs, path: str, fmt: str) -> str | None:
+ # SQLite/DuckDB: only works with a local path (not a remote FS)
+ try:
+ if getattr(fs, "protocol", "file") not in ("file", "local", ""):
+ return None
+ if fmt == "duckdb":
+ try:
+ import duckdb
+
+ con = duckdb.connect(path, read_only=True)
+ tables = con.execute("SHOW TABLES").fetchall()
+ if not tables:
+ return None
+ tname = tables[0][0]
+ df = con.execute(
+ f'SELECT * FROM "{tname}" LIMIT {_PREVIEW_ROWS}'
+ ).fetchdf()
+ return _obj_to_preview_html(df)
+ except ImportError:
+ pass
+ else:
+ import sqlite3
+ import pandas as pd
+
+ con = sqlite3.connect(path)
+ cur = con.cursor()
+ cur.execute("SELECT name FROM sqlite_master WHERE type='table'")
+ tables = cur.fetchall()
+ if not tables:
+ return None
+ tname = tables[0][0]
+ df = pd.read_sql(f'SELECT * FROM "{tname}" LIMIT {_PREVIEW_ROWS}', con)
+ return _obj_to_preview_html(df)
+ except Exception:
+ pass
+ return None
+
+
def _preview_orc(fs, path: str) -> str | None:
    """Preview the first _PREVIEW_ROWS rows of an ORC file via pyarrow."""
    try:
        import pyarrow.orc as orc

        with fs.open(path, "rb") as handle:
            head = orc.ORCFile(handle).read().slice(0, _PREVIEW_ROWS)
            frame = head.to_pandas()
        return _obj_to_preview_html(frame)
    except ImportError:
        return None
+
+
+# --- image ---
+
+
+def _preview_image(dr: "DataResource", path: str) -> str | None:
+ try:
+ from PIL import Image
+ import io
+
+ fs = dr.proj.fs
+ with fs.open(path, "rb") as fh:
+ raw: bytes = fh.read()
+
+ img = Image.open(io.BytesIO(raw))
+ img.thumbnail((600, 200))
+
+ buf = io.BytesIO()
+ # Save as PNG for lossless display regardless of source format
+ rgb = img.convert("RGB") if img.mode not in ("RGB", "L", "RGBA") else img
+ rgb.save(buf, format="PNG")
+ b64 = base64.b64encode(buf.getvalue()).decode("ascii")
+
+ w, h = img.size
+ schema = dr.schema if isinstance(dr.schema, dict) else {}
+ info = f"{schema.get('width', w)}×{schema.get('height', h)}"
+ if "mode" in schema:
+ info += f", mode={schema['mode']}"
+
+ return (
+ f'
'
+
+
def _preview_numpy(fs, path: str) -> str | None:
    """Describe a ``.npy`` file from its header; load data only when tiny.

    Fixes over the previous version: the header is parsed directly from the
    open stream instead of a fixed 512-byte prefix (headers for large
    structured dtypes can exceed 512 bytes), and the header parser is chosen
    from the magic version (2.0 headers allow >64 KiB and were previously
    mis-parsed as 1.0).

    The full array is loaded only when it fits a 1 MB heuristic, and only
    its first ``_PREVIEW_ROWS`` rows are rendered. Returns None on any
    failure.
    """
    try:
        import io

        import numpy as np
        import numpy.lib.format as nf

        with fs.open(path, "rb") as fh:
            version = nf.read_magic(fh)
            # Dispatch on the header version written in the magic bytes.
            if version >= (2, 0):
                shape, _, dtype = nf.read_array_header_2_0(fh)
            else:
                shape, _, dtype = nf.read_array_header_1_0(fh)

        info: dict = {"shape": str(shape), "dtype": str(dtype)}

        # Load the full array only when it's small enough (≤ 1 MB heuristic).
        try:
            total_elements = 1
            for dim in shape:
                total_elements *= dim
            if total_elements * np.dtype(dtype).itemsize <= 1_048_576:
                with fs.open(path, "rb") as fh:
                    arr = np.load(io.BytesIO(fh.read()), allow_pickle=False)
                sliced = arr[:_PREVIEW_ROWS] if arr.ndim >= 1 else arr
                info["preview"] = repr(sliced)
        except Exception:
            pass  # the data preview is optional; keep the header info

        return _array_info_html(info)
    except Exception:
        pass
    return None
+
+
def _preview_hdf5(fs, path: str) -> str | None:
    """Summarise an HDF5 file from metadata only — no array data loaded.

    Lists up to 8 top-level keys and describes the first 3 (dataset shape/
    dtype, or group member count). Returns None when h5py is unavailable.
    """
    try:
        import h5py

        with fs.open(path, "rb") as fh, h5py.File(fh, "r") as f:
            keys = list(f.keys())[:8]
            details: dict = {"top-level keys": ", ".join(keys) or "(none)"}
            for key in keys[:3]:
                node = f[key]
                if hasattr(node, "shape"):
                    details[key] = f"shape={node.shape}, dtype={node.dtype}"
                else:
                    details[key] = f"group ({len(node)} members)"
            return _array_info_html(details)
    except ImportError:
        return None
+
+
def _preview_netcdf(fs, path: str) -> str | None:
    """Render a NetCDF dataset's rich repr without decoding array data.

    NOTE(review): the repr is produced while the file handle is still open,
    since xarray's scipy engine reads lazily from the handle.
    """
    try:
        import xarray as xr

        with fs.open(path, "rb") as fh:
            # engine="scipy" opens lazily; no array payload is decoded here
            ds = xr.open_dataset(fh, engine="scipy")
            # xarray Dataset has a rich _repr_html_()
            return _obj_to_preview_html(ds)
    except ImportError:
        return None
+
+
+def _preview_zarr(dr: "DataResource") -> str | None:
+ """Use the schema cached at parse time — zero extra I/O."""
+ schema = dr.schema
+ if not schema or not isinstance(schema, dict):
+ return None
+ info = {}
+ if "arrays" in schema:
+ info["arrays"] = ", ".join(str(a) for a in schema["arrays"][:8]) or "(none)"
+ if "groups" in schema:
+ info["groups"] = ", ".join(str(g) for g in schema["groups"][:8]) or "(none)"
+ if "attrs" in schema:
+ info["attrs"] = str(dict(list(schema["attrs"].items())[:4]))
+ return _array_info_html(info) if info else None
+
+
+# --- audio ---
+
+
def _preview_audio(dr: "DataResource", path: str) -> str | None:
    """Describe an audio file from its header only — no samples loaded.

    Returns None when the soundfile library is unavailable.
    """
    try:
        import soundfile as sf

        with dr.proj.fs.open(path, "rb") as fh:
            meta = sf.info(fh)
        details = {
            "sample rate": f"{meta.samplerate} Hz",
            "channels": str(meta.channels),
            "duration": f"{meta.frames / meta.samplerate:.2f} s",
            "format": meta.format,
            "subtype": meta.subtype,
        }
        return _array_info_html(details)
    except ImportError:
        return None
diff --git a/src/projspec/content/environment.py b/src/projspec/content/environment.py
index 1b727a4..e3fe674 100644
--- a/src/projspec/content/environment.py
+++ b/src/projspec/content/environment.py
@@ -79,11 +79,14 @@ def match(self) -> bool:
def parse(self) -> None:
import yaml
+ from projspec.artifact.python_env import CondaEnv
- u = self.proj.basenames.get(
- "environment.yaml", self.proj.basenames.get("environment.yml")
+ u = (
+ "environment.yaml"
+ if "environment.yaml" in self.proj.basenames
+ else "environment.yml"
)
- deps = yaml.safe_load(self.proj.fs.open(u, "rt"))
+ deps = yaml.safe_load(self.proj.get_file(u, text=True))
# TODO: split out pip deps
self.contents["environment"] = Environment(
stack=Stack.CONDA,
@@ -92,3 +95,6 @@ def parse(self) -> None:
channels=deps.get("channels"),
proj=self.proj,
)
+ self.artifacts["conda_env"] = CondaEnv(
+ proj=self.proj, fn=u, cmd=["conda", "env", "create", "-f", u]
+ )
diff --git a/src/projspec/proj/__init__.py b/src/projspec/proj/__init__.py
index b52d2da..929fb17 100644
--- a/src/projspec/proj/__init__.py
+++ b/src/projspec/proj/__init__.py
@@ -7,6 +7,7 @@
from projspec.proj.briefcase import Briefcase
from projspec.proj.conda_package import CondaRecipe, RattlerRecipe
from projspec.proj.conda_project import CondaProject
+from projspec.proj.data_dir import Data
from projspec.proj.datapackage import DataPackage, DVCRepo
from projspec.proj.documentation import RTD, MDBook
from projspec.proj.git import GitRepo
@@ -36,6 +37,7 @@
"Zenodo",
"CondaRecipe",
"CondaProject",
+ "Data",
"Golang",
"GitRepo",
"HelmChart",
diff --git a/src/projspec/proj/base.py b/src/projspec/proj/base.py
index 2d26c26..d519885 100644
--- a/src/projspec/proj/base.py
+++ b/src/projspec/proj/base.py
@@ -287,7 +287,7 @@ def pyproject(self):
def all_artifacts(self, names: str | None = None) -> list:
"""A flat list of all the artifact objects nested in this project."""
- arts = list(self.artifacts.values())
+ arts = list()
for spec in self.specs.values():
arts.extend(flatten(spec.artifacts))
for child in self.children.values():
diff --git a/src/projspec/proj/data_dir.py b/src/projspec/proj/data_dir.py
new file mode 100644
index 0000000..405fa60
--- /dev/null
+++ b/src/projspec/proj/data_dir.py
@@ -0,0 +1,757 @@
+"""ProjectSpec for bare data directories.
+
+Matches directories whose contents are predominantly data files (by extension or
+by a recognised on-disk layout such as Hive partitioning, Apache Iceberg, Delta
+Lake, or Zarr), with no requirement for any declarative metadata file.
+"""
+
+from __future__ import annotations
+
+import os
+import re
+from posixpath import basename as _basename
+
+from projspec.proj import ProjectSpec, ParseFailed
+from projspec.utils import AttrDict
+
+# ---------------------------------------------------------------------------
+# Extension → (canonical format name, modality)
+#
+# Modality vocabulary from intake's `structure` tags + napari's layer types:
+# "tabular" — row/column data
+# "array" — N-dimensional arrays
+# "image" — 2-D/3-D images (raster)
+# "timeseries" — time-indexed signals
+# "geospatial" — vector or raster geodata
+# "model" — ML model weights / configs
+# "nested" — hierarchical / JSON-like
+# "document" — human-readable documents
+# "video" — video streams
+# "archive" — compressed bundles
+#
+# .json is excluded — too common in non-data contexts (configs, manifests).
+# ---------------------------------------------------------------------------
#: Maps a lowercase file extension (with leading dot) to its canonical
#: (format name, modality) pair. Consumers must lowercase extensions before
#: lookup (see _fmt_from_path).
_EXT_TO_FORMAT: dict[str, tuple[str, str]] = {
    # Tabular / columnar -------------------------------------------------------
    ".csv": ("csv", "tabular"),
    ".tsv": ("tsv", "tabular"),
    ".psv": ("psv", "tabular"),
    ".parquet": ("parquet", "tabular"),
    ".parq": ("parquet", "tabular"),
    ".pq": ("parquet", "tabular"),
    ".arrow": ("arrow", "tabular"),
    ".ipc": ("arrow", "tabular"),
    ".feather": ("arrow", "tabular"),  # Feather v1/v2 (magic: FEA1 / ARROW1)
    ".orc": ("orc", "tabular"),
    ".avro": ("avro", "tabular"),
    ".xls": ("excel", "tabular"),
    ".xlsx": ("excel", "tabular"),
    ".xlsm": ("excel", "tabular"),
    ".xlsb": ("excel", "tabular"),
    ".jsonl": ("jsonlines", "tabular"),
    ".ndjson": ("jsonlines", "tabular"),
    ".db": ("sqlite", "tabular"),  # DuckDB / SQLite (disambiguated by magic)
    ".sqlite": ("sqlite", "tabular"),
    ".sqlitedb": ("sqlite", "tabular"),
    ".duckdb": ("duckdb", "tabular"),
    # Array / scientific -------------------------------------------------------
    ".npy": ("numpy", "array"),
    ".npz": ("numpy", "array"),
    ".hdf5": ("hdf5", "array"),
    ".hdf": ("hdf5", "array"),
    ".h5": ("hdf5", "array"),
    ".h4": ("hdf5", "array"),
    ".he5": ("hdf5", "array"),
    ".nc": ("netcdf", "array"),
    ".nc3": ("netcdf", "array"),
    ".nc4": ("netcdf", "array"),
    ".mat": ("matlab", "array"),
    ".fits": ("fits", "array"),
    ".grib": ("grib", "timeseries"),
    ".grb": ("grib", "timeseries"),
    ".grib2": ("grib", "timeseries"),
    ".grb2": ("grib", "timeseries"),
    ".asdf": ("asdf", "array"),
    ".zarr": ("zarr", "array"),
    # Image / biomedical imaging -----------------------------------------------
    ".png": ("png", "image"),
    ".jpg": ("jpeg", "image"),
    ".jpeg": ("jpeg", "image"),
    ".tif": ("tiff", "image"),  # also geotiff — ambiguous; image wins
    ".tiff": ("tiff", "image"),
    ".cog": ("tiff", "geospatial"),  # Cloud-Optimised GeoTIFF
    ".bmp": ("bmp", "image"),
    ".gif": ("gif", "image"),
    ".webp": ("webp", "image"),
    ".dcm": ("dicom", "image"),
    ".dicom": ("dicom", "image"),
    ".nii": ("nifti", "image"),
    ".nrrd": ("nrrd", "image"),
    ".nhdr": ("nrrd", "image"),
    ".mha": ("metaimage", "image"),
    ".mhd": ("metaimage", "image"),
    ".svs": ("svs", "image"),  # Aperio whole-slide image
    ".ndpi": ("ndpi", "image"),  # Hamamatsu whole-slide image
    ".scn": ("scn", "image"),  # Leica whole-slide image
    ".lsm": ("lsm", "image"),  # Zeiss confocal
    ".exr": ("exr", "image"),  # OpenEXR HDR
    ".qptiff": ("qptiff", "image"),  # PerkinElmer whole-slide
    # Geospatial ---------------------------------------------------------------
    ".shp": ("shapefile", "geospatial"),
    ".shx": ("shapefile", "geospatial"),
    ".dbf": ("shapefile", "geospatial"),
    ".geojson": ("geojson", "geospatial"),
    ".gpkg": ("geopackage", "geospatial"),
    ".fgb": ("flatgeobuf", "geospatial"),
    ".kml": ("kml", "geospatial"),
    ".pmtiles": ("pmtiles", "geospatial"),
    # Audio --------------------------------------------------------------------
    ".wav": ("wav", "timeseries"),
    ".flac": ("flac", "timeseries"),
    ".mp3": ("mp3", "timeseries"),
    ".ogg": ("ogg", "timeseries"),
    # Video --------------------------------------------------------------------
    ".mp4": ("mp4", "video"),
    ".avi": ("avi", "video"),
    ".mov": ("mov", "video"),
    ".mkv": ("mkv", "video"),
    ".webm": ("webm", "video"),
    # ML model weights ---------------------------------------------------------
    ".safetensors": ("safetensors", "model"),
    ".gguf": ("gguf", "model"),
    ".pt": ("pytorch", "model"),
    ".pth": ("pytorch", "model"),
    ".onnx": ("onnx", "model"),
    # NOTE(review): ".tfrecord" is the other common TFRecord extension —
    # confirm whether it should be listed alongside ".tfrec".
    ".tfrec": ("tfrecord", "model"),
    # Archive / bundle ---------------------------------------------------------
    ".pkl": ("pickle", "archive"),
    ".bin": ("binary", "archive"),
}

#: All recognised data extensions, as a frozenset for O(1) membership tests.
_DATA_EXTENSIONS: frozenset[str] = frozenset(_EXT_TO_FORMAT)
+
+# ---------------------------------------------------------------------------
+# Magic-byte signatures (format, modality, offset, bytes_pattern).
+#
+# Each entry: (format_str, modality_str, offset, pattern)
+# offset = int → match at that fixed byte offset
+# offset = None → scan anywhere in the first 1 KiB (re.search)
+#
+# Ordered from most-specific to least-specific (longer / more-offset patterns
+# first so they shadow shorter ones that match the same header).
+# ---------------------------------------------------------------------------
#: Magic-byte signatures, checked in order by _identify_by_magic.
#: Each entry is (format, modality, offset, pattern); offset None means
#: "scan anywhere in the first 1 KiB", an int means "match at that offset".
_MAGIC: list[tuple[str, str, int | None, bytes]] = [
    # Fixed-offset signatures
    ("dicom", "image", 128, b"DICM"),  # DICOM preamble
    ("nifti", "image", 344, b"ni1\x00"),  # NIfTI-1
    ("nifti", "image", 344, b"n+1\x00"),  # NIfTI-1 single file
    ("duckdb", "tabular", 8, b"DUCK"),
    ("safetensors", "model", 8, b"{"),  # SafeTensors JSON header
    ("wav", "timeseries", 8, b"WAVE"),  # RIFF…WAVE
    # Offset-0 signatures
    ("parquet", "tabular", 0, b"PAR1"),
    ("hdf5", "array", 0, b"\x89HDF"),
    ("netcdf", "array", 0, b"CDF\x01"),  # NetCDF classic
    ("netcdf", "array", 0, b"CDF\x02"),  # NetCDF-64bit
    ("orc", "tabular", 0, b"ORC"),
    ("avro", "tabular", 0, b"Obj\x01"),
    # NOTE(review): "ARROW1" is the Arrow IPC *file* format magic; the
    # stream format has no leading magic — confirm the comment's intent.
    ("arrow", "tabular", 0, b"ARROW1"),  # IPC stream
    ("arrow", "tabular", 0, b"FEA1"),  # Feather v1
    ("numpy", "array", 0, b"\x93NUMPY"),
    ("matlab", "array", 0, b"MATLAB"),
    ("fits", "array", 0, b"SIMPLE"),
    ("grib", "timeseries", 0, b"GRIB"),
    ("asdf", "array", 0, b"#ASDF"),
    ("flatgeobuf", "geospatial", 0, b"fgb"),
    ("gguf", "model", 0, b"GGUF"),
    ("png", "image", 0, b"\x89PNG"),
    ("jpeg", "image", 0, b"\xff\xd8\xff"),
    ("tiff", "image", 0, b"II*\x00"),  # little-endian TIFF
    ("tiff", "image", 0, b"MM\x00*"),  # big-endian TIFF
    ("sqlite", "tabular", 0, b"SQLite format"),
    ("shapefile", "geospatial", 0, b"\x00\x00\x27\x0a"),
    ("pmtiles", "geospatial", 0, b"PMTiles"),
]

# Regex that matches Hive-style partition directory names (e.g. "year=2024").
_HIVE_DIR_RE = re.compile(r"^[^=]+=.+$")
+
+
+# ---------------------------------------------------------------------------
+# Schema extraction helpers — all imports inside try/except ImportError so
+# that missing optional libraries never block parsing.
+# ---------------------------------------------------------------------------
+
+
def _read_schema(path: str, fmt: str, fs) -> dict | list:
    """Return a best-effort schema dict/list for *path*, or {} on any failure.

    Parameters
    ----------
    path:
        Location of the data file, opened through *fs*.
    fmt:
        Canonical format name (a value from ``_EXT_TO_FORMAT``).
    fs:
        fsspec-like filesystem exposing ``open()``.

    Every third-party reader is imported lazily inside its own branch; a
    missing library or any read error yields ``{}`` — schema extraction must
    never abort project parsing.
    """
    try:
        if fmt == "parquet":
            try:
                import pyarrow.parquet as pq

                # Only footer metadata is read; no row groups are decoded.
                with fs.open(path, "rb") as fh:
                    pf = pq.ParquetFile(fh)
                    return {field.name: str(field.type) for field in pf.schema_arrow}
            except ImportError:
                pass

        elif fmt == "arrow":
            try:
                import pyarrow.ipc as ipc

                with fs.open(path, "rb") as fh:
                    reader = ipc.open_file(fh)
                    return {field.name: str(field.type) for field in reader.schema}
            except ImportError:
                pass

        elif fmt == "hdf5":
            try:
                import h5py

                with fs.open(path, "rb") as fh:
                    with h5py.File(fh, "r") as ds:
                        return {
                            "variables": list(ds.keys()),
                            "attrs": dict(ds.attrs),
                        }
            except ImportError:
                pass

        elif fmt == "netcdf":
            try:
                import netCDF4 as nc  # type: ignore[import]

                # NOTE(review): fh.read() pulls the whole file into memory —
                # fine for small files, confirm acceptable for large stores.
                with fs.open(path, "rb") as fh:
                    ds = nc.Dataset("in-mem", memory=fh.read())
                    return {
                        "variables": list(ds.variables.keys()),
                        "dims": {k: len(v) for k, v in ds.dimensions.items()},
                    }
            except ImportError:
                # Fall back to xarray's lazy scipy engine when netCDF4 is absent.
                try:
                    import xarray as xr  # type: ignore[import]

                    with fs.open(path, "rb") as fh:
                        ds = xr.open_dataset(fh, engine="scipy")
                        # NOTE(review): Dataset.dims warns on recent xarray;
                        # ds.sizes is the forward-compatible spelling.
                        return {
                            "variables": list(ds.data_vars),
                            "dims": dict(ds.dims),
                        }
                except ImportError:
                    pass

        elif fmt in ("jpeg", "png", "bmp", "gif", "webp", "tiff"):
            try:
                from PIL import Image  # type: ignore[import]

                with fs.open(path, "rb") as fh:
                    img = Image.open(fh)
                    # Force full decode while the handle is still open.
                    img.load()
                    mode = img.mode
                    channels = len(img.getbands())
                    return {
                        "width": img.width,
                        "height": img.height,
                        "channels": channels,
                        "mode": mode,
                    }
            except ImportError:
                pass

        elif fmt in ("wav", "flac", "mp3", "ogg"):
            try:
                import soundfile as sf  # type: ignore[import]

                # Header-only read: no sample data is decoded.
                with fs.open(path, "rb") as fh:
                    info = sf.info(fh)
                    return {
                        "sample_rate": info.samplerate,
                        "channels": info.channels,
                        "frames": info.frames,
                    }
            except ImportError:
                pass

    except Exception:  # — never let schema extraction abort parsing
        pass

    return {}
+
+
+# ---------------------------------------------------------------------------
+# Helpers that work on the already-loaded filelist / basenames
+# ---------------------------------------------------------------------------
+
+
+def _filelist_dirs(filelist: list[dict]) -> list[dict]:
+ """Return only directory entries from a filelist."""
+ return [e for e in filelist if e.get("type", "") == "directory"]
+
+
+def _filelist_files(filelist: list[dict]) -> list[dict]:
+ """Return only file entries from a filelist."""
+ return [e for e in filelist if e.get("type", "") != "directory"]
+
+
def _fmt_from_path(path: str) -> tuple[str, str] | None:
    """Map *path*'s extension to (format, modality); None when unrecognised."""
    _, ext = os.path.splitext(path)
    return _EXT_TO_FORMAT.get(ext.lower())
+
+
def _identify_by_magic(path: str, fs) -> tuple[str, str] | None:
    """Return (format, modality) by probing *path*'s header bytes, or None.

    Reads up to 1 KiB. Entries in _MAGIC with an integer offset are matched
    at that fixed position; entries with offset None are searched anywhere
    in the header. _MAGIC's ordering (most-specific first) decides ties.
    """
    try:
        with fs.open(path, "rb") as fh:
            head = fh.read(1024)
    except Exception:
        return None

    for fmt, modality, offset, pattern in _MAGIC:
        if offset is None:
            matched = re.search(re.escape(pattern), head) is not None
        else:
            matched = head[offset : offset + len(pattern)] == pattern
        if matched:
            return fmt, modality
    return None
+
+
# Token that may vary across files in a series: digits, dashes, underscores, dots.
# Alphabetic variation (e.g. "users" vs "orders") disqualifies collation.
# Anchored ^...$: the *entire* variable middle segment extracted by
# _group_by_naming_series must be numeric/date-like.
_SERIES_VAR_RE = re.compile(r"^[\d\-_.]+$")
+
+
+def _common_affix(stems: list[str]) -> tuple[str, str]:
+ """Return the longest (prefix, suffix) shared by every stem in *stems*."""
+ if not stems:
+ return "", ""
+ prefix = os.path.commonprefix(stems)
+ # Reverse each stem to find common suffix via commonprefix trick
+ rev = [s[::-1] for s in stems]
+ suffix = os.path.commonprefix(rev)[::-1]
+ # Ensure prefix and suffix don't overlap (can happen with a single-char stem)
+ if len(prefix) + len(suffix) > min(len(s) for s in stems):
+ suffix = ""
+ return prefix, suffix
+
+
def _group_by_naming_series(entries: list[dict]) -> list[list[dict]]:
    """Partition same-format file *entries* into naming-series groups.

    Files collate into a single series when their basename stems share a
    non-trivial prefix or suffix and differ only in a segment made of
    digits, dashes, underscores, or dots (a counter or date component).
    A single file is trivially its own series; otherwise each file becomes
    a separate group.
    """
    if not entries:
        return []
    if len(entries) == 1:
        return [entries]

    stems = [os.path.splitext(_basename(entry["name"]))[0] for entry in entries]

    prefix, suffix = _common_affix(stems)
    plen, slen = len(prefix), len(suffix)

    # The variable middle of each stem, once the shared affixes are removed.
    middles = [
        stem[plen : len(stem) - slen] if slen else stem[plen:] for stem in stems
    ]

    # One series requires (1) a non-trivial shared prefix OR suffix, and
    # (2) every middle segment being numeric/date-like (or empty).
    affix_present = plen >= 1 or slen >= 1
    numeric_middles = all(mid == "" or _SERIES_VAR_RE.match(mid) for mid in middles)
    if affix_present and numeric_middles:
        return [entries]

    # Fallback: every file stands alone as its own resource.
    return [[entry] for entry in entries]
+
+
+# ---------------------------------------------------------------------------
+# Data spec
+# ---------------------------------------------------------------------------
+
+# Sentinel files / directories whose presence indicates a non-data project
+# type is also present in this directory. When any of these are found,
+# Data.parse() applies the byte-majority test instead of parsing
+# unconditionally.
+#
+# Notably absent: datapackage.json, catalog.yaml/yml, .dvc/ — those belong
+# to projspec.proj.datapackage and are treated as compatible companions.
#: Basenames whose presence marks a directory as some other project type;
#: see the module comment above for why datapackage/catalog files are absent.
_NON_DATA_SENTINELS: frozenset[str] = frozenset(
    {
        # Python
        "pyproject.toml",
        "setup.py",
        "setup.cfg",
        "hatch.toml",
        # Rust
        "Cargo.toml",
        # JavaScript / Node
        "package.json",
        # Go
        "go.mod",
        # Container / infra
        "Dockerfile",
        "docker-compose.yml",
        "docker-compose.yaml",
        # Helm
        "Chart.yaml",
        # Ruby / Java / .NET
        "Gemfile",
        "pom.xml",
        "build.gradle",
        # NOTE(review): unlike the other literal basenames this entry is a
        # glob pattern — confirm every consumer of this set handles patterns.
        "*.csproj",
        # R
        "DESCRIPTION",
        # Conda
        "environment.yml",
        "environment.yaml",
        "meta.yaml",
        # Pixi
        "pixi.toml",
        # Mkdocs / Sphinx / RTD
        "mkdocs.yml",
        "mkdocs.yaml",
        "conf.py",
        ".readthedocs.yaml",
        ".readthedocs.yml",
        # Scripts / notebooks that imply code-first dirs
        "Makefile",
    }
)
+
+
+class Data(ProjectSpec):
+ """A directory whose primary contents are data files.
+
+ Matches on any of:
+ - At least one file with an unambiguous data extension (CSV, Parquet, Arrow,
+ HDF5, images, audio, etc.) — without requiring a metadata sidecar.
+ - A recognised directory layout: Hive partitioning (``key=value/`` subdirs),
+ Apache Iceberg (``metadata/`` directory), Delta Lake (``_delta_log/``), or
+ a Zarr store (``.zattrs`` / ``.zgroup`` at the root).
+
+ Parsing behaviour
+ -----------------
+ If no non-datapackage project signals are present in the directory the spec
+ parses unconditionally. If sentinel files that indicate another project type
+ (``pyproject.toml``, ``Cargo.toml``, ``package.json``, …) are found, parsing
+ succeeds only when the majority of bytes in the root file listing belong to
+ recognised data files; otherwise ``ParseFailed`` is raised so that the
+ directory is not double-counted as both a code project and a data project.
+ """
+
+ spec_doc = "https://opencode.ai/docs" # placeholder — no single upstream spec
+
+ # ------------------------------------------------------------------
+ # match()
+ # ------------------------------------------------------------------
+
+ def match(self) -> bool:
+ # Fast path: structural layout signals (no file-content inspection needed)
+ if self._detect_layout():
+ return True
+ # Slow path: any top-level file with an unambiguous data extension
+ return any(
+ os.path.splitext(name)[1].lower() in _DATA_EXTENSIONS
+ for name in self.proj.basenames
+ )
+
+ # ------------------------------------------------------------------
+ # parse()
+ # ------------------------------------------------------------------
+
    def parse(self) -> None:
        """Build the ``data_resource`` content entry for this directory.

        Picks a parsing strategy from the detected layout (hive / iceberg /
        delta / zarr / flat) and stores either a single resource or an
        AttrDict of resources keyed by sanitised path.

        Raises
        ------
        ParseFailed
            When another project type's sentinels dominate the directory,
            or when no recognisable data files are found.
        """
        # NOTE(review): DataResource is not referenced in this body —
        # confirm the import is needed (helpers below construct resources).
        from projspec.content.data import (
            DataResource,
        )  # local import keeps startup fast

        # If non-datapackage project sentinels are present, only keep this
        # spec when data files account for the majority of bytes at the root.
        if self._has_non_data_sentinels():
            if not self._data_bytes_majority():
                raise ParseFailed(
                    "Non-data project sentinels found and data files are not "
                    "the majority of bytes — skipping Data spec"
                )

        layout = self._detect_layout()
        resources: list

        if layout in ("hive", "iceberg", "delta"):
            resources = self._parse_layout_dirs(layout)
            # Delta/Iceberg also commonly store data files at the root level
            # alongside the log/metadata directory; collect those too.
            if layout in ("iceberg", "delta"):
                root_resources = self._parse_flat()
                resources = resources + root_resources
        elif layout in ("zarr_store", "tiledarray"):
            resources = [self._parse_zarr_root()]
        else:
            resources = self._parse_flat()

        if not resources:
            raise ParseFailed("No recognisable data files found")

        if len(resources) == 1:
            self._contents["data_resource"] = resources[0]
        else:
            # _safe_key is defined elsewhere in this module (beyond this view).
            self._contents["data_resource"] = AttrDict(
                {_safe_key(r.path): r for r in resources}
            )
+
+ # ------------------------------------------------------------------
+ # Sentinel / byte-majority helpers
+ # ------------------------------------------------------------------
+
+ def _has_non_data_sentinels(self) -> bool:
+ """Return True if any non-datapackage project sentinel is present."""
+ basenames = self.proj.basenames
+ return any(name in _NON_DATA_SENTINELS for name in basenames)
+
+ def _data_bytes_majority(self) -> bool:
+ """Return True if data files account for >50 % of root-listing bytes.
+
+ Files with unknown / zero size are excluded from both totals so they
+ do not unfairly skew the ratio.
+ """
+ total_bytes = 0
+ data_bytes = 0
+ for entry in self.proj.filelist:
+ size = entry.get("size") or 0
+ if size <= 0:
+ continue
+ total_bytes += size
+ ext = os.path.splitext(entry["name"].rsplit("/", 1)[-1])[1].lower()
+ if ext in _DATA_EXTENSIONS:
+ data_bytes += size
+ if total_bytes == 0:
+ return False
+ return data_bytes > total_bytes / 2
+
+ # ------------------------------------------------------------------
+ # Layout detection
+ # ------------------------------------------------------------------
+
+ def _detect_layout(self) -> str:
+ """Return a layout string, or '' if none of the known layouts match.
+
+ Uses the `contains` sentinel approach from intake: certain well-known
+ files/directories at the root identify a directory as a logical dataset.
+ """
+ basenames = self.proj.basenames
+ # Zarr store: .zattrs, .zgroup, or zarr.json at the root
+ # (zarr.json is the Zarr v3 sentinel; .zattrs/.zgroup are v2)
+ if any(s in basenames for s in (".zattrs", ".zgroup", "zarr.json")):
+ return "zarr_store"
+ dir_names = {_basename(e["name"]) for e in _filelist_dirs(self.proj.filelist)}
+ # Delta Lake
+ if "_delta_log" in dir_names:
+ return "delta"
+ # TileDB array directory
+ if "__meta" in dir_names and "__schema" in dir_names:
+ return "tiledarray"
+ # Apache Iceberg: metadata/ directory present
+ if "metadata" in dir_names:
+ return "iceberg"
+ # Partitioned Parquet: _metadata sentinel file at root (written by Spark/Dask)
+ if "_metadata" in basenames:
+ return "iceberg"
+ # Hive: any top-level subdirectory whose name matches key=value
+ if any(_HIVE_DIR_RE.match(d) for d in dir_names):
+ return "hive"
+ return ""
+
+ # ------------------------------------------------------------------
+ # Parsing helpers
+ # ------------------------------------------------------------------
+
+ def _resource_from_entries(
+ self, entries: list[dict], fmt: str, modality: str, layout: str
+ ):
+ """Build a DataResource from a list of same-format file entries.
+
+ The ``path`` field is set to:
+
+ - Single file: the bare basename, e.g. ``"data.csv"``.
+ - Multi-file series: a glob pattern, e.g. ``"part*.csv"``, built from
+ the shared prefix/suffix of the basenames.
+ """
+ from projspec.content.data import DataResource
+
+ full_paths = [e["name"] for e in entries]
+ total_size = sum(e.get("size", 0) or 0 for e in entries)
+ sample_path = full_paths[0] if full_paths else ""
+ schema = _read_schema(sample_path, fmt, self.proj.fs) if sample_path else {}
+
+ ext = os.path.splitext(_basename(full_paths[0]))[1] if full_paths else ""
+
+ if len(entries) == 1:
+ path = _basename(full_paths[0]) or fmt
+ else:
+ stems = [os.path.splitext(_basename(p))[0] for p in full_paths]
+ prefix, suffix = _common_affix(stems)
+ stem_pattern = (prefix.rstrip("-_.") or fmt) + "*" + suffix
+ path = stem_pattern + ext
+
+ return DataResource(
+ proj=self.proj,
+ path=path,
+ format=fmt,
+ modality=modality,
+ layout=layout,
+ file_count=len(entries),
+ total_size=total_size,
+ schema=schema,
+ sample_path=sample_path,
+ )
+
+ def _parse_flat(self) -> list:
+ """Group top-level files by format and naming series.
+
+ Files of the same format are only collated into a single DataResource
+ when they share a consistent naming schema — i.e. their stems differ
+ only in a numeric or date-like segment (e.g. ``part0.csv``,
+ ``part1.csv`` or ``2024-02.tiff``, ``2024-03.tiff``). Files whose
+ stems vary in alphabetic content (e.g. ``users.csv``, ``orders.csv``)
+ each become their own DataResource.
+ """
+ from projspec.content.data import (
+ DataResource,
+ ) # (used via _resource_from_entries)
+
+ # First bucket by (fmt, modality)
+ fmt_groups: dict[tuple[str, str], list[dict]] = {}
+ for entry in _filelist_files(self.proj.filelist):
+ fmt_info = _fmt_from_path(entry["name"])
+ if fmt_info is None:
+ continue
+ fmt_groups.setdefault(fmt_info, []).append(entry)
+
+ resources = []
+ for (fmt, modality), entries in fmt_groups.items():
+ # Split each format-group into naming series
+ for series in _group_by_naming_series(entries):
+ resources.append(
+ self._resource_from_entries(series, fmt, modality, "flat")
+ )
+ return resources
+
+ def _parse_layout_dirs(self, layout: str) -> list:
+ """One DataResource per top-level subdirectory (partition / table root).
+
+ Within each subdirectory the dominant format is determined, then files
+ are checked for a consistent naming series before collating.
+ """
+ dir_entries = _filelist_dirs(self.proj.filelist)
+ resources = []
+ for dir_entry in dir_entries:
+ dir_path = dir_entry["name"]
+ dir_name = _basename(dir_path)
+ # Skip hidden/internal dirs for iceberg/delta
+ if layout in ("iceberg", "delta") and dir_name.startswith(
+ ("metadata", "_delta_log", "_")
+ ):
+ continue
+ # Enumerate files one level inside this subdirectory
+ try:
+ sub_filelist = self.proj.fs.ls(dir_path, detail=True)
+ except Exception:
+ continue
+
+ sub_files = _filelist_files(sub_filelist)
+ # Determine dominant (fmt, modality) by file count
+ fmt_counts: dict[tuple[str, str], int] = {}
+ for e in sub_files:
+ fmt_info = _fmt_from_path(e["name"])
+ if fmt_info:
+ fmt_counts[fmt_info] = fmt_counts.get(fmt_info, 0) + 1
+ if not fmt_counts:
+ continue
+ dominant = max(fmt_counts, key=lambda k: fmt_counts[k])
+ dominant_fmt, dominant_modality = dominant
+ dominant_files = [
+ e for e in sub_files if _fmt_from_path(e["name"]) == dominant
+ ]
+ resource = self._resource_from_entries(
+ dominant_files, dominant_fmt, dominant_modality, layout
+ )
+ # Override path with the directory basename + trailing slash
+ # (partition dirs are already logically grouped by the directory)
+ resource.path = dir_name + "/"
+ resources.append(resource)
+ return resources
+
+ def _parse_zarr_root(self):
+ """Describe the whole directory as a single array-store resource.
+
+ Used for Zarr stores and TileDB arrays — both are directory-as-dataset
+ layouts with no individual data files at the root.
+ """
+ from projspec.content.data import DataResource
+
+ url = self.proj.url
+ layout = self._detect_layout()
+ # TileDB directories are not Zarr; distinguish the format accordingly
+ if layout == "tiledarray":
+ fmt, modality = "tiledb", "array"
+ schema: dict | list = {}
+ else:
+ fmt, modality = "zarr", "array"
+ schema = {}
+ try:
+ import zarr # type: ignore[import]
+
+ store = zarr.open(url, mode="r")
+ schema = {
+ "arrays": list(store.array_keys()),
+ "groups": list(store.group_keys()),
+ "attrs": dict(store.attrs),
+ }
+ except (ImportError, Exception):
+ pass
+
+ total_size = sum(
+ e.get("size", 0) or 0 for e in _filelist_files(self.proj.filelist)
+ )
+ return DataResource(
+ proj=self.proj,
+ path=(_basename(url) or fmt) + "/",
+ format=fmt,
+ modality=modality,
+ layout=layout,
+ file_count=len(_filelist_files(self.proj.filelist)),
+ total_size=total_size,
+ schema=schema,
+ sample_path="",
+ )
+
+
+# ---------------------------------------------------------------------------
+# Utilities
+# ---------------------------------------------------------------------------
+
+
+def _safe_key(name: str) -> str:
+ """Convert an arbitrary name to a valid Python identifier for AttrDict keys."""
+ key = re.sub(r"[^0-9a-zA-Z_]", "_", name)
+ if key and key[0].isdigit():
+ key = "_" + key
+ return key or "_unnamed"
diff --git a/tests/test_data_html.py b/tests/test_data_html.py
new file mode 100644
index 0000000..2d6e6ea
--- /dev/null
+++ b/tests/test_data_html.py
@@ -0,0 +1,449 @@
+"""Tests for projspec.content.data_html — repr_text and repr_html.
+
+These tests use a mock DataResource to avoid needing real data files on disk
+for basic formatting checks, then run format-specific loader tests when the
+required optional libraries are available.
+"""
+
+from __future__ import annotations
+
+import io
+import os
+import tempfile
+from unittest.mock import MagicMock
+
+import pytest
+
+import projspec
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _make_dr(
+ path="mytable.parquet",
+ fmt="parquet",
+ modality="tabular",
+ layout="flat",
+ file_count=3,
+ total_size=1024 * 512,
+ schema=None,
+ sample_path="",
+ metadata=None,
+):
+ """Build a DataResource backed by a real Project (the repo root) but with
+ controlled field values."""
+ from projspec.content.data import DataResource
+
+ mock_proj = MagicMock(spec=projspec.Project)
+ # Use a real local filesystem via fsspec
+ import fsspec
+
+ mock_proj.fs = fsspec.filesystem("file")
+ mock_proj.url = "/tmp"
+
+ return DataResource(
+ proj=mock_proj,
+ path=path,
+ format=fmt,
+ modality=modality,
+ layout=layout,
+ file_count=file_count,
+ total_size=total_size,
+ schema=schema or {},
+ sample_path=sample_path,
+ metadata=metadata or {},
+ )
+
+
+# ---------------------------------------------------------------------------
+# repr_text tests
+# ---------------------------------------------------------------------------
+
+
+class TestReprText:
+ def test_basic_fields_present(self):
+ dr = _make_dr()
+ text = repr(dr)
+ assert "mytable.parquet" in text
+ assert "parquet" in text
+ assert "tabular" in text
+ assert "files=3" in text
+
+ def test_size_formatting(self):
+ dr = _make_dr(total_size=1024)
+ text = repr(dr)
+ assert "KB" in text or "B" in text
+
+ def test_size_zero(self):
+ dr = _make_dr(total_size=0)
+ text = repr(dr)
+ assert "unknown" in text
+
+ def test_schema_hint_dict(self):
+ dr = _make_dr(schema={"col_a": "int64", "col_b": "float32", "col_c": "str"})
+ text = repr(dr)
+ assert "col_a" in text
+
+ def test_schema_hint_many_fields(self):
+ schema = {f"col_{i}": "int64" for i in range(10)}
+ dr = _make_dr(schema=schema)
+ text = repr(dr)
+ assert "+7 more" in text
+
+ def test_schema_hint_list(self):
+ dr = _make_dr(schema=[{"name": "a"}, {"name": "b"}])
+ text = repr(dr)
+ assert "2 fields" in text
+
+ def test_non_flat_layout_shown(self):
+ dr = _make_dr(layout="hive")
+ text = repr(dr)
+ assert "hive" in text
+
+ def test_flat_layout_hidden(self):
+ dr = _make_dr(layout="flat")
+ text = repr(dr)
+ assert "layout" not in text
+
+ def test_no_modality(self):
+ dr = _make_dr(modality="")
+ text = repr(dr)
+ assert "modality" not in text
+
+ def test_single_line(self):
+ dr = _make_dr()
+ text = repr(dr)
+ assert "\n" not in text
+
+ def test_path_shown(self):
+ """repr_text must show the path field, not a separate name."""
+ dr = _make_dr(path="part*.csv")
+ text = repr(dr)
+ assert "part*.csv" in text
+
+ def test_dir_path_shown(self):
+ dr = _make_dr(path="year=2024/")
+ text = repr(dr)
+ assert "year=2024/" in text
+
+
+# ---------------------------------------------------------------------------
+# repr_html tests
+# ---------------------------------------------------------------------------
+
+
+class TestReprHtml:
+ def test_returns_string(self):
+ dr = _make_dr()
+ html = dr._repr_html_()
+ assert isinstance(html, str)
+ assert len(html) > 0
+
+ def test_contains_path(self):
+ dr = _make_dr(path="my_dataset.parquet")
+ html = dr._repr_html_()
+ assert "my_dataset.parquet" in html
+
+ def test_contains_glob_path(self):
+ dr = _make_dr(path="part*.parquet")
+ html = dr._repr_html_()
+ assert "part*.parquet" in html
+
+ def test_contains_dir_path(self):
+ dr = _make_dr(path="year=2024/")
+ html = dr._repr_html_()
+ assert "year=2024/" in html
+
+ def test_contains_format_badge(self):
+ dr = _make_dr(fmt="parquet")
+ html = dr._repr_html_()
+ assert "parquet" in html
+
+ def test_contains_modality_badge(self):
+ dr = _make_dr(modality="tabular")
+ html = dr._repr_html_()
+ assert "tabular" in html
+
+ def test_contains_file_count(self):
+ dr = _make_dr(file_count=7)
+ html = dr._repr_html_()
+ assert "7" in html
+
+ def test_contains_size(self):
+ dr = _make_dr(total_size=2048)
+ html = dr._repr_html_()
+ assert "KB" in html or "B" in html
+
+ def test_schema_dict_rendered(self):
+ dr = _make_dr(schema={"id": "int64", "name": "string"})
+ html = dr._repr_html_()
+ assert "id" in html
+ assert "int64" in html
+
+ def test_schema_list_of_dicts_rendered(self):
+ dr = _make_dr(
+ schema=[
+ {"name": "id", "type": "integer"},
+ {"name": "val", "type": "number"},
+ ]
+ )
+ html = dr._repr_html_()
+ assert "id" in html
+ assert "integer" in html
+
+ def test_schema_empty_no_details(self):
+ dr = _make_dr(schema={})
+ html = dr._repr_html_()
+ assert "Schema" not in html
+
+ def test_no_preview_section_without_sample_path(self):
+ dr = _make_dr(sample_path="")
+ html = dr._repr_html_()
+ assert "Preview" not in html
+
+ def test_layout_badge_shown_for_hive(self):
+ dr = _make_dr(layout="hive")
+ html = dr._repr_html_()
+ assert "hive" in html
+
+ def test_layout_badge_hidden_for_flat(self):
+ dr = _make_dr(layout="flat")
+ html = dr._repr_html_()
+ assert 'ps-badge-gray">flat<' not in html
+
+ def test_html_structure(self):
+ dr = _make_dr()
+ html = dr._repr_html_()
+ assert "ps-data-card" in html
+ assert "ps-data-card-header" in html
+ assert "ps-data-meta" in html
+
+ def test_icon_present_for_known_modality(self):
+ dr = _make_dr(modality="image")
+ html = dr._repr_html_()
+ # Image icon is 🖼 (🖼)
+ assert "🖼" in html
+
+ def test_icon_fallback_for_unknown_modality(self):
+ dr = _make_dr(modality="")
+ html = dr._repr_html_()
+ # Fallback icon 🗂
+ assert "🗂" in html
+
+ def test_large_schema_collapsed(self):
+ schema = {f"col_{i}": "int64" for i in range(20)}
+ dr = _make_dr(schema=schema)
+ html = dr._repr_html_()
+ # details element should NOT have open attribute when >8 fields
+        assert (
+            '<details class="ps-schema" open>' not in html
+        )
+
+ def test_small_schema_open(self):
+ schema = {f"col_{i}": "int64" for i in range(4)}
+ dr = _make_dr(schema=schema)
+ html = dr._repr_html_()
+ assert " with a dataframe class
+ assert "dataframe" in html or "ps-df-wrap" in html
+
+ def test_csv_preview_row_limit(self, tmp_path):
+ """Only _PREVIEW_ROWS rows of data should appear, not all 50."""
+ pytest.importorskip("pandas")
+ import pandas as pd
+
+ path = str(tmp_path / "big.csv")
+ pd.DataFrame({"v": range(50)}).to_csv(path, index=False)
+ dr = self._dr_for_file(path, "csv", "tabular")
+ html = dr._repr_html_()
+ # Extract just the preview section so CSS text doesn't interfere
+        preview_start = html.find('<details class="ps-preview"')
+        assert preview_start != -1, "no preview section found"
+        preview_html = html[preview_start:]
+        # The last row value (49) should not appear as a table cell
+        assert "<td>49</td>" not in preview_html
+
+ def test_parquet_preview(self, tmp_path):
+ pytest.importorskip("pyarrow")
+ import pyarrow as pa
+ import pyarrow.parquet as pq
+
+ path = str(tmp_path / "data.parquet")
+ table = pa.table({"a": [1, 2, 3], "b": ["x", "y", "z"]})
+ pq.write_table(table, path)
+ dr = self._dr_for_file(path, "parquet", "tabular")
+ html = dr._repr_html_()
+ assert "Preview" in html
+ assert "
1 MB threshold
+ np.save(path, np.zeros((512, 512), dtype="float64"))
+ dr = self._dr_for_file(path, "numpy", "array")
+ html = dr._repr_html_()
+ assert "(512, 512)" in html # shape shown
+ assert "float64" in html # dtype shown
+ # The data slice key ("preview") should NOT appear in the info table;
+ # check the table cell content rather than the CSS class names
+ assert ">preview<" not in html # no
preview
row
+
+
+# ---------------------------------------------------------------------------
+# fmt_size helper
+# ---------------------------------------------------------------------------
+
+
+def test_fmt_size():
+ from projspec.content.data_html import _fmt_size
+
+ assert _fmt_size(0) == "unknown"
+ assert _fmt_size(512) == "512 B"
+ assert "KB" in _fmt_size(2048)
+ assert "MB" in _fmt_size(2 * 1024 * 1024)
+ assert "GB" in _fmt_size(3 * 1024**3)
diff --git a/tests/test_data_project.py b/tests/test_data_project.py
new file mode 100644
index 0000000..9f71ff0
--- /dev/null
+++ b/tests/test_data_project.py
@@ -0,0 +1,362 @@
+import json
+import os
+
+import pytest
+
+import projspec
+from projspec.content.data import DataResource
+from projspec.utils import from_dict
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _data_project(tmp_path):
+ """Return a projspec.Project rooted at *tmp_path* (no walk needed)."""
+ return projspec.Project(str(tmp_path))
+
+
+# ---------------------------------------------------------------------------
+# Detection tests
+# ---------------------------------------------------------------------------
+
+
+class TestDataDetection:
+ def test_csv_detected(self, tmp_path):
+ (tmp_path / "data.csv").write_text("x,y\n1,2\n3,4\n")
+ proj = _data_project(tmp_path)
+ assert "data" in proj.specs
+
+ def test_parquet_detected(self, tmp_path):
+ pytest.importorskip("pyarrow")
+ import pyarrow as pa
+ import pyarrow.parquet as pq
+
+ pq.write_table(pa.table({"a": [1, 2]}), str(tmp_path / "t.parquet"))
+ proj = _data_project(tmp_path)
+ assert "data" in proj.specs
+
+ def test_no_data_files_not_detected(self, tmp_path):
+ (tmp_path / "README.md").write_text("hello")
+ (tmp_path / "config.json").write_text("{}")
+ proj = _data_project(tmp_path)
+ assert "data" not in proj.specs
+
+
+# ---------------------------------------------------------------------------
+# Parse / DataResource field tests
+# ---------------------------------------------------------------------------
+
+
+class TestDataParse:
+ def test_single_csv_resource(self, tmp_path):
+ (tmp_path / "sales.csv").write_text("col1,col2\n1,a\n2,b\n")
+ proj = _data_project(tmp_path)
+ dr = proj.specs["data"].contents["data_resource"]
+ assert isinstance(dr, DataResource)
+ assert dr.path == "sales.csv"
+ assert dr.format == "csv"
+ assert dr.modality == "tabular"
+ assert dr.file_count == 1
+
+ def test_series_collated_to_glob_path(self, tmp_path):
+ """part0.csv + part1.csv → path == 'part*.csv'"""
+ for i in range(3):
+ (tmp_path / f"part{i}.csv").write_text("x\n1\n")
+ proj = _data_project(tmp_path)
+ dr = proj.specs["data"].contents["data_resource"]
+ assert isinstance(dr, DataResource)
+ assert dr.path == "part*.csv"
+ assert dr.file_count == 3
+
+ def test_distinct_csv_files_separate_resources(self, tmp_path):
+ """users.csv and orders.csv differ alphabetically → two resources."""
+ (tmp_path / "users.csv").write_text("id\n1\n")
+ (tmp_path / "orders.csv").write_text("id\n1\n")
+ proj = _data_project(tmp_path)
+ dr_map = proj.specs["data"].contents["data_resource"]
+ # Two separate DataResource objects, keyed in an AttrDict
+ assert len(dr_map) == 2
+ paths = {dr_map[k].path for k in dr_map}
+ assert "users.csv" in paths
+ assert "orders.csv" in paths
+
+ def test_sample_path_is_full_path(self, tmp_path):
+ csv = tmp_path / "data.csv"
+ csv.write_text("x\n1\n")
+ proj = _data_project(tmp_path)
+ dr = proj.specs["data"].contents["data_resource"]
+ assert dr.sample_path == str(csv)
+
+ def test_total_size_nonzero(self, tmp_path):
+ content = "x,y\n" + "\n".join(f"{i},{i}" for i in range(20))
+ (tmp_path / "nums.csv").write_text(content)
+ proj = _data_project(tmp_path)
+ dr = proj.specs["data"].contents["data_resource"]
+ assert dr.total_size > 0
+
+
+# ---------------------------------------------------------------------------
+# Serialisation: to_dict
+# ---------------------------------------------------------------------------
+
+
+class TestDataResourceToDict:
+ def _make_dr(self, tmp_path):
+ (tmp_path / "items.csv").write_text("id,val\n1,a\n2,b\n")
+ proj = _data_project(tmp_path)
+ return proj.specs["data"].contents["data_resource"]
+
+ def test_compact_omits_klass(self, tmp_path):
+ dr = self._make_dr(tmp_path)
+ d = dr.to_dict(compact=True)
+ assert "klass" not in d
+
+ def test_compact_omits_html(self, tmp_path):
+ """compact=True is for human/console output — _html must be absent."""
+ dr = self._make_dr(tmp_path)
+ d = dr.to_dict(compact=True)
+ assert "_html" not in d
+
+
+# ---------------------------------------------------------------------------
+# Serialisation: from_dict round-trip
+# ---------------------------------------------------------------------------
+
+
+class TestDataResourceRoundTrip:
+ def _roundtrip(self, dr):
+ """Serialise to JSON and rehydrate, returning the new DataResource."""
+ d = dr.to_dict(compact=False)
+ js = json.dumps(d)
+ d2 = json.loads(js)
+ return from_dict(d2, proj=dr.proj)
+
+ def _make_dr(self, tmp_path):
+ (tmp_path / "orders.csv").write_text("order_id,amount\n1,99\n2,42\n")
+ proj = _data_project(tmp_path)
+ return proj.specs["data"].contents["data_resource"]
+
+ def test_roundtrip_returns_dataresource(self, tmp_path):
+ dr2 = self._roundtrip(self._make_dr(tmp_path))
+ assert isinstance(dr2, DataResource)
+
+ def test_roundtrip_preserves_path(self, tmp_path):
+ dr2 = self._roundtrip(self._make_dr(tmp_path))
+ assert dr2.path == "orders.csv"
+
+ def test_roundtrip_preserves_format(self, tmp_path):
+ dr2 = self._roundtrip(self._make_dr(tmp_path))
+ assert dr2.format == "csv"
+
+ def test_roundtrip_preserves_modality(self, tmp_path):
+ dr2 = self._roundtrip(self._make_dr(tmp_path))
+ assert dr2.modality == "tabular"
+
+ def test_roundtrip_preserves_file_count(self, tmp_path):
+ dr2 = self._roundtrip(self._make_dr(tmp_path))
+ assert dr2.file_count == 1
+
+ def test_roundtrip_preserves_total_size(self, tmp_path):
+ dr = self._make_dr(tmp_path)
+ dr2 = self._roundtrip(dr)
+ assert dr2.total_size == dr.total_size
+
+ def test_roundtrip_preserves_schema(self, tmp_path):
+ pytest.importorskip("pyarrow")
+ import pyarrow as pa, pyarrow.parquet as pq
+
+ pq.write_table(
+ pa.table({"col_a": [1, 2, 3], "col_b": ["x", "y", "z"]}),
+ str(tmp_path / "data.parquet"),
+ )
+ proj = _data_project(tmp_path)
+ dr = proj.specs["data"].contents["data_resource"]
+ dr2 = self._roundtrip(dr)
+ assert dr2.schema == dr.schema
+
+ def test_roundtrip_html_matches_original(self, tmp_path):
+ """_repr_html_() on the rehydrated object must equal the original render."""
+ dr = self._make_dr(tmp_path)
+ html_original = dr._repr_html_()
+ dr2 = self._roundtrip(dr)
+ assert dr2._repr_html_() == html_original
+
+ def test_roundtrip_html_cached_without_rerender(self, tmp_path):
+ """After from_dict the HTML is already in _html — no re-render occurs."""
+ dr = self._make_dr(tmp_path)
+ html_original = dr._repr_html_()
+ d = dr.to_dict(compact=False)
+ d2 = json.loads(json.dumps(d))
+ dr2 = from_dict(d2, proj=dr.proj)
+
+ # Confirm _html is set directly on the instance (not via lazy render)
+ assert (
+ "_html" in dr2.__dict__
+ ), "_html should be in instance __dict__ after from_dict"
+ assert dr2.__dict__["_html"] == html_original
+
+ def test_roundtrip_html_survives_missing_sample_path(self, tmp_path):
+ """After rehydration, _repr_html_() must work even if sample_path
+ no longer resolves (e.g. moved to a different machine)."""
+ dr = self._make_dr(tmp_path)
+ # Trigger render with a real file, then remove the file
+ html_original = dr._repr_html_()
+ os.remove(dr.sample_path)
+
+ dr2 = self._roundtrip(dr)
+ # sample_path is gone — but HTML was cached in the dict
+ assert dr2._repr_html_() == html_original
+
+
+# ---------------------------------------------------------------------------
+# Conditional parse: sentinel / byte-majority logic
+# ---------------------------------------------------------------------------
+
+
+class TestDataConditionalParse:
+ """Tests for the 'other project types present' guard in Data.parse()."""
+
+ # -- helpers --
+
+ def _big_csv(self, path, rows=500):
+ """Write a CSV large enough to dominate byte counts."""
+ content = "id,value\n" + "\n".join(f"{i},{i * 2}" for i in range(rows))
+ path.write_text(content)
+
+ # -- pure data directories (no sentinels) --
+
+ def test_pure_data_dir_no_sentinel(self, tmp_path):
+ """No sentinel → Data always parsed regardless of byte ratios."""
+ (tmp_path / "data.csv").write_text("x\n1\n")
+ proj = _data_project(tmp_path)
+ assert "data" in proj.specs
+
+ def test_datapackage_companion_not_a_sentinel(self, tmp_path):
+ """datapackage.json is a compatible companion — not a sentinel."""
+ self._big_csv(tmp_path / "data.csv")
+ (tmp_path / "datapackage.json").write_text('{"resources": []}')
+ proj = _data_project(tmp_path)
+ assert "data" in proj.specs
+
+ def test_dvc_companion_not_a_sentinel(self, tmp_path):
+ """catalog.yaml (IntakeCatalog / DVCRepo companion) is not a sentinel."""
+ self._big_csv(tmp_path / "data.csv")
+ (tmp_path / "catalog.yaml").write_text("sources: {}")
+ proj = _data_project(tmp_path)
+ assert "data" in proj.specs
+
+ # -- mixed dirs where data dominates --
+
+ def test_sentinel_present_data_majority(self, tmp_path):
+ """Sentinel present but data files are majority of bytes → Data parsed."""
+ self._big_csv(tmp_path / "data.csv") # large data file
+ (tmp_path / "pyproject.toml").write_text(
+ "[project]\nname='x'\n"
+ ) # tiny sentinel
+ proj = _data_project(tmp_path)
+ assert "data" in proj.specs
+
+ def test_sentinel_present_data_majority_parquet(self, tmp_path):
+ pytest.importorskip("pyarrow")
+ import pyarrow as pa, pyarrow.parquet as pq
+
+ pq.write_table(
+ pa.table({"x": list(range(1000)), "y": list(range(1000))}),
+ str(tmp_path / "data.parquet"),
+ )
+ (tmp_path / "Cargo.toml").write_text('[package]\nname="x"\n')
+ proj = _data_project(tmp_path)
+ assert "data" in proj.specs
+
+ # -- mixed dirs where non-data dominates --
+
+ def test_sentinel_present_code_majority(self, tmp_path):
+ """Sentinel present and code files dominate → Data spec suppressed."""
+ # Large Python source file
+ (tmp_path / "main.py").write_text("x = 1\n" * 5000)
+ # Tiny CSV
+ (tmp_path / "tiny.csv").write_text("a,b\n1,2\n")
+ (tmp_path / "pyproject.toml").write_text("[project]\nname='x'\n")
+ proj = _data_project(tmp_path)
+ assert "data" not in proj.specs
+
+ def test_sentinel_present_equal_split_not_majority(self, tmp_path):
+ """Exactly 50/50 bytes is not a majority — Data suppressed."""
+ payload = "x" * 1000
+ (tmp_path / "code.py").write_text(payload)
+ (tmp_path / "data.csv").write_text(payload)
+ (tmp_path / "pyproject.toml").write_text("[project]\nname='x'\n")
+ proj = _data_project(tmp_path)
+ assert "data" not in proj.specs
+
+ # -- helpers / unit tests for the private methods --
+
+ def test_has_non_data_sentinels_true(self, tmp_path):
+ from projspec.proj.data_dir import Data
+
+ (tmp_path / "data.csv").write_text("x\n1\n")
+ (tmp_path / "pyproject.toml").write_text("")
+ proj = projspec.Project.__new__(projspec.Project)
+ import fsspec
+
+ proj.fs = fsspec.filesystem("file")
+ proj.url = str(tmp_path)
+ proj.__dict__["basenames"] = {
+ e["name"].rsplit("/", 1)[-1]: e["name"]
+ for e in proj.fs.ls(str(tmp_path), detail=True)
+ }
+ proj.__dict__["filelist"] = proj.fs.ls(str(tmp_path), detail=True)
+ inst = Data.__new__(Data)
+ inst.proj = proj
+ assert inst._has_non_data_sentinels() is True
+
+ def test_has_non_data_sentinels_false(self, tmp_path):
+ from projspec.proj.data_dir import Data
+
+ (tmp_path / "data.csv").write_text("x\n1\n")
+ proj = projspec.Project.__new__(projspec.Project)
+ import fsspec
+
+ proj.fs = fsspec.filesystem("file")
+ proj.url = str(tmp_path)
+ proj.__dict__["basenames"] = {
+ e["name"].rsplit("/", 1)[-1]: e["name"]
+ for e in proj.fs.ls(str(tmp_path), detail=True)
+ }
+ proj.__dict__["filelist"] = proj.fs.ls(str(tmp_path), detail=True)
+ inst = Data.__new__(Data)
+ inst.proj = proj
+ assert inst._has_non_data_sentinels() is False
+
+ def test_data_bytes_majority_true(self, tmp_path):
+ from projspec.proj.data_dir import Data
+
+ self._big_csv(tmp_path / "data.csv")
+ (tmp_path / "small.py").write_text("x = 1\n")
+ proj = projspec.Project.__new__(projspec.Project)
+ import fsspec
+
+ proj.fs = fsspec.filesystem("file")
+ proj.url = str(tmp_path)
+ proj.__dict__["filelist"] = proj.fs.ls(str(tmp_path), detail=True)
+ inst = Data.__new__(Data)
+ inst.proj = proj
+ assert inst._data_bytes_majority() is True
+
+ def test_data_bytes_majority_false(self, tmp_path):
+ from projspec.proj.data_dir import Data
+
+ (tmp_path / "main.py").write_text("x = 1\n" * 5000)
+ (tmp_path / "tiny.csv").write_text("a\n1\n")
+ proj = projspec.Project.__new__(projspec.Project)
+ import fsspec
+
+ proj.fs = fsspec.filesystem("file")
+ proj.url = str(tmp_path)
+ proj.__dict__["filelist"] = proj.fs.ls(str(tmp_path), detail=True)
+ inst = Data.__new__(Data)
+ inst.proj = proj
+ assert inst._data_bytes_majority() is False
diff --git a/vsextension/src/extension.ts b/vsextension/src/extension.ts
index 13ead47..12b3476 100644
--- a/vsextension/src/extension.ts
+++ b/vsextension/src/extension.ts
@@ -406,7 +406,7 @@ function getDetailsWebviewContent(projectBasename: string, projectUrl: string, p
const infoData = getInfoData();
// Keys that are internal implementation details and add no user-facing value
- const SKIP_KEYS = new Set(['klass', 'proc', 'storage_options', 'children', 'url']);
+ const SKIP_KEYS = new Set(['klass', 'proc', 'storage_options', 'children', 'url', '_html']);
// Classify what type of colour-coding a node should get based on where it sits in the tree
type NodeRole = 'spec' | 'content' | 'artifact' | 'field' | 'none';
@@ -422,6 +422,8 @@ function getDetailsWebviewContent(projectBasename: string, projectUrl: string, p
// For info popups:
infoData?: string | null;
itemType?: string;
+ // Pre-rendered HTML from a content object's _html field
+ htmlContent?: string;
}
function escapeHtml(s: string): string {
@@ -572,6 +574,9 @@ function getDetailsWebviewContent(projectBasename: string, projectUrl: string, p
children: children.length > 0 ? children : undefined,
infoData: nodeInfoData,
itemType: role !== 'none' && role !== 'field' ? role : undefined,
+ htmlContent: (role === 'content' && value && typeof value === 'object' && !Array.isArray(value) && typeof (value as any)._html === 'string')
+ ? (value as any)._html
+ : undefined,
});
}
@@ -620,6 +625,43 @@ function getDetailsWebviewContent(projectBasename: string, projectUrl: string, p
? `