From 37f662769a9adb06e5ce2fad558c3291ac72512a Mon Sep 17 00:00:00 2001 From: prrao87 <35005448+prrao87@users.noreply.github.com> Date: Tue, 12 May 2026 10:28:46 -0700 Subject: [PATCH 1/3] docs: promote Lance datasets to a top-level Datasets tab with auto-sync from upstream cards Move Hugging Face datasets out of the Integrations > Hugging Face Hub subgroup into their own top-level Datasets tab between Demos and API Reference. Each dataset gets its own page, populated automatically from the HF_DATASET_CARD.md files in lance-format/lance-huggingface so the upstream repo remains the single source of truth. - huggingface/overview.mdx -> integrations/ai/huggingface.mdx (the integration walkthrough now lives next to the other AI-platform integration pages) - huggingface/datasets.mdx -> datasets/index.mdx (landing page + auto-generated card grid between HF_SYNC:START/HF_SYNC:END markers) - 30 per-dataset MDX pages under docs/datasets/, organized into 9 categories in the sidebar - scripts/hf_datasets.yaml: explicit mapping of upstream directory, URL slug, HF Hub repo, and display title for each dataset (the three names don't have a derivable relationship) - scripts/sync_hf_datasets.py: fetches each upstream card, rewrites frontmatter for Mintlify, strips the H1, injects a "View on Hugging Face" card, and sanitizes known MDX hazards (orphan bibtex citations, literal "<>" in prose) - Makefile: hf-sync target wires it up; pyproject adds pyyaml - Redirects in docs.json keep old /huggingface/* URLs working - README documents why the scripts exist and the maintainer workflow for adding a new dataset Co-Authored-By: Claude Opus 4.7 (1M context) --- Makefile | 9 +- README.md | 42 ++ docs/datasets/ade20k.mdx | 169 ++++++++ docs/datasets/chartqa.mdx | 114 ++++++ docs/datasets/cifar10.mdx | 174 ++++++++ docs/datasets/coco-captions-2017.mdx | 167 ++++++++ docs/datasets/coco-detection-2017.mdx | 202 +++++++++ docs/datasets/docvqa.mdx | 147 +++++++ docs/datasets/eurosat.mdx | 141 
+++++++ docs/datasets/fashion-mnist.mdx | 158 ++++++++ docs/datasets/fineweb-edu.mdx | 246 +++++++++++ docs/datasets/flickr30k.mdx | 214 ++++++++++ docs/datasets/food101.mdx | 125 ++++++ docs/datasets/gqa-testdev-balanced.mdx | 153 +++++++ docs/datasets/hotpotqa-distractor.mdx | 163 ++++++++ docs/datasets/imagenet-1k-val.mdx | 163 ++++++++ docs/datasets/index.mdx | 182 +++++++++ docs/datasets/kitti-2d-detection.mdx | 190 +++++++++ docs/datasets/laion-1m.mdx | 284 +++++++++++++ docs/datasets/lerobot-pusht.mdx | 123 ++++++ docs/datasets/lerobot-xvla-soft-fold.mdx | 276 +++++++++++++ docs/datasets/librispeech-clean.mdx | 189 +++++++++ docs/datasets/mnist.mdx | 181 +++++++++ docs/datasets/ms-marco-v2.1.mdx | 184 +++++++++ docs/datasets/natural-questions-val.mdx | 156 +++++++ docs/datasets/openvid.mdx | 382 ++++++++++++++++++ docs/datasets/oxford-pets.mdx | 119 ++++++ .../datasets/pascal-voc-2012-segmentation.mdx | 143 +++++++ docs/datasets/squad-v2.mdx | 182 +++++++++ docs/datasets/stanford-cars.mdx | 123 ++++++ docs/datasets/textvqa.mdx | 148 +++++++ docs/datasets/trivia-qa.mdx | 166 ++++++++ docs/datasets/vqav2.mdx | 210 ++++++++++ docs/docs.json | 104 ++++- docs/huggingface/datasets.mdx | 179 -------- .../ai/huggingface.mdx} | 2 +- pyproject.toml | 1 + scripts/hf_datasets.yaml | 70 ++++ scripts/sync_hf_datasets.py | 377 +++++++++++++++++ uv.lock | 2 + 40 files changed, 6169 insertions(+), 191 deletions(-) create mode 100644 docs/datasets/ade20k.mdx create mode 100644 docs/datasets/chartqa.mdx create mode 100644 docs/datasets/cifar10.mdx create mode 100644 docs/datasets/coco-captions-2017.mdx create mode 100644 docs/datasets/coco-detection-2017.mdx create mode 100644 docs/datasets/docvqa.mdx create mode 100644 docs/datasets/eurosat.mdx create mode 100644 docs/datasets/fashion-mnist.mdx create mode 100644 docs/datasets/fineweb-edu.mdx create mode 100644 docs/datasets/flickr30k.mdx create mode 100644 docs/datasets/food101.mdx create mode 100644 
docs/datasets/gqa-testdev-balanced.mdx create mode 100644 docs/datasets/hotpotqa-distractor.mdx create mode 100644 docs/datasets/imagenet-1k-val.mdx create mode 100644 docs/datasets/index.mdx create mode 100644 docs/datasets/kitti-2d-detection.mdx create mode 100644 docs/datasets/laion-1m.mdx create mode 100644 docs/datasets/lerobot-pusht.mdx create mode 100644 docs/datasets/lerobot-xvla-soft-fold.mdx create mode 100644 docs/datasets/librispeech-clean.mdx create mode 100644 docs/datasets/mnist.mdx create mode 100644 docs/datasets/ms-marco-v2.1.mdx create mode 100644 docs/datasets/natural-questions-val.mdx create mode 100644 docs/datasets/openvid.mdx create mode 100644 docs/datasets/oxford-pets.mdx create mode 100644 docs/datasets/pascal-voc-2012-segmentation.mdx create mode 100644 docs/datasets/squad-v2.mdx create mode 100644 docs/datasets/stanford-cars.mdx create mode 100644 docs/datasets/textvqa.mdx create mode 100644 docs/datasets/trivia-qa.mdx create mode 100644 docs/datasets/vqav2.mdx delete mode 100644 docs/huggingface/datasets.mdx rename docs/{huggingface/overview.mdx => integrations/ai/huggingface.mdx} (99%) create mode 100644 scripts/hf_datasets.yaml create mode 100644 scripts/sync_hf_datasets.py diff --git a/Makefile b/Makefile index e7305d25..05bb00c8 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,9 @@ # Paths SCRIPT := scripts/mdx_snippets_gen.py +HF_SYNC_SCRIPT := scripts/sync_hf_datasets.py # uv run automatically handles virtualenv, so no activation needed -.PHONY: py ts rs snippets +.PHONY: py ts rs snippets hf-sync # Generate Python MDX snippets py: @@ -19,3 +20,9 @@ rs: # Convenience: generate all snippets snippets: py ts rs +# Sync Lance dataset cards from lance-format/lance-huggingface into docs/datasets/. +# Regenerates per-dataset MDX pages, the landing-page card grid, and the +# Datasets tab in docs.json based on scripts/hf_datasets.yaml. 
+hf-sync: + @uv run $(HF_SYNC_SCRIPT) + diff --git a/README.md b/README.md index 8f65467b..8298c811 100644 --- a/README.md +++ b/README.md @@ -67,3 +67,45 @@ code that's been tested (per recent LanceDB releases) in the hands of users. > As far as possible, do not add code snippets manually inside triple-backticks! Write the tests for > the required language in `tests/*` directory, then generate the snippets programmatically via the Makefile > commands. + +## Sync Hugging Face dataset pages + +The `Datasets` tab is populated from [`lance-format/lance-huggingface`](https://github.com/lance-format/lance-huggingface), +the master repository where each Lance dataset published under the [`lance-format`](https://huggingface.co/lance-format) +Hugging Face organization has its own directory with an `HF_DATASET_CARD.md`. That same file is what gets pushed to +the Hub as the dataset's `README.md` via the `hf` CLI, so the GitHub repo is the single source of truth for the +content of every dataset card. + +To avoid maintaining the same content in two places, the per-dataset MDX pages under `docs/datasets/` are +generated from those upstream cards via `scripts/sync_hf_datasets.py`. The script: + +1. Reads `scripts/hf_datasets.yaml`, which lists every dataset to publish and maps the upstream directory name, + the URL slug, the HF Hub repo, and the human-readable title. +2. Fetches each `HF_DATASET_CARD.md` from `lance-format/lance-huggingface` on GitHub. +3. Rewrites the frontmatter for Mintlify (sets `title`, `sidebarTitle`, `description`), strips the upstream H1, + injects a "View on Hugging Face" card at the top, and sanitizes known MDX hazards (bibtex citations outside + code fences, literal `<>` in prose). +4. Writes `docs/datasets/<slug>.mdx`, regenerates the card grid in `docs/datasets/index.mdx` between the + `HF_SYNC:START` / `HF_SYNC:END` markers, and updates the `Datasets` tab in `docs/docs.json` to keep the + sidebar in sync. 
+ +Run it from the repo root: + +```bash +make hf-sync +``` + +### Adding a new dataset + +1. Author the new dataset's `HF_DATASET_CARD.md` upstream in `lance-format/lance-huggingface` (and push it to the + Hub as usual). +2. Add a single line for the dataset under the appropriate category in `scripts/hf_datasets.yaml`. The four + fields (`dir`, `slug`, `hf`, `title`) are explicit because the GitHub directory name, the HF Hub repo slug, + and the desired URL slug don't follow a derivable convention. +3. Run `make hf-sync`. The script will fetch the new card, generate `docs/datasets/<slug>.mdx`, refresh the + landing-page card grid, and add the new page to the `Datasets` tab in `docs/docs.json`. +4. Preview locally with `mint dev` and commit the changes (the MDX page, the regenerated `index.mdx`, the + updated `docs.json`, and the new yaml entry). + +If you remove a dataset from the yaml, the next `make hf-sync` will delete its MDX file and drop the sidebar +entry. The script hard-fails on any fetch error — partial regeneration would be worse than a clear error. diff --git a/docs/datasets/ade20k.mdx b/docs/datasets/ade20k.mdx new file mode 100644 index 00000000..39f15587 --- /dev/null +++ b/docs/datasets/ade20k.mdx @@ -0,0 +1,169 @@ +--- +title: "ADE20K" +sidebarTitle: "ADE20K" +description: "Lance-formatted version of the full ADE20K scene parsing benchmark (sourced from 1aurent/ADE20K) — 27,574 scene images with semantic and instance segmentation maps, scene labels, and per-object metadata, all stored inline." +--- + + +Source dataset card and downloadable files for `lance-format/ade20k-lance`. + + +Lance-formatted version of the full [ADE20K scene parsing benchmark](https://groups.csail.mit.edu/vision/datasets/ADE20K/) (sourced from [`1aurent/ADE20K`](https://huggingface.co/datasets/1aurent/ADE20K)) — **27,574 scene images** with semantic and instance segmentation maps, scene labels, and per-object metadata, all stored inline. 
+ +## Splits + +| Split | Rows | +|-------|------| +| `train.lance` | 25,574 | +| `validation.lance` | 2,000 | + +## Schema + +| Column | Type | Notes | +|---|---|---| +| `id` | `int64` | Row index within split | +| `image` | `large_binary` | Inline JPEG bytes | +| `segmentation` | `large_binary` | Inline PNG bytes — semantic segmentation map (RGB encoding per ADE20K spec) | +| `instance` | `large_binary?` | Inline PNG bytes — instance map; null if not provided | +| `filename` | `string` | ADE20K relative filename | +| `scene` | `list` | Scene labels (e.g. `["bathroom"]`) | +| `object_names` | `list` | Names of all annotated objects (one entry per polygon) | +| `objects_present` | `list` | Deduped object names — feeds the `LABEL_LIST` index | +| `num_objects` | `int32` | Number of annotated objects | +| `image_emb` | `fixed_size_list` | OpenCLIP `ViT-B-32` image embedding (cosine-normalized) | + +## Pre-built indices + +- `IVF_PQ` on `image_emb` — `metric=cosine` +- `BTREE` on `num_objects` +- `LABEL_LIST` on `objects_present` — supports `array_has_any` / `array_has_all` + +## Quick start + +```python +import lance + +ds = lance.dataset("hf://datasets/lance-format/ade20k-lance/data/validation.lance") +print(ds.count_rows(), ds.schema.names, ds.list_indices()) +``` + +## Load with LanceDB + +These tables can also be consumed by [LanceDB](https://lancedb.github.io/lancedb/), the multimodal lakehouse and embedded search library built on top of Lance, for simplified vector search and other queries. 
+ +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/ade20k-lance/data") +tbl = db.open_table("validation") +print(f"LanceDB table opened with {len(tbl)} scene images") +``` + +## Read an image with its segmentation + +```python +import io +import lance +from PIL import Image + +ds = lance.dataset("hf://datasets/lance-format/ade20k-lance/data/validation.lance") +row = ds.take([0], columns=["image", "segmentation", "scene", "objects_present"]).to_pylist()[0] + +Image.open(io.BytesIO(row["image"])).save("img.jpg") +Image.open(io.BytesIO(row["segmentation"])).save("seg.png") +print("scene:", row["scene"]) +print("objects:", row["objects_present"][:10]) +``` + +## Filter by scene / objects + +```python +import lance +ds = lance.dataset("hf://datasets/lance-format/ade20k-lance/data/validation.lance") + +# Indoor scenes containing both a bed and a window. +rows = ds.scanner( + filter="array_has_all(objects_present, ['bed', 'window'])", + columns=["filename", "scene"], + limit=10, +).to_table().to_pylist() +``` + +### Filter with LanceDB + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/ade20k-lance/data") +tbl = db.open_table("validation") + +rows = ( + tbl.search() + .where("array_has_all(objects_present, ['bed', 'window'])") + .select(["filename", "scene"]) + .limit(10) + .to_list() +) +``` + +## Visual similarity search + +```python +import lance +import pyarrow as pa + +ds = lance.dataset("hf://datasets/lance-format/ade20k-lance/data/validation.lance") +emb_field = ds.schema.field("image_emb") +ref = ds.take([0], columns=["image_emb"]).to_pylist()[0]["image_emb"] +query = pa.array([ref], type=emb_field.type) + +neighbors = ds.scanner( + nearest={"column": "image_emb", "q": query[0], "k": 5}, + columns=["filename", "scene"], +).to_table().to_pylist() +``` + +### LanceDB visual similarity search + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/ade20k-lance/data") +tbl = 
db.open_table("validation") + +ref = tbl.search().limit(1).select(["image_emb"]).to_list()[0] +query_embedding = ref["image_emb"] + +results = ( + tbl.search(query_embedding) + .metric("cosine") + .select(["filename", "scene"]) + .limit(5) + .to_list() +) +``` + +## Why Lance? + +- One dataset for images + segmentation + instance + scene + objects + embeddings + indices — no folder of paired files. +- On-disk vector and label-list indices live next to the data, so search works on local copies and on the Hub. +- Schema evolution: add columns (panoptic ids, fresh embeddings, model predictions) without rewriting the data. + +## Source & license + +Converted from [`1aurent/ADE20K`](https://huggingface.co/datasets/1aurent/ADE20K). ADE20K is released under the [BSD 3-Clause license](https://github.com/CSAILVision/ADE20K/blob/main/LICENSE) by the MIT CSAIL Computer Vision group. + +## Citation + +``` +@inproceedings{zhou2017scene, + title={Scene Parsing through ADE20K Dataset}, + author={Zhou, Bolei and Zhao, Hang and Puig, Xavier and Fidler, Sanja and Barriuso, Adela and Torralba, Antonio}, + booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + year={2017} +} +``` diff --git a/docs/datasets/chartqa.mdx b/docs/datasets/chartqa.mdx new file mode 100644 index 00000000..a7b7b3df --- /dev/null +++ b/docs/datasets/chartqa.mdx @@ -0,0 +1,114 @@ +--- +title: "ChartQA" +sidebarTitle: "ChartQA" +description: "Lance-formatted version of ChartQA — VQA over scientific and business charts that combine logical and visual reasoning — sourced from lmms-lab/ChartQA." +--- + + +Source dataset card and downloadable files for `lance-format/chartqa-lance`. + + +Lance-formatted version of [ChartQA](https://github.com/vis-nlp/ChartQA) — VQA over scientific and business charts that combine logical and visual reasoning — sourced from [`lmms-lab/ChartQA`](https://huggingface.co/datasets/lmms-lab/ChartQA). 
+ +## Splits + +| Split | Rows | +|-------|------| +| `test.lance` | 2,500 | + +> The `lmms-lab/ChartQA` redistribution exposes test only. Train and validation live in the original release (https://github.com/vis-nlp/ChartQA); add them via `chartqa/dataprep.py --splits` once a parquet mirror is identified. + +## Schema + +| Column | Type | Notes | +|---|---|---| +| `id` | `int64` | Row index | +| `image` | `large_binary` | Inline chart image bytes | +| `image_id` / `question_id` | `string?` | (Source does not assign explicit ids — null for now) | +| `question` | `string` | Natural-language question | +| `answers` | `list` | Reference answer (typically a single string) | +| `answer` | `string` | First answer — used as canonical | +| `type` | `string?` | Question type (`human` vs `augmented`) | +| `image_emb` | `fixed_size_list` | CLIP image embedding (cosine-normalized) | +| `question_emb` | `fixed_size_list` | CLIP text embedding of the question | + +## Pre-built indices + +- `IVF_PQ` on `image_emb` and `question_emb` — `metric=cosine` +- `INVERTED` (FTS) on `question` and `answer` +- `BITMAP` on `type` + +## Quick start + +```python +import lance +ds = lance.dataset("hf://datasets/lance-format/chartqa-lance/data/test.lance") +print(ds.count_rows(), ds.schema.names, ds.list_indices()) +``` + +## Load with LanceDB + +These tables can also be consumed by [LanceDB](https://lancedb.github.io/lancedb/), the multimodal lakehouse and embedded search library built on top of Lance, for simplified vector search and other queries. 
+ +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/chartqa-lance/data") +tbl = db.open_table("test") +print(f"LanceDB table opened with {len(tbl)} chart-question pairs") +``` + +### LanceDB vector search + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/chartqa-lance/data") +tbl = db.open_table("test") + +ref = tbl.search().limit(1).select(["question_emb", "question"]).to_list()[0] +query_embedding = ref["question_emb"] + +results = ( + tbl.search(query_embedding, vector_column_name="question_emb") + .metric("cosine") + .select(["question", "answer"]) + .limit(5) + .to_list() +) +``` + +### LanceDB full-text search + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/chartqa-lance/data") +tbl = db.open_table("test") + +results = ( + tbl.search("percentage") + .select(["question", "answer"]) + .limit(10) + .to_list() +) +``` + +## Source & license + +Converted from [`lmms-lab/ChartQA`](https://huggingface.co/datasets/lmms-lab/ChartQA). The original ChartQA dataset is released under the GNU GPL-3.0 license by Masry et al. + +## Citation + +``` +@inproceedings{masry2022chartqa, + title={ChartQA: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning}, + author={Masry, Ahmed and Long, Do Xuan and Tan, Jia Qing and Joty, Shafiq and Hoque, Enamul}, + booktitle={Findings of the Association for Computational Linguistics: ACL 2022}, + year={2022} +} +``` diff --git a/docs/datasets/cifar10.mdx b/docs/datasets/cifar10.mdx new file mode 100644 index 00000000..60e4313c --- /dev/null +++ b/docs/datasets/cifar10.mdx @@ -0,0 +1,174 @@ +--- +title: "CIFAR-10" +sidebarTitle: "CIFAR-10" +description: "A Lance-formatted version of CIFAR-10 with 60,000 32×32 RGB images across 10 classes, stored inline with CLIP embeddings and a pre-built IVF_PQ ANN index." +--- + + +Source dataset card and downloadable files for `lance-format/cifar10-lance`. 
+ + +A Lance-formatted version of [CIFAR-10](https://huggingface.co/datasets/uoft-cs/cifar10) with **60,000 32×32 RGB images** across 10 classes, stored inline with CLIP embeddings and a pre-built `IVF_PQ` ANN index. + +## Key features + +- All multimodal data (image bytes + embeddings) stored **inline** in the same Lance dataset. +- **Pre-computed CLIP embeddings** (OpenCLIP `ViT-B-32` / `laion2b_s34b_b79k`, 512-dim, L2-normalized) with an `IVF_PQ` index. +- **BTREE on `label`** and **BITMAP on `label_name`** for fast filtered scans. + +## Splits + +| Split | Rows | +|-------|------| +| `train` | 50,000 | +| `test` | 10,000 | + +## Schema + +| Column | Type | Notes | +|---|---|---| +| `id` | `int64` | Row index within the split | +| `image` | `large_binary` | Inline PNG bytes (32×32 RGB) | +| `label` | `int32` | Class id (0-9) | +| `label_name` | `string` | One of `airplane`, `automobile`, `bird`, `cat`, `deer`, `dog`, `frog`, `horse`, `ship`, `truck` | +| `image_emb` | `fixed_size_list` | CLIP image embedding (cosine-normalized) | + +## Pre-built indices + +- `IVF_PQ` on `image_emb` — `metric=cosine` +- `BTREE` on `label` +- `BITMAP` on `label_name` + +## Load with `datasets.load_dataset` + +```python +import datasets + +hf_ds = datasets.load_dataset("lance-format/cifar10-lance", split="train", streaming=True) +for row in hf_ds.take(3): + print(row["label_name"]) +``` + +## Load directly with Lance (recommended) + +```python +import lance + +ds = lance.dataset("hf://datasets/lance-format/cifar10-lance/data/train.lance") +print(ds.count_rows(), ds.schema.names, ds.list_indices()) +``` + +## Load with LanceDB + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/cifar10-lance/data") +tbl = db.open_table("train") +print(len(tbl)) +``` + +> **Tip — for production use, download locally first.** +> ```bash +> hf download lance-format/cifar10-lance --repo-type dataset --local-dir ./cifar10-lance +> ``` + +## Vector search example + +```python 
+import lance +import pyarrow as pa + +ds = lance.dataset("hf://datasets/lance-format/cifar10-lance/data/train.lance") +emb_field = ds.schema.field("image_emb") +ref = ds.take([0], columns=["image_emb"]).to_pylist()[0]["image_emb"] +query = pa.array([ref], type=emb_field.type) + +neighbors = ds.scanner( + nearest={"column": "image_emb", "q": query[0], "k": 5, "nprobes": 16, "refine_factor": 30}, + columns=["id", "label_name"], +).to_table().to_pylist() +print(neighbors) +``` + +### LanceDB vector search + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/cifar10-lance/data") +tbl = db.open_table("train") + +ref = tbl.search().limit(1).select(["image_emb"]).to_list()[0] +query_embedding = ref["image_emb"] + +results = ( + tbl.search(query_embedding) + .metric("cosine") + .select(["id", "label_name"]) + .limit(5) + .to_list() +) +for row in results: + print(row["id"], row["label_name"]) +``` + +## Filter by class + +```python +import lance +ds = lance.dataset("hf://datasets/lance-format/cifar10-lance/data/train.lance") +ships = ds.scanner(filter="label_name = 'ship'", columns=["id"], limit=5).to_table() +``` + +### Filter by class with LanceDB + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/cifar10-lance/data") +tbl = db.open_table("train") +ships = ( + tbl.search() + .where("label_name = 'ship'") + .select(["id"]) + .limit(5) + .to_list() +) +``` + +## Working with images + +```python +from pathlib import Path +import lance + +ds = lance.dataset("hf://datasets/lance-format/cifar10-lance/data/train.lance") +row = ds.take([0], columns=["image", "label_name"]).to_pylist()[0] +Path(f"sample_{row['label_name']}.png").write_bytes(row["image"]) +``` + +## Why Lance? + +- One dataset for images + embeddings + indices + metadata — no sidecar files. +- On-disk vector and FTS indices live next to the data, so search works on both local copies and the Hub. 
+- Schema evolution: add new columns (model predictions, fresh embeddings, augmentations) without rewriting the data. + +## Source & license + +Converted from [`uoft-cs/cifar10`](https://huggingface.co/datasets/uoft-cs/cifar10). CIFAR-10 was collected by Alex Krizhevsky, Vinod Nair, and Geoffrey Hinton at the University of Toronto. + +## Citation + +``` +@techreport{krizhevsky2009cifar10, + title={Learning multiple layers of features from tiny images}, + author={Krizhevsky, Alex and Hinton, Geoffrey}, + year={2009}, + institution={University of Toronto} +} +``` diff --git a/docs/datasets/coco-captions-2017.mdx b/docs/datasets/coco-captions-2017.mdx new file mode 100644 index 00000000..ad629d5a --- /dev/null +++ b/docs/datasets/coco-captions-2017.mdx @@ -0,0 +1,167 @@ +--- +title: "COCO Captions 2017" +sidebarTitle: "COCO Captions 2017" +description: "Lance-formatted version of the COCO Captions 2017 corpus, redistributed via lmms-lab/COCO-Caption2017. Each row is one image with 5–7 human-written captions, CLIP image embedding, and CLIP text embedding of the canonical caption — all stored inline." +--- + + +Source dataset card and downloadable files for `lance-format/coco-captions-2017-lance`. + + +Lance-formatted version of the [COCO Captions 2017](https://cocodataset.org/) corpus, redistributed via [`lmms-lab/COCO-Caption2017`](https://huggingface.co/datasets/lmms-lab/COCO-Caption2017). Each row is one image with **5–7 human-written captions**, CLIP image embedding, and CLIP text embedding of the canonical caption — all stored inline. + +## Splits + +| Split | Rows | +|-------|------| +| `val.lance` | 5,000 (canonical COCO 2017 val set) | +| `test.lance` | 40,700 | + +> The 2017 train split (118 k images, ~18 GB of source JPEGs) is intentionally +> not bundled here because the `lmms-lab/COCO-Caption2017` redistribution does +> not include it. To extend with train, run `coco_captions_2017/dataprep.py` +> against your local COCO 2017 train mirror. 
+ +## Schema + +| Column | Type | Notes | +|---|---|---| +| `id` | `int64` | Row index within split | +| `image` | `large_binary` | Inline JPEG bytes | +| `image_id` | `string` | COCO image id | +| `filename` | `string` | Original filename (e.g. `000000179765.jpg`) | +| `captions` | `list` | All 5–7 captions | +| `caption` | `string` | First caption — used as canonical text for FTS | +| `image_emb` | `fixed_size_list` | CLIP image embedding (cosine-normalized) | +| `text_emb` | `fixed_size_list` | CLIP text embedding of the canonical caption | + +## Pre-built indices + +- `IVF_PQ` on `image_emb` and `text_emb` — `metric=cosine` +- `INVERTED` on `caption` +- `BTREE` on `image_id` + +## Quick start + +```python +import lance + +ds = lance.dataset("hf://datasets/lance-format/coco-captions-2017-lance/data/val.lance") +print(ds.count_rows(), ds.schema.names) +print(ds.list_indices()) +``` + +## Load with LanceDB + +These tables can also be consumed by [LanceDB](https://lancedb.github.io/lancedb/), the multimodal lakehouse and embedded search library built on top of Lance, for simplified vector search and other queries. 
+ +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/coco-captions-2017-lance/data") +tbl = db.open_table("val") +print(f"LanceDB table opened with {len(tbl)} image-caption pairs") +``` + +> **Tip — for production use, download locally first.** +> ```bash +> hf download lance-format/coco-captions-2017-lance --repo-type dataset --local-dir ./coco-captions-2017-lance +> ``` + +## Vector search examples + +Cross-modal text→image: + +```python +import lance, open_clip, pyarrow as pa, torch + +model, _, _ = open_clip.create_model_and_transforms("ViT-B-32", pretrained="laion2b_s34b_b79k") +tokenizer = open_clip.get_tokenizer("ViT-B-32") +model = model.eval().cuda().half() +with torch.no_grad(): + q = model.encode_text(tokenizer(["a giraffe eating leaves"]).cuda()) + q = (q / q.norm(dim=-1, keepdim=True)).float().cpu().numpy()[0] + +ds = lance.dataset("hf://datasets/lance-format/coco-captions-2017-lance/data/val.lance") +emb_field = ds.schema.field("image_emb") +hits = ds.scanner( + nearest={"column": "image_emb", "q": pa.array([q.tolist()], type=emb_field.type)[0], "k": 10}, + columns=["image_id", "caption"], +).to_table().to_pylist() +``` + +### LanceDB cross-modal text→image search + +```python +import lancedb, open_clip, torch + +model, _, _ = open_clip.create_model_and_transforms("ViT-B-32", pretrained="laion2b_s34b_b79k") +tokenizer = open_clip.get_tokenizer("ViT-B-32") +model = model.eval().cuda().half() +with torch.no_grad(): + q = model.encode_text(tokenizer(["a giraffe eating leaves"]).cuda()) + q = (q / q.norm(dim=-1, keepdim=True)).float().cpu().numpy()[0] + +db = lancedb.connect("hf://datasets/lance-format/coco-captions-2017-lance/data") +tbl = db.open_table("val") + +results = ( + tbl.search(q.tolist(), vector_column_name="image_emb") + .metric("cosine") + .select(["image_id", "caption"]) + .limit(10) + .to_list() +) +``` + +Full-text search: + +```python +ds = 
lance.dataset("hf://datasets/lance-format/coco-captions-2017-lance/data/val.lance") +hits = ds.scanner( + full_text_query="surfer riding a wave", + columns=["image_id", "caption"], + limit=10, +).to_table().to_pylist() +``` + +### LanceDB full-text search + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/coco-captions-2017-lance/data") +tbl = db.open_table("val") + +results = ( + tbl.search("surfer riding a wave") + .select(["image_id", "caption"]) + .limit(10) + .to_list() +) +``` + +## Why Lance? + +- One dataset carries images + image embeddings + text embeddings + indices — no sidecar files. +- On-disk vector and full-text indices live next to the data, so search works on local copies and on the Hub. +- Schema evolution: add columns (new captions, alternate embeddings, model predictions) without rewriting the data. + +## Source & license + +Converted from [`lmms-lab/COCO-Caption2017`](https://huggingface.co/datasets/lmms-lab/COCO-Caption2017). Original COCO 2017 annotations are released under [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/); the underlying images are subject to Flickr terms of service. Please review the [COCO Terms of Use](https://cocodataset.org/#termsofuse) before redistribution. 
+ +## Citation + +``` +@inproceedings{lin2014microsoft, + title={Microsoft COCO: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European Conference on Computer Vision (ECCV)}, + year={2014}, +} +``` diff --git a/docs/datasets/coco-detection-2017.mdx b/docs/datasets/coco-detection-2017.mdx new file mode 100644 index 00000000..0b16696e --- /dev/null +++ b/docs/datasets/coco-detection-2017.mdx @@ -0,0 +1,202 @@ +--- +title: "COCO 2017 Detection" +sidebarTitle: "COCO 2017 Detection" +description: "Lance-formatted version of the COCO 2017 object detection benchmark — sourced from detection-datasets/coco — with 123,287 images and the full per-image list of bounding boxes, category labels, and CLIP image embeddings, all stored inline." +--- + + +Source dataset card and downloadable files for `lance-format/coco-detection-2017-lance`. + + +Lance-formatted version of the [COCO 2017 object detection benchmark](https://cocodataset.org/) — sourced from [`detection-datasets/coco`](https://huggingface.co/datasets/detection-datasets/coco) — with **123,287 images** and the full per-image list of bounding boxes, category labels, and CLIP image embeddings, all stored inline. + +## Why this version? + +Object detection datasets typically split images, annotations, and embeddings across multiple files (often three different formats: JPEG, JSON, NumPy). Lance keeps all of it in one tabular dataset: + +- one row per image, +- the JPEG bytes, the bounding box list, the category labels, and the CLIP image embedding all live as columns on the same row, +- `IVF_PQ` on the embedding column lets you do visual similarity search without leaving the dataset, and `LABEL_LIST` on `categories_present` lets you filter to "images containing a dog and a frisbee" in milliseconds. 
+ +## Splits + +| Split | Rows | +|-------|------| +| `train.lance` | 117,000+ | +| `val.lance` | 4,950+ | + +(Counts come from the `detection-datasets/coco` redistribution; box counts: ~860k train / ~37k val.) + +## Schema + +| Column | Type | Notes | +|---|---|---| +| `id` | `int64` | Row index within split | +| `image` | `large_binary` | Inline JPEG bytes | +| `image_id` | `int64` | COCO image id | +| `width`, `height` | `int32` | Image dimensions in pixels | +| `bboxes` | `list>` | Each box is `[x_min, y_min, x_max, y_max]` in absolute pixel coords | +| `categories` | `list` | COCO 80-class id (0-79) | +| `category_names` | `list` | Human-readable class name per object (e.g. `person`, `dog`, …) | +| `areas` | `list` | Bounding-box area (pixels²) | +| `num_objects` | `int32` | Number of annotated objects in the image | +| `categories_present` | `list` | Deduped class names — feeds the `LABEL_LIST` index for fast filtering | +| `image_emb` | `fixed_size_list` | OpenCLIP `ViT-B-32` image embedding (cosine-normalized) | + +## Pre-built indices + +- `IVF_PQ` on `image_emb` — `metric=cosine` +- `BTREE` on `image_id`, `num_objects` +- `LABEL_LIST` on `categories_present` — supports `array_has_any` / `array_has_all` predicates + +## Quick start + +```python +import lance + +ds = lance.dataset("hf://datasets/lance-format/coco-detection-2017-lance/data/val.lance") +print(ds.count_rows(), ds.schema.names, ds.list_indices()) +``` + +## Load with LanceDB + +These tables can also be consumed by [LanceDB](https://lancedb.github.io/lancedb/), the multimodal lakehouse and embedded search library built on top of Lance, for simplified vector search and other queries. 
+ +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/coco-detection-2017-lance/data") +tbl = db.open_table("val") +print(f"LanceDB table opened with {len(tbl)} images") +``` + +> **Tip — for production use, download locally first.** +> ```bash +> hf download lance-format/coco-detection-2017-lance --repo-type dataset --local-dir ./coco-detection-2017-lance +> ``` + +## Read one annotated image + +```python +import io +import lance +from PIL import Image, ImageDraw + +ds = lance.dataset("hf://datasets/lance-format/coco-detection-2017-lance/data/val.lance") +row = ds.take([0], columns=["image", "bboxes", "category_names", "width", "height"]).to_pylist()[0] + +img = Image.open(io.BytesIO(row["image"])).convert("RGB") +draw = ImageDraw.Draw(img) +for (x1, y1, x2, y2), name in zip(row["bboxes"], row["category_names"]): + draw.rectangle([x1, y1, x2, y2], outline="red", width=3) + draw.text((x1 + 4, y1 + 4), name, fill="red") +img.save("annotated.jpg") +``` + +## Filter by classes (LABEL_LIST index) + +```python +import lance +ds = lance.dataset("hf://datasets/lance-format/coco-detection-2017-lance/data/val.lance") + +# Images that contain BOTH a person and a frisbee. +rows = ds.scanner( + filter="array_has_all(categories_present, ['person', 'frisbee'])", + columns=["image_id", "category_names"], + limit=10, +).to_table().to_pylist() + +# Images with at least 5 objects of any class. 
+busy = ds.scanner( + filter="num_objects >= 5", + columns=["image_id", "num_objects"], + limit=10, +).to_table().to_pylist() +``` + +### Filter by classes with LanceDB + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/coco-detection-2017-lance/data") +tbl = db.open_table("val") + +rows = ( + tbl.search() + .where("array_has_all(categories_present, ['person', 'frisbee'])") + .select(["image_id", "category_names"]) + .limit(10) + .to_list() +) + +busy = ( + tbl.search() + .where("num_objects >= 5") + .select(["image_id", "num_objects"]) + .limit(10) + .to_list() +) +``` + +## Visual similarity search + +```python +import lance +import pyarrow as pa + +ds = lance.dataset("hf://datasets/lance-format/coco-detection-2017-lance/data/val.lance") +emb_field = ds.schema.field("image_emb") +ref = ds.take([0], columns=["image_emb"]).to_pylist()[0]["image_emb"] +query = pa.array([ref], type=emb_field.type) + +neighbors = ds.scanner( + nearest={"column": "image_emb", "q": query[0], "k": 5}, + columns=["image_id", "category_names"], +).to_table().to_pylist() +``` + +### LanceDB visual similarity search + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/coco-detection-2017-lance/data") +tbl = db.open_table("val") + +ref = tbl.search().limit(1).select(["image_emb"]).to_list()[0] +query_embedding = ref["image_emb"] + +results = ( + tbl.search(query_embedding) + .metric("cosine") + .select(["image_id", "category_names"]) + .limit(5) + .to_list() +) +``` + +## Why Lance? + +- One dataset carries images + boxes + categories + areas + embeddings + indices — no JSON sidecars. +- On-disk vector and label-list indices live next to the data, so filters and ANN search work on local copies and on the Hub. +- Schema evolution: add columns (segmentation polygons, keypoints, panoptic ids, fresh embeddings) without rewriting the data. 
+ +## Source & license + +Converted from [`detection-datasets/coco`](https://huggingface.co/datasets/detection-datasets/coco). COCO annotations are released under [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/); the underlying images are subject to Flickr terms of service. See the [COCO Terms of Use](https://cocodataset.org/#termsofuse) before redistribution. + +## Citation + +``` +@inproceedings{lin2014microsoft, + title={Microsoft COCO: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European Conference on Computer Vision (ECCV)}, + year={2014} +} +``` diff --git a/docs/datasets/docvqa.mdx b/docs/datasets/docvqa.mdx new file mode 100644 index 00000000..542a2d2f --- /dev/null +++ b/docs/datasets/docvqa.mdx @@ -0,0 +1,147 @@ +--- +title: "DocVQA" +sidebarTitle: "DocVQA" +description: "Lance-formatted version of DocVQA — VQA over document images (industry / government scans, multi-page reports, forms, receipts) — sourced from lmms-lab/DocVQA (DocVQA config)." +--- + + +Source dataset card and downloadable files for `lance-format/docvqa-lance`. + + +Lance-formatted version of [DocVQA](https://www.docvqa.org/) — VQA over document images (industry / government scans, multi-page reports, forms, receipts) — sourced from [`lmms-lab/DocVQA`](https://huggingface.co/datasets/lmms-lab/DocVQA) (`DocVQA` config). 
+ +## Splits + +| Split | Rows | +|-------|------| +| `validation.lance` | 5,349 | +| `test.lance` | 5,188 | + +## Schema + +| Column | Type | Notes | +|---|---|---| +| `id` | `int64` | Row index within split | +| `image` | `large_binary` | Inline JPEG bytes (page image) | +| `image_id` | `string?` | DocVQA `docId` (alias) | +| `question_id` | `string?` | DocVQA `questionId` | +| `question` | `string` | Natural-language question | +| `answers` | `list` | Reference answer span(s) | +| `answer` | `string` | First reference answer (FTS target) | +| `doc_id` | `string?` | DocVQA document id | +| `ucsf_document_id` | `string?` | UCSF Industry Documents Library id | +| `ucsf_document_page_no` | `string?` | Page number within the source document | +| `data_split` | `string?` | Original split label from the source | +| `question_types` | `list` | DocVQA question-type tags (`form`, `figure`, `table`, …) | +| `image_emb` | `fixed_size_list` | CLIP image embedding (cosine-normalized) | +| `question_emb` | `fixed_size_list` | CLIP text embedding of the question | + +## Pre-built indices + +- `IVF_PQ` on `image_emb` and `question_emb` — `metric=cosine` +- `INVERTED` (FTS) on `question` and `answer` +- `BTREE` on `image_id`, `question_id`, `doc_id` +- `LABEL_LIST` on `question_types` + +## Quick start + +```python +import lance +ds = lance.dataset("hf://datasets/lance-format/docvqa-lance/data/validation.lance") +print(ds.count_rows(), ds.schema.names, ds.list_indices()) +``` + +## Load with LanceDB + +These tables can also be consumed by [LanceDB](https://lancedb.github.io/lancedb/), the multimodal lakehouse and embedded search library built on top of Lance, for simplified vector search and other queries. 
+ +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/docvqa-lance/data") +tbl = db.open_table("validation") +print(f"LanceDB table opened with {len(tbl)} document-question pairs") +``` + +### LanceDB vector search + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/docvqa-lance/data") +tbl = db.open_table("validation") + +ref = tbl.search().limit(1).select(["question_emb", "question"]).to_list()[0] +query_embedding = ref["question_emb"] + +results = ( + tbl.search(query_embedding, vector_column_name="question_emb") + .metric("cosine") + .select(["question", "answer"]) + .limit(5) + .to_list() +) +``` + +### LanceDB full-text search + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/docvqa-lance/data") +tbl = db.open_table("validation") + +results = ( + tbl.search("invoice total") + .select(["question", "answer"]) + .limit(10) + .to_list() +) +``` + +## Filter by question type + +```python +import lance +ds = lance.dataset("hf://datasets/lance-format/docvqa-lance/data/validation.lance") +forms = ds.scanner( + filter="array_has_any(question_types, ['form'])", + columns=["question", "answer"], + limit=5, +).to_table() +``` + +### Filter with LanceDB + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/docvqa-lance/data") +tbl = db.open_table("validation") +forms = ( + tbl.search() + .where("array_has_any(question_types, ['form'])") + .select(["question", "answer"]) + .limit(5) + .to_list() +) +``` + +## Source & license + +Converted from [`lmms-lab/DocVQA`](https://huggingface.co/datasets/lmms-lab/DocVQA). DocVQA is released under the MIT license; the underlying documents come from the [UCSF Industry Documents Library](https://www.industrydocuments.ucsf.edu/) — review their access conditions before redistribution. 
+ +## Citation + +``` +@inproceedings{mathew2021docvqa, + title={DocVQA: A Dataset for VQA on Document Images}, + author={Mathew, Minesh and Karatzas, Dimosthenis and Jawahar, CV}, + booktitle={Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision (WACV)}, + year={2021} +} +``` diff --git a/docs/datasets/eurosat.mdx b/docs/datasets/eurosat.mdx new file mode 100644 index 00000000..c9ca0941 --- /dev/null +++ b/docs/datasets/eurosat.mdx @@ -0,0 +1,141 @@ +--- +title: "EuroSAT" +sidebarTitle: "EuroSAT" +description: "Lance-formatted version of EuroSAT — Sentinel-2 satellite imagery (RGB) covering 27,000 64×64 tiles across 10 land-cover classes, sourced from blanchon/EuroSAT_RGB." +--- + + +Source dataset card and downloadable files for `lance-format/eurosat-lance`. + + +Lance-formatted version of [EuroSAT](https://github.com/phelber/eurosat) — Sentinel-2 satellite imagery (RGB) covering **27,000 64×64 tiles** across 10 land-cover classes, sourced from [`blanchon/EuroSAT_RGB`](https://huggingface.co/datasets/blanchon/EuroSAT_RGB). + +This is the canonical "geo" tile-level classification benchmark, useful for remote sensing pre-training and small-tile retrieval research. 
+ +## Splits + +| Split | Rows | +|-------|------| +| `train.lance` | 16,200 | +| `validation.lance` | 5,400 | +| `test.lance` | 5,400 | + +## Schema + +| Column | Type | Notes | +|---|---|---| +| `id` | `int64` | Row index within split | +| `image` | `large_binary` | Inline JPEG bytes (64×64 RGB Sentinel-2) | +| `label` | `int32` | Class id (0-9) | +| `label_name` | `string` | `Annual_Crop`, `Forest`, `Herbaceous_Vegetation`, `Highway`, `Industrial_Buildings`, `Pasture`, `Permanent_Crop`, `Residential_Buildings`, `River`, `SeaLake` | +| `image_emb` | `fixed_size_list` | OpenCLIP `ViT-B-32` image embedding (cosine-normalized) | + +## Pre-built indices + +- `IVF_PQ` on `image_emb` — `metric=cosine` +- `BTREE` on `label` +- `BITMAP` on `label_name` + +## Quick start + +```python +import lance + +ds = lance.dataset("hf://datasets/lance-format/eurosat-lance/data/train.lance") +print(ds.count_rows(), ds.schema.names, ds.list_indices()) +``` + +## Load with LanceDB + +These tables can also be consumed by [LanceDB](https://lancedb.github.io/lancedb/), the multimodal lakehouse and embedded search library built on top of Lance, for simplified vector search and other queries. 
+ +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/eurosat-lance/data") +tbl = db.open_table("train") +print(f"LanceDB table opened with {len(tbl)} satellite tiles") +``` + +## Visual similarity search + +```python +import lance +import pyarrow as pa + +ds = lance.dataset("hf://datasets/lance-format/eurosat-lance/data/train.lance") +emb_field = ds.schema.field("image_emb") +ref = ds.take([0], columns=["image_emb", "label_name"]).to_pylist()[0] +query = pa.array([ref["image_emb"]], type=emb_field.type) + +hits = ds.scanner( + nearest={"column": "image_emb", "q": query[0], "k": 5, "nprobes": 16, "refine_factor": 30}, + columns=["id", "label_name"], +).to_table().to_pylist() +print(f"reference: {ref['label_name']}") +for h in hits: + print(h) +``` + +### LanceDB visual similarity search + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/eurosat-lance/data") +tbl = db.open_table("train") + +ref = tbl.search().limit(1).select(["image_emb", "label_name"]).to_list()[0] +query_embedding = ref["image_emb"] + +results = ( + tbl.search(query_embedding) + .metric("cosine") + .select(["id", "label_name"]) + .limit(5) + .to_list() +) +``` + +## Filter by class + +```python +import lance +ds = lance.dataset("hf://datasets/lance-format/eurosat-lance/data/train.lance") +rivers = ds.scanner(filter="label_name = 'River'", columns=["id"], limit=5).to_table() +``` + +### Filter by class with LanceDB + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/eurosat-lance/data") +tbl = db.open_table("train") +rivers = tbl.search().where("label_name = 'River'").select(["id"]).limit(5).to_list() +``` + +## Why Lance? + +- One dataset for tiles + embeddings + indices — no sidecar TIF folder per class. +- On-disk vector and FTS indices live next to the data, so search works on local copies and on the Hub. 
+- Schema evolution: add columns (multi-spectral channels, model predictions, fresh embeddings) without rewriting the data. + +## Source & license + +Converted from [`blanchon/EuroSAT_RGB`](https://huggingface.co/datasets/blanchon/EuroSAT_RGB). EuroSAT is released under the MIT license by Helber et al. The underlying Sentinel-2 imagery is © European Space Agency, made available under the [Copernicus open data policy](https://www.copernicus.eu/en/access-data/copyright-and-licences). + +## Citation + +``` +@article{helber2019eurosat, + title={EuroSAT: A novel dataset and deep learning benchmark for land use and land cover classification}, + author={Helber, Patrick and Bischke, Benjamin and Dengel, Andreas and Borth, Damian}, + journal={IEEE Journal of Selected Topics in Applied Earth Observations and Remote Sensing}, + year={2019} +} +``` diff --git a/docs/datasets/fashion-mnist.mdx b/docs/datasets/fashion-mnist.mdx new file mode 100644 index 00000000..08de318f --- /dev/null +++ b/docs/datasets/fashion-mnist.mdx @@ -0,0 +1,158 @@ +--- +title: "Fashion-MNIST" +sidebarTitle: "Fashion-MNIST" +description: "A Lance-formatted version of Fashion-MNIST with 70,000 28×28 grayscale clothing images stored inline alongside CLIP embeddings and a pre-built IVF_PQ ANN index." +--- + + +Source dataset card and downloadable files for `lance-format/fashion-mnist-lance`. + + +A Lance-formatted version of [Fashion-MNIST](https://huggingface.co/datasets/zalando-datasets/fashion_mnist) with **70,000 28×28 grayscale clothing images** stored inline alongside CLIP embeddings and a pre-built `IVF_PQ` ANN index. + +## Key features + +- All multimodal data (image bytes + embeddings) stored **inline** in the same Lance dataset. +- **Pre-computed CLIP embeddings** (OpenCLIP `ViT-B-32` / `laion2b_s34b_b79k`, 512-dim, L2-normalized) with an `IVF_PQ` index. +- **BTREE on `label`** and **BITMAP on `label_name`** for fast filtered scans.
+ +## Splits + +| Split | Rows | +|-------|------| +| `train` | 60,000 | +| `test` | 10,000 | + +## Schema + +| Column | Type | Notes | +|---|---|---| +| `id` | `int64` | Row index within the split | +| `image` | `large_binary` | Inline PNG bytes (28×28 grayscale) | +| `label` | `int32` | Class id (0-9) | +| `label_name` | `string` | One of `T-shirt/top`, `Trouser`, `Pullover`, `Dress`, `Coat`, `Sandal`, `Shirt`, `Sneaker`, `Bag`, `Ankle_boot` | +| `image_emb` | `fixed_size_list` | CLIP image embedding (cosine-normalized) | + +## Pre-built indices + +- `IVF_PQ` on `image_emb` — `metric=cosine` +- `BTREE` on `label` +- `BITMAP` on `label_name` + +## Load with Lance + +```python +import lance + +ds = lance.dataset("hf://datasets/lance-format/fashion-mnist-lance/data/train.lance") +print(ds.count_rows(), ds.schema.names, ds.list_indices()) +``` + +## Load with LanceDB + +These tables can also be consumed by [LanceDB](https://lancedb.github.io/lancedb/), the multimodal lakehouse and embedded search library built on top of Lance, for simplified vector search and other queries. 
+ +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/fashion-mnist-lance/data") +tbl = db.open_table("train") +print(f"LanceDB table opened with {len(tbl)} images") +``` + +## Load with `datasets.load_dataset` + +```python +import datasets + +hf_ds = datasets.load_dataset("lance-format/fashion-mnist-lance", split="train", streaming=True) +for row in hf_ds.take(3): + print(row["label_name"]) +``` + +> **Tip — for production use, download locally first** to avoid Hub rate limits: +> ```bash +> hf download lance-format/fashion-mnist-lance --repo-type dataset --local-dir ./fashion-mnist-lance +> ``` + +## Vector search example + +```python +import lance +import pyarrow as pa + +ds = lance.dataset("hf://datasets/lance-format/fashion-mnist-lance/data/train.lance") +emb_field = ds.schema.field("image_emb") +ref = ds.take([0], columns=["image_emb"]).to_pylist()[0]["image_emb"] +query = pa.array([ref], type=emb_field.type) + +neighbors = ds.scanner( + nearest={"column": "image_emb", "q": query[0], "k": 5, "nprobes": 16, "refine_factor": 30}, + columns=["id", "label_name"], +).to_table().to_pylist() +``` + +### LanceDB vector search + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/fashion-mnist-lance/data") +tbl = db.open_table("train") + +ref = tbl.search().limit(1).select(["image_emb"]).to_list()[0] +query_embedding = ref["image_emb"] + +results = ( + tbl.search(query_embedding) + .metric("cosine") + .select(["id", "label_name"]) + .limit(5) + .to_list() +) +``` + +## Filter by class + +```python +import lance +ds = lance.dataset("hf://datasets/lance-format/fashion-mnist-lance/data/train.lance") +sneakers = ds.scanner(filter="label_name = 'Sneaker'", columns=["id"], limit=5).to_table() +``` + +### Filter by class with LanceDB + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/fashion-mnist-lance/data") +tbl = db.open_table("train") +sneakers = tbl.search().where("label_name = 
'Sneaker'").select(["id"]).limit(5).to_list() +``` + +## Why Lance? + +- One dataset for images + embeddings + indices + metadata — no sidecar files. +- On-disk vector and FTS indices live next to the data, so search works on local copies and the Hub. +- Schema evolution: add new columns (model predictions, fresh embeddings, augmentations) without rewriting the data. + +## Source & license + +Converted from [`zalando-datasets/fashion_mnist`](https://huggingface.co/datasets/zalando-datasets/fashion_mnist). Released under the MIT license. + +## Citation + +``` +@online{xiao2017fashionmnist, + title={Fashion-MNIST: a Novel Image Dataset for Benchmarking Machine Learning Algorithms}, + author={Xiao, Han and Rasul, Kashif and Vollgraf, Roland}, + year={2017}, + eprint={1708.07747}, + archivePrefix={arXiv}, + primaryClass={cs.LG} +} +``` diff --git a/docs/datasets/fineweb-edu.mdx b/docs/datasets/fineweb-edu.mdx new file mode 100644 index 00000000..97f36df7 --- /dev/null +++ b/docs/datasets/fineweb-edu.mdx @@ -0,0 +1,246 @@ +--- +title: "FineWeb-Edu" +sidebarTitle: "FineWeb-Edu" +description: "FineWeb-edu dataset with over 1.5 billion rows. Each passage ships with cleaned text, metadata, and 384-dim text embeddings for retrieval-heavy workloads." +--- + + +Source dataset card and downloadable files for `lance-format/fineweb-edu`. + + +FineWeb-edu dataset with over 1.5 billion rows. Each passage ships with cleaned text, metadata, and 384-dim text embeddings for retrieval-heavy workloads. 
+ + +## Load via `datasets.load_dataset` + +```python +import datasets + +hf_ds = datasets.load_dataset( + "lance-format/fineweb-edu", + split="train", + streaming=True, +) +# Take first three rows and print titles +for row in hf_ds.take(3): + print(row["title"]) +``` + +Use Lance's native connector when you need ANN search, FTS, or direct access to embeddings while still pointing to the copy hosted on Hugging Face: + +```python +import lance +ds = lance.dataset("hf://datasets/lance-format/fineweb-edu/data/train.lance") +print(f"Total passages: {ds.count_rows():,}") +``` + +These tables can also be consumed by [LanceDB](https://lancedb.github.io/lancedb/), the multimodal lakehouse and embedded search library built on top of Lance, for simplified vector search and other queries. + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/fineweb-edu/data") +tbl = db.open_table("train") +print(f"LanceDB table opened with {len(tbl)} passages") +``` + + + +> The dataset hosted on Hugging Face Hub does **not** currently have pre-built ANN (vector) or FTS (full-text search) indices. +> + +> - For any search or similarity workloads, you should download the dataset locally and build indices yourself. +> +> ```bash +> # Download once +> huggingface-cli download lance-format/fineweb-edu --repo-type dataset --local-dir ./fineweb-edu +> +> # Then load locally and build indices +> import lance +> ds = lance.dataset("./fineweb-edu/data/train.lance") +> # ds.create_index(...) +> ``` +> + + +## Why Lance? + +- Optimized for AI workloads: Lance keeps multimodal data and vector search-ready storage in the same columnar format designed for accelerator-era retrieval (see [lance.org](https://lance.org)). +- Text + embeddings + metadata travel as one tabular dataset. +- On-disk, scalable ANN indices (once built) mean similarity search runs directly against the dataset without loading all embeddings into memory. +- Schema evolution lets you add new features/columns (moderation tags, embeddings, etc.) without rewriting the raw data.
+ + +## Quick Start (Lance Python) + +```python +import lance +import pyarrow as pa + +lance_ds = lance.dataset("hf://datasets/lance-format/fineweb-edu/data/train.lance") + +# Browse titles & language without touching embeddings +rows = lance_ds.scanner( + columns=["title", "language"], + limit=5 +).to_table().to_pylist() + +# Vector similarity search (accelerated by an ANN index once one has been built) +ref = lance_ds.take([0], columns=["text_embedding", "title"]) +query_vec = pa.array([ref.to_pylist()[0]["text_embedding"]], + type=ref.schema.field("text_embedding").type) + +results = lance_ds.scanner( + nearest={ + "column": "text_embedding", + "q": query_vec[0], + "k": 5, + "nprobes": 8, + "refine_factor": 20, + }, + columns=["title", "language", "text"], +).to_table().to_pylist() +``` + +> **Hugging Face Streaming Note** +> - Streaming uses conservative ANN parameters (`nprobes`, `refine_factor`) to stay within HF rate limits. +> - Prefer local copies (`huggingface-cli download lance-format/fineweb-edu --repo-type dataset --local-dir ./fineweb-edu`) for heavy workloads, then point Lance at `./fineweb-edu/data/train.lance`. + +## Dataset Schema + +Common columns you'll find in this Lance dataset: +- `text` – cleaned passage content. +- `title` – page/article title when available. +- `url` – canonical source URL. +- `language` + `language_probability` – detector outputs for filtering. +- Quality metadata from FineWeb-Edu (e.g., heuristic scores or length stats). +- `text_embedding` – 384-dimension float32 vector for retrieval. + +## Usage Examples + +> **Search snippets for reference** +> The vector/FTS examples below show the Lance APIs you’ll use once indexes are available. The hosted dataset doesn’t yet ship ANN/FTS indexes—download locally (or build indexes yourself) before running them. Pre-built indexes are coming soon. + +### 1.
Sample documents without embeddings + +```python +scanner = ds.scanner( + columns=["title", "language", "text"], + filter="language = 'en'", + limit=5, +) +for doc in scanner.to_table().to_pylist(): + print(doc["title"], doc["language"]) + print(doc["text"][:200], "...\n") +``` + +### 2. Vector search for semantically similar passages + +```python +ref_doc = ds.take([123], columns=["text_embedding", "title", "text"]).to_pylist()[0] +emb_type = ds.to_table(columns=["text_embedding"], limit=1).schema.field("text_embedding").type +query = pa.array([ref_doc["text_embedding"]], type=emb_type) + +neighbors = ds.scanner( + nearest={ + "column": "text_embedding", + "q": query[0], + "k": 6, + "nprobes": 8, + "refine_factor": 20, + }, + columns=["title", "language", "text"], +).to_table().to_pylist()[1:] +``` + +### LanceDB Vector Search +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/fineweb-edu/data") +tbl = db.open_table("train") + +# Get a passage to use as a query +ref_passage = tbl.search().limit(1).offset(123).select(["text_embedding", "text"]).to_list()[0] +query_embedding = ref_passage["text_embedding"] + +results = tbl.search(query_embedding) \ + .limit(5) \ + .to_list() +``` + +### 3. Full-text search with Lance FTS + +```python +hits = ds.scanner( + full_text_query="quantum computing", + columns=["title", "language", "text"], + limit=10, + fast_search=True, +).to_table().to_pylist() +``` + +### LanceDB Full-Text Search +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/fineweb-edu/data") +tbl = db.open_table("train") + +results = tbl.search("quantum computing") \ + .select(["title", "language", "text"]) \ + .limit(10) \ + .to_list() +``` + + +See `fineweb_edu/example.py` in the `lance-huggingface` repo for a complete walkthrough that combines HF streaming batches with Lance-powered retrieval.
+ +## Dataset Evolution + +Lance supports flexible schema and data evolution ([docs](https://lance.org/guide/data_evolution/)). You can add/drop columns, backfill with SQL or Python, rename fields, or change data types without rewriting the whole dataset. In practice this lets you: +- Introduce fresh metadata (moderation labels, embeddings, quality scores) as new signals become available. +- Add new columns to existing datasets without re-exporting terabytes of text. +- Adjust column names or shrink storage (e.g., cast embeddings to float16) while keeping previous snapshots queryable for reproducibility. + +```python +import lance +import pyarrow as pa +import numpy as np + +# Assume ds is a local Lance dataset +# ds = lance.dataset("./fineweb_edu_local") + +base = pa.table({"id": pa.array([1, 2, 3]), "text": pa.array(["A", "B", "C"])}) +dataset = lance.write_dataset(base, "fineweb_evolution", mode="overwrite") + +# 1. Add a schema-only column (data to be added later) +dataset.add_columns(pa.field("subject", pa.string())) + +# 2. Add a column with data +dataset.add_columns({"quality_bucket": "'unknown'"}) + +# 3. Generate rich columns via Python batch UDFs +@lance.batch_udf() +def random_embedding(batch): + vecs = np.random.rand(batch.num_rows, 384).astype("float32") + return pa.RecordBatch.from_arrays( + [pa.FixedSizeListArray.from_arrays(vecs.ravel(), 384)], + names=["text_embedding"], + ) + +dataset.add_columns(random_embedding) + +# 4. Bring in annotations with merge +labels = pa.table({"id": pa.array([1, 2, 3]), "label": pa.array(["math", "history", "science"])}) +dataset.merge(labels, "id") + +# 5. Rename or cast columns as needs change +dataset.alter_columns({"path": "subject", "name": "topic"}) +dataset.alter_columns({"path": "text_embedding", "data_type": pa.list_(pa.float16(), 384)}) +``` +You can iterate on embeddings, quality tags, or moderation fields while keeping earlier dataset versions available for reproducible experiments.
diff --git a/docs/datasets/flickr30k.mdx b/docs/datasets/flickr30k.mdx new file mode 100644 index 00000000..154837d8 --- /dev/null +++ b/docs/datasets/flickr30k.mdx @@ -0,0 +1,214 @@ +--- +title: "Flickr30k" +sidebarTitle: "Flickr30k" +description: "Lance-formatted version of Flickr30k (re-distributed via lmms-lab/flickr30k) — 31,783 images, each paired with 5 human-written captions, with CLIP image and text embeddings stored inline and pre-built ANN indices on both." +--- + + +Source dataset card and downloadable files for `lance-format/flickr30k-lance`. + + +Lance-formatted version of [Flickr30k](https://shannon.cs.illinois.edu/DenotationGraph/) (re-distributed via [`lmms-lab/flickr30k`](https://huggingface.co/datasets/lmms-lab/flickr30k)) — **31,783 images, each paired with 5 human-written captions**, with CLIP image **and** text embeddings stored inline and pre-built ANN indices on both. + +## Key features + +- **Inline images** — full JPEG bytes per row. +- **Pre-computed CLIP embeddings** for both image and caption text — `IVF_PQ` indices on both columns let you do cross-modal retrieval (image→caption or caption→image) without any model at query time. +- **Full-text inverted index** on the canonical caption. +- Self-contained: no sidecar files or external image downloads. + +## Schema + +| Column | Type | Notes | +|---|---|---| +| `id` | `int64` | Row index | +| `image` | `large_binary` | Inline JPEG bytes | +| `image_id` | `string` | Original Flickr image id | +| `filename` | `string` | Original filename (e.g. 
`1000092795.jpg`) | +| `captions` | `list` | All 5 captions for the image | +| `caption` | `string` | First caption — used as canonical text for FTS / quick browsing | +| `image_emb` | `fixed_size_list` | CLIP image embedding (cosine-normalized) | +| `text_emb` | `fixed_size_list` | CLIP text embedding of the canonical caption | + +## Pre-built indices + +- `IVF_PQ` on `image_emb` — `metric=cosine` +- `IVF_PQ` on `text_emb` — `metric=cosine` (cross-modal retrieval works out of the box) +- `INVERTED` on `caption` +- `BTREE` on `image_id` + +## Splits + +A single `train.lance` table containing all 31,783 rows (the `lmms-lab/flickr30k` redistribution exposes them as a single split). The original train/val/test labels are not preserved in the source parquet. + +## Load with Lance + +```python +import lance + +ds = lance.dataset("hf://datasets/lance-format/flickr30k-lance/data/train.lance") +print(ds.count_rows(), ds.schema.names, ds.list_indices()) +``` + +## Load with LanceDB + +These tables can also be consumed by [LanceDB](https://lancedb.github.io/lancedb/), the multimodal lakehouse and embedded search library built on top of Lance, for simplified vector search and other queries. + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/flickr30k-lance/data") +tbl = db.open_table("train") +print(f"LanceDB table opened with {len(tbl)} image-caption pairs") +``` + +## Cross-modal text→image search + +```python +import lance +import pyarrow as pa +import open_clip +import torch + +# 1. Encode the query text once with the same CLIP model used at conversion. 
+model, _, _ = open_clip.create_model_and_transforms("ViT-B-32", pretrained="laion2b_s34b_b79k") +tokenizer = open_clip.get_tokenizer("ViT-B-32") +model = model.eval().cuda().half() +with torch.no_grad(): + q = model.encode_text(tokenizer(["a man surfing at sunset"]).cuda()) + q = (q / q.norm(dim=-1, keepdim=True)).float().cpu().numpy()[0] + +ds = lance.dataset("hf://datasets/lance-format/flickr30k-lance/data/train.lance") +emb_field = ds.schema.field("image_emb") +query = pa.array([q.tolist()], type=emb_field.type) + +# 2. Nearest-neighbour search against the image embedding index. +hits = ds.scanner( + nearest={"column": "image_emb", "q": query[0], "k": 10, "nprobes": 16, "refine_factor": 30}, + columns=["image_id", "caption"], +).to_table().to_pylist() +for h in hits: + print(h) +``` + +### LanceDB cross-modal text→image search + +```python +import lancedb, open_clip, torch + +model, _, _ = open_clip.create_model_and_transforms("ViT-B-32", pretrained="laion2b_s34b_b79k") +tokenizer = open_clip.get_tokenizer("ViT-B-32") +model = model.eval().cuda().half() +with torch.no_grad(): + q = model.encode_text(tokenizer(["a man surfing at sunset"]).cuda()) + q = (q / q.norm(dim=-1, keepdim=True)).float().cpu().numpy()[0] + +db = lancedb.connect("hf://datasets/lance-format/flickr30k-lance/data") +tbl = db.open_table("train") + +results = ( + tbl.search(q.tolist(), vector_column_name="image_emb") + .metric("cosine") + .select(["image_id", "caption"]) + .limit(10) + .to_list() +) +``` + +## Image→caption (image-to-text retrieval) + +```python +ds = lance.dataset("hf://datasets/lance-format/flickr30k-lance/data/train.lance") +ref = ds.take([0], columns=["image_emb", "caption"]).to_pylist()[0] +emb_field = ds.schema.field("text_emb") +query = pa.array([ref["image_emb"]], type=emb_field.type) +neighbors = ds.scanner( + nearest={"column": "text_emb", "q": query[0], "k": 10}, + columns=["caption"], +).to_table().to_pylist() +``` + +### LanceDB image→caption search + +```python 
+import lancedb + +db = lancedb.connect("hf://datasets/lance-format/flickr30k-lance/data") +tbl = db.open_table("train") + +ref = tbl.search().limit(1).select(["image_emb", "caption"]).to_list()[0] +query_embedding = ref["image_emb"] + +results = ( + tbl.search(query_embedding, vector_column_name="text_emb") + .metric("cosine") + .select(["caption"]) + .limit(10) + .to_list() +) +``` + +## Full-text search on captions + +```python +import lance +ds = lance.dataset("hf://datasets/lance-format/flickr30k-lance/data/train.lance") +hits = ds.scanner( + full_text_query="dog playing in the snow", + columns=["image_id", "caption"], + limit=10, +).to_table().to_pylist() +``` + +### LanceDB full-text search + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/flickr30k-lance/data") +tbl = db.open_table("train") + +results = ( + tbl.search("dog playing in the snow") + .select(["image_id", "caption"]) + .limit(10) + .to_list() +) +``` + +## Working with images + +```python +from pathlib import Path +import lance +ds = lance.dataset("hf://datasets/lance-format/flickr30k-lance/data/train.lance") +row = ds.take([0], columns=["image", "filename"]).to_pylist()[0] +Path(row["filename"]).write_bytes(row["image"]) +``` + +## Why Lance? + +- One dataset carries images + image embeddings + text embeddings + indices — no sidecar files. +- On-disk vector and full-text indices live next to the data, so search works on local copies and on the Hub. +- Schema evolution: add columns (new captions, alternate embeddings, moderation labels) without rewriting the data. + +## Source & license + +Converted from [`lmms-lab/flickr30k`](https://huggingface.co/datasets/lmms-lab/flickr30k), which is itself a parquet redistribution of the [original Flickr30k corpus](https://shannon.cs.illinois.edu/DenotationGraph/). Original images come from Flickr; review the Flickr30k licensing terms before redistribution. 
+ +## Citation + +``` +@article{young2014image, + title={From image descriptions to visual denotations: New similarity metrics for semantic inference over event descriptions}, + author={Young, Peter and Lai, Alice and Hodosh, Micah and Hockenmaier, Julia}, + journal={Transactions of the Association for Computational Linguistics}, + volume={2}, + pages={67--78}, + year={2014} +} +``` diff --git a/docs/datasets/food101.mdx b/docs/datasets/food101.mdx new file mode 100644 index 00000000..d74eca01 --- /dev/null +++ b/docs/datasets/food101.mdx @@ -0,0 +1,125 @@ +--- +title: "Food-101" +sidebarTitle: "Food-101" +description: "Lance-formatted version of Food-101 — 101,000 food photographs across 101 classes — sourced from ethz/food101. Inline JPEG bytes + CLIP image embeddings + IVF_PQ." +--- + + +Source dataset card and downloadable files for `lance-format/food101-lance`. + + +Lance-formatted version of [Food-101](https://www.kaggle.com/datasets/dansbecker/food-101) — 101,000 food photographs across 101 classes — sourced from [`ethz/food101`](https://huggingface.co/datasets/ethz/food101). Inline JPEG bytes + CLIP image embeddings + IVF_PQ. 
+ +## Splits + +| Split | Rows | +|-------|------| +| `train.lance` | 75,750 | +| `validation.lance` | 25,250 | + +## Schema + +| Column | Type | Notes | +|---|---|---| +| `id` | `int64` | Row index within split | +| `image` | `large_binary` | Inline JPEG bytes | +| `label` | `int32` | Class id (0-100) | +| `label_name` | `string` | One of 101 dish names (`apple_pie`, `baby_back_ribs`, …) | +| `image_emb` | `fixed_size_list` | OpenCLIP `ViT-B-32` embedding (cosine-normalized) | + +## Pre-built indices + +- `IVF_PQ` on `image_emb` — `metric=cosine` +- `BTREE` on `label` +- `BITMAP` on `label_name` + +## Quick start + +```python +import lance +ds = lance.dataset("hf://datasets/lance-format/food101-lance/data/validation.lance") +print(ds.count_rows(), ds.schema.names, ds.list_indices()) +``` + +## Load with LanceDB + +These tables can also be consumed by [LanceDB](https://lancedb.github.io/lancedb/), the multimodal lakehouse and embedded search library built on top of Lance, for simplified vector search and other queries. 
+ +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/food101-lance/data") +tbl = db.open_table("validation") +print(f"LanceDB table opened with {len(tbl)} images") +``` + +## Filter by class + +```python +import lance +ds = lance.dataset("hf://datasets/lance-format/food101-lance/data/validation.lance") +sushi = ds.scanner(filter="label_name = 'sushi'", columns=["id"], limit=5).to_table() +``` + +### Filter by class with LanceDB + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/food101-lance/data") +tbl = db.open_table("validation") +sushi = tbl.search().where("label_name = 'sushi'").select(["id"]).limit(5).to_list() +``` + +## Visual similarity search + +```python +import lance, pyarrow as pa +ds = lance.dataset("hf://datasets/lance-format/food101-lance/data/validation.lance") +emb_field = ds.schema.field("image_emb") +ref = ds.take([0], columns=["image_emb", "label_name"]).to_pylist()[0] +query = pa.array([ref["image_emb"]], type=emb_field.type) +neighbors = ds.scanner( + nearest={"column": "image_emb", "q": query[0], "k": 5, "nprobes": 16, "refine_factor": 30}, + columns=["id", "label_name"], +).to_table().to_pylist() +``` + +### LanceDB visual similarity search + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/food101-lance/data") +tbl = db.open_table("validation") + +ref = tbl.search().limit(1).select(["image_emb", "label_name"]).to_list()[0] +query_embedding = ref["image_emb"] + +results = ( + tbl.search(query_embedding) + .metric("cosine") + .select(["id", "label_name"]) + .limit(5) + .to_list() +) +``` + +## Source & license + +Converted from [`ethz/food101`](https://huggingface.co/datasets/ethz/food101). The Food-101 dataset is by Bossard et al. (ETH Zurich) — see the [original dataset page](https://data.vision.ee.ethz.ch/cvl/datasets_extra/food-101/) for licensing details. 
+ +## Citation + +``` +@inproceedings{bossard2014food, + title={Food-101 -- Mining Discriminative Components with Random Forests}, + author={Bossard, Lukas and Guillaumin, Matthieu and Van Gool, Luc}, + booktitle={European Conference on Computer Vision (ECCV)}, + year={2014} +} +``` diff --git a/docs/datasets/gqa-testdev-balanced.mdx b/docs/datasets/gqa-testdev-balanced.mdx new file mode 100644 index 00000000..fc383bb4 --- /dev/null +++ b/docs/datasets/gqa-testdev-balanced.mdx @@ -0,0 +1,153 @@ +--- +title: "GQA testdev-balanced" +sidebarTitle: "GQA testdev-balanced" +description: "Lance-formatted version of the canonical GQA testdev_balanced slice — 12,578 compositional VQA questions joined with the matching 398 images — sourced from lmms-lab/GQA." +--- + + +Source dataset card and downloadable files for `lance-format/gqa-testdev-balanced-lance`. + + +Lance-formatted version of the canonical GQA `testdev_balanced` slice — 12,578 compositional VQA questions joined with the matching 398 images — sourced from [`lmms-lab/GQA`](https://huggingface.co/datasets/lmms-lab/GQA). + +`lmms-lab/GQA` exposes instructions and images as **separate parquet configs**; this Lance dataset joins them on `imageId`, so each row has the question, the answer, the GQA reasoning-program tags, *and* the image bytes inline. + +## Splits + +| Split | Rows | Distinct images | +|-------|------|----------------| +| `testdev.lance` | 12,578 | 398 | + +> Train (`train_balanced_instructions` × `train_balanced_images`, ~943k questions × 72k images, ~10 GB of images) and val splits are not bundled by default — pass `--instr-config`/`--images-config` to `gqa/dataprep.py` to extend. 
+ +## Schema + +| Column | Type | Notes | +|---|---|---| +| `id` | `int64` | Row index | +| `image` | `large_binary` | Inline JPEG bytes (image is duplicated across rows that share an `image_id`) | +| `image_id` | `string` | GQA scene-graph image id | +| `question_id` | `string` | GQA question id | +| `question` | `string` | Compositional natural-language question | +| `answers` | `list` | One-element list (the GQA short answer) | +| `answer` | `string` | Same short answer (canonical / FTS target) | +| `full_answer` | `string?` | Full sentence answer | +| `structural` | `string?` | One of `verify`, `query`, `compare`, `choose`, `logical` | +| `semantic` | `string?` | One of `attr`, `cat`, `global`, `obj`, `rel` | +| `detailed` | `string?` | Fine-grained type (e.g. `weatherVerifyC`) | +| `is_balanced` | `bool` | GQA balanced subset flag | +| `group_global` / `group_local` | `string?` | GQA reasoning-group ids | +| `semantic_str` | `string?` | Compact description of the reasoning program | +| `image_emb` | `fixed_size_list` | CLIP image embedding (cosine-normalized) | +| `question_emb` | `fixed_size_list` | CLIP text embedding of the question | + +## Pre-built indices + +- `IVF_PQ` on `image_emb` and `question_emb` — `metric=cosine` +- `INVERTED` (FTS) on `question` and `answer` +- `BITMAP` on `structural`, `semantic`, `detailed` +- `BTREE` on `image_id`, `question_id` + +## Quick start + +```python +import lance +ds = lance.dataset("hf://datasets/lance-format/gqa-testdev-balanced-lance/data/testdev.lance") +print(ds.count_rows(), ds.schema.names, ds.list_indices()) +``` + +## Load with LanceDB + +These tables can also be consumed by [LanceDB](https://lancedb.github.io/lancedb/), the multimodal lakehouse and embedded search library built on top of Lance, for simplified vector search and other queries. 
+ +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/gqa-testdev-balanced-lance/data") +tbl = db.open_table("testdev") +print(f"LanceDB table opened with {len(tbl)} image-question pairs") +``` + +### LanceDB vector search + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/gqa-testdev-balanced-lance/data") +tbl = db.open_table("testdev") + +ref = tbl.search().limit(1).select(["question_emb", "question"]).to_list()[0] +query_embedding = ref["question_emb"] + +results = ( + tbl.search(query_embedding, vector_column_name="question_emb") + .metric("cosine") + .select(["question", "answer"]) + .limit(5) + .to_list() +) +``` + +### LanceDB full-text search + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/gqa-testdev-balanced-lance/data") +tbl = db.open_table("testdev") + +results = ( + tbl.search("color of the car") + .select(["question", "answer"]) + .limit(10) + .to_list() +) +``` + +## Filter by reasoning type + +```python +import lance +ds = lance.dataset("hf://datasets/lance-format/gqa-testdev-balanced-lance/data/testdev.lance") +verify_qs = ds.scanner(filter="structural = 'verify'", columns=["question", "answer"], limit=5).to_table() +``` + +### Filter with LanceDB + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/gqa-testdev-balanced-lance/data") +tbl = db.open_table("testdev") +verify_qs = ( + tbl.search() + .where("structural = 'verify'") + .select(["question", "answer"]) + .limit(5) + .to_list() +) +``` + +## Why Lance? + +- One dataset for the joined image + question + answer + reasoning-program metadata + dual embeddings + indices — no instructions/images parquet split to keep in sync. +- Schema evolution: add columns (alternate scene graphs, model predictions) without rewriting the data. + +## Source & license + +Converted from [`lmms-lab/GQA`](https://huggingface.co/datasets/lmms-lab/GQA). 
GQA is released under [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/) by Hudson and Manning (Stanford NLP). + +## Citation + +``` +@inproceedings{hudson2019gqa, + title={GQA: A New Dataset for Real-World Visual Reasoning and Compositional Question Answering}, + author={Hudson, Drew A. and Manning, Christopher D.}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + year={2019} +} +``` diff --git a/docs/datasets/hotpotqa-distractor.mdx b/docs/datasets/hotpotqa-distractor.mdx new file mode 100644 index 00000000..f7d33393 --- /dev/null +++ b/docs/datasets/hotpotqa-distractor.mdx @@ -0,0 +1,163 @@ +--- +title: "HotpotQA distractor" +sidebarTitle: "HotpotQA distractor" +description: "Lance-formatted version of HotpotQA — multi-hop reading-comprehension questions where each answer requires combining facts from two Wikipedia paragraphs — using the distractor config (10 candidate paragraphs per question, including gold + 8…" +--- + + +Source dataset card and downloadable files for `lance-format/hotpotqa-distractor-lance`. + + +Lance-formatted version of [HotpotQA](https://hotpotqa.github.io/) — multi-hop reading-comprehension questions where each answer requires combining facts from two Wikipedia paragraphs — using the `distractor` config (10 candidate paragraphs per question: 2 gold + 8 distractors). Sourced from [`hotpot_qa`](https://huggingface.co/datasets/hotpot_qa). 
+ +## Splits + +| Split | Rows | +|-------|------| +| `train.lance` | 90,447 | +| `validation.lance` | 7,405 | + +## Schema + +| Column | Type | Notes | +|---|---|---| +| `id` | `string` | HotpotQA question id | +| `question` | `string` | The question | +| `answer` | `string` | Reference short answer (yes / no / span) | +| `type` | `string?` | `bridge` or `comparison` | +| `level` | `string?` | `easy` / `medium` / `hard` | +| `supporting_titles` | `list` | Wikipedia titles that contain gold facts | +| `supporting_sent_ids` | `list` | Sentence indices into those titles | +| `context_titles` | `list` | All 10 paragraph titles (gold + distractors) | +| `context_sentences` | `list<list<string>>` | Sentences per paragraph | +| `context_text` | `string` | Flattened paragraphs — feeds the FTS index | +| `num_supporting_facts` | `int32` | Number of gold supporting facts | +| `question_emb` | `fixed_size_list` | sentence-transformers `all-MiniLM-L6-v2` (cosine-normalized) | + +## Pre-built indices + +- `IVF_PQ` on `question_emb` — `metric=cosine` +- `INVERTED` (FTS) on `question` and `context_text` +- `BTREE` on `id`, `answer` +- `BITMAP` on `type`, `level` + +## Quick start + +```python +import lance +ds = lance.dataset("hf://datasets/lance-format/hotpotqa-distractor-lance/data/validation.lance") +print(ds.count_rows(), ds.schema.names, ds.list_indices()) +``` + +## Load with LanceDB + +These tables can also be consumed by [LanceDB](https://lancedb.github.io/lancedb/), the multimodal lakehouse and embedded search library built on top of Lance, for simplified vector search and other queries. 
+ +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/hotpotqa-distractor-lance/data") +tbl = db.open_table("validation") +print(f"LanceDB table opened with {len(tbl)} questions") +``` + +## Multi-hop semantic search + +```python +import lance, pyarrow as pa +from sentence_transformers import SentenceTransformer + +encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device="cuda") +q = encoder.encode(["which actor played in both inception and dunkirk"], normalize_embeddings=True)[0] + +ds = lance.dataset("hf://datasets/lance-format/hotpotqa-distractor-lance/data/train.lance") +emb_field = ds.schema.field("question_emb") +hits = ds.scanner( + nearest={"column": "question_emb", "q": pa.array([q.tolist()], type=emb_field.type)[0], "k": 5}, + columns=["question", "answer", "supporting_titles"], +).to_table().to_pylist() +``` + +### LanceDB semantic search + +```python +import lancedb +from sentence_transformers import SentenceTransformer + +encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device="cuda") +q = encoder.encode(["which actor played in both inception and dunkirk"], normalize_embeddings=True)[0] + +db = lancedb.connect("hf://datasets/lance-format/hotpotqa-distractor-lance/data") +tbl = db.open_table("train") + +results = ( + tbl.search(q.tolist(), vector_column_name="question_emb") + .metric("cosine") + .select(["question", "answer", "supporting_titles"]) + .limit(5) + .to_list() +) +``` + +### LanceDB full-text search + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/hotpotqa-distractor-lance/data") +tbl = db.open_table("train") + +results = ( + tbl.search("inception dunkirk") + .select(["question", "answer"]) + .limit(10) + .to_list() +) +``` + +## Filter by question type + +```python +import lance +ds = lance.dataset("hf://datasets/lance-format/hotpotqa-distractor-lance/data/validation.lance") +hard_compare = ds.scanner( + filter="type = 'comparison' AND 
level = 'hard'", + columns=["question", "answer"], + limit=10, +).to_table() +``` + +### Filter with LanceDB + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/hotpotqa-distractor-lance/data") +tbl = db.open_table("validation") +hard_compare = ( + tbl.search() + .where("type = 'comparison' AND level = 'hard'") + .select(["question", "answer"]) + .limit(10) + .to_list() +) +``` + +## Source & license + +Converted from [`hotpot_qa`](https://huggingface.co/datasets/hotpot_qa) (`distractor` config). HotpotQA is released under [CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/). + +## Citation + +``` +@inproceedings{yang2018hotpotqa, + title={HotpotQA: A Dataset for Diverse, Explainable Multi-hop Question Answering}, + author={Yang, Zhilin and Qi, Peng and Zhang, Saizheng and Bengio, Yoshua and Cohen, William W. and Salakhutdinov, Ruslan and Manning, Christopher D.}, + booktitle={Empirical Methods in Natural Language Processing (EMNLP)}, + year={2018} +} +``` diff --git a/docs/datasets/imagenet-1k-val.mdx b/docs/datasets/imagenet-1k-val.mdx new file mode 100644 index 00000000..27a2facb --- /dev/null +++ b/docs/datasets/imagenet-1k-val.mdx @@ -0,0 +1,163 @@ +--- +title: "ImageNet-1k Validation" +sidebarTitle: "ImageNet-1k Validation" +description: "A Lance-formatted version of the canonical 50,000-image ImageNet-1k validation split (also known as ILSVRC2012 val) sourced from benjamin-paine/imagenet-1k. All 50 k JPEGs are stored inline alongside CLIP embeddings and a pre-built IVF_PQ ANN index." +--- + + +Source dataset card and downloadable files for `lance-format/imagenet-1k-val-lance`. + + +A Lance-formatted version of the **canonical 50,000-image ImageNet-1k validation split** (also known as ILSVRC2012 val) sourced from [`benjamin-paine/imagenet-1k`](https://huggingface.co/datasets/benjamin-paine/imagenet-1k). All 50 k JPEGs are stored inline alongside CLIP embeddings and a pre-built `IVF_PQ` ANN index. 
+ +> **Why only the validation split?** The 1.28 M ImageNet-1k train split is ~155 GB and is intentionally out of scope for this Lance distribution. The val split is the canonical evaluation slice for image-classification benchmarks and is small enough (~6.7 GB raw, ~7 GB Lance) to be stored entirely inline alongside the embeddings. + +## Splits + +| Split | Rows | +|-------|------| +| `validation.lance` | 50,000 | + +## Schema + +| Column | Type | Notes | +|---|---|---| +| `id` | `int64` | Row index within the split (0-49,999) | +| `image` | `large_binary` | Inline JPEG bytes | +| `label` | `int32` | Class id (0-999) | +| `label_name` | `string` | First synonym of the synset, underscore-separated (e.g. `golden_retriever`) | +| `image_emb` | `fixed_size_list` | OpenCLIP `ViT-B-32` / `laion2b_s34b_b79k` embedding (cosine-normalized) | + +The full WordNet synset descriptions for each class are available in the dataset metadata under `lance:class_names` (comma-separated). + +## Pre-built indices + +- `IVF_PQ` on `image_emb` — `metric=cosine`, `num_partitions=64` +- `BTREE` on `label` +- `BITMAP` on `label_name` + +## Quick start + +```python +import lance + +ds = lance.dataset("hf://datasets/lance-format/imagenet-1k-val-lance/data/validation.lance") +print(ds.count_rows(), ds.schema.names, ds.list_indices()) +``` + +## Load with LanceDB + +These tables can also be consumed by [LanceDB](https://lancedb.github.io/lancedb/), the multimodal lakehouse and embedded search library built on top of Lance, for simplified vector search and other queries. 
+ +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/imagenet-1k-val-lance/data") +tbl = db.open_table("validation") +print(f"LanceDB table opened with {len(tbl)} images") +``` + +> **Tip — for production use, download locally first** to avoid Hub rate limits: +> ```bash +> hf download lance-format/imagenet-1k-val-lance --repo-type dataset --local-dir ./imagenet-1k-val-lance +> ``` + +## Vector search example + +```python +import lance +import pyarrow as pa + +ds = lance.dataset("hf://datasets/lance-format/imagenet-1k-val-lance/data/validation.lance") +emb_field = ds.schema.field("image_emb") +ref = ds.take([0], columns=["image_emb", "label_name"]).to_pylist()[0] +query = pa.array([ref["image_emb"]], type=emb_field.type) + +neighbors = ds.scanner( + nearest={"column": "image_emb", "q": query[0], "k": 5, "nprobes": 16, "refine_factor": 30}, + columns=["id", "label_name"], +).to_table().to_pylist() +print(f"reference: {ref['label_name']}") +for n in neighbors: + print(n) +``` + +### LanceDB vector search + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/imagenet-1k-val-lance/data") +tbl = db.open_table("validation") + +ref = tbl.search().limit(1).select(["image_emb", "label_name"]).to_list()[0] +query_embedding = ref["image_emb"] + +results = ( + tbl.search(query_embedding) + .metric("cosine") + .select(["id", "label_name"]) + .limit(5) + .to_list() +) +``` + +## Filter by class + +```python +import lance +ds = lance.dataset("hf://datasets/lance-format/imagenet-1k-val-lance/data/validation.lance") +goldens = ds.scanner(filter="label_name = 'golden_retriever'", columns=["id"], limit=5).to_table() +``` + +### Filter by class with LanceDB + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/imagenet-1k-val-lance/data") +tbl = db.open_table("validation") +goldens = ( + tbl.search() + .where("label_name = 'golden_retriever'") + .select(["id"]) + .limit(5) + .to_list() +) +``` + +## 
Working with images + +```python +from pathlib import Path +import lance + +ds = lance.dataset("hf://datasets/lance-format/imagenet-1k-val-lance/data/validation.lance") +row = ds.take([0], columns=["image", "label_name"]).to_pylist()[0] +Path(f"sample_{row['label_name']}.jpg").write_bytes(row["image"]) +``` + +## Why Lance? + +- One dataset for images + embeddings + indices + metadata — no sidecar files. +- On-disk vector and scalar (`BTREE` / `BITMAP`) indices live next to the data, so search and filtering work on local copies and on the Hub. +- Schema evolution: add columns (model predictions, fresh embeddings, robustness annotations) without rewriting the data. + +## Source & license + +Converted from [`benjamin-paine/imagenet-1k`](https://huggingface.co/datasets/benjamin-paine/imagenet-1k), itself a redistribution of the [ILSVRC2012 ImageNet-1k validation split](https://image-net.org/challenges/LSVRC/2012/). All use is subject to the [ImageNet terms of access](https://image-net.org/download.php) — for **research use only**. + +## Citation + +``` +@inproceedings{deng2009imagenet, + title={ImageNet: A Large-Scale Hierarchical Image Database}, + author={Deng, Jia and Dong, Wei and Socher, Richard and Li, Li-Jia and Li, Kai and Fei-Fei, Li}, + booktitle={IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + year={2009} +} +``` diff --git a/docs/datasets/index.mdx b/docs/datasets/index.mdx new file mode 100644 index 00000000..7879f128 --- /dev/null +++ b/docs/datasets/index.mdx @@ -0,0 +1,182 @@ +--- +title: "Datasets" +sidebarTitle: "Overview" +description: "Browse Lance-format datasets ready to query directly from the Hugging Face Hub." +--- + +The [`lance-format`](https://huggingface.co/lance-format) organization on Hugging Face publishes a growing +catalog of multimodal datasets in Lance format. 
Each one bundles the raw data (images, audio, video, or text) and +pre-computed embeddings as first-class columns, alongside on-disk vector / full-text indices, in the same dataset — +so vector search, full-text search, and filtered scans work directly via `hf://` URIs without downloading. + +This is powered under the hood by the [Lance format's native Hugging Face integration](https://lance.org/integrations/huggingface/) +(via the [`pylance`](https://pypi.org/project/pylance/) library). LanceDB sits on top of Lance and gives you a +convenient table-style interface to query these datasets straight from the Hub: + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/<dataset-name>/data") +tbl = db.open_table("train") + +# Vector search, full-text search, or filtered scans — directly on the Hub +results = tbl.search(query).limit(10).to_list() +``` + +Click any card below for usage examples, schema, and pre-built indices. For a complete walkthrough of the +integration itself, see the [Hugging Face Hub integration page](/integrations/ai/huggingface). + +{/* HF_SYNC:START — content between these markers is generated by `make hf-sync`. Do not edit by hand. */} + +## Image Classification + + + + `lance-format/mnist-lance` — A Lance-formatted version of the classic MNIST handwritten-digit dataset with 70,000 28×28 grayscale digits stored inline alongside CLIP image embeddings and a pre-built ANN index. + + + `lance-format/cifar10-lance` — A Lance-formatted version of CIFAR-10 with 60,000 32×32 RGB images across 10 classes, stored inline with CLIP embeddings and a pre-built IVF_PQ ANN index. + + + `lance-format/fashion-mnist-lance` — A Lance-formatted version of Fashion-MNIST with 70,000 28×28 grayscale clothing images stored inline alongside CLIP embeddings and a pre-built IVF_PQ ANN index. + + + `lance-format/food101-lance` — Lance-formatted version of Food-101 — 101,000 food photographs across 101 classes — sourced from ethz/food101. 
Inline JPEG bytes + CLIP image embeddings + IVF_PQ. + + + `lance-format/oxford-pets-lance` — Lance-formatted version of the Oxford-IIIT Pet dataset — 7,390 cat & dog photos across 37 breeds — sourced from pcuenq/oxford-pets. + + + `lance-format/stanford-cars-lance` — Lance-formatted version of the Stanford Cars dataset — 8,144 training images across 196 fine-grained car make/model/year classes — sourced from Multimodal-Fatima/StanfordCars_train. + + + `lance-format/imagenet-1k-val-lance` — A Lance-formatted version of the canonical 50,000-image ImageNet-1k validation split (also known as ILSVRC2012 val) sourced from benjamin-paine/imagenet-1k. All 50 k JPEGs are stored inline alongside CLIP embeddings and a pre-built IVF_PQ ANN index. + + + `lance-format/eurosat-lance` — Lance-formatted version of EuroSAT — Sentinel-2 satellite imagery (RGB) covering 27,000 64×64 tiles across 10 land-cover classes, sourced from blanchon/EuroSAT_RGB. + + + +## Object Detection & Segmentation + + + + `lance-format/coco-detection-2017-lance` — Lance-formatted version of the COCO 2017 object detection benchmark — sourced from detection-datasets/coco — with 123,287 images and the full per-image list of bounding boxes, category labels, and CLIP image embeddings, all stored inline. + + + `lance-format/pascal-voc-2012-segmentation-lance` — A Lance-formatted version of the Pascal VOC 2012 semantic segmentation split (sourced from nateraw/pascal-voc-2012) — 2,913 image / mask pairs with CLIP image embeddings stored inline and a pre-built IVF_PQ ANN index. + + + `lance-format/ade20k-lance` — Lance-formatted version of the full ADE20K scene parsing benchmark (sourced from 1aurent/ADE20K) — 27,574 scene images with semantic and instance segmentation maps, scene labels, and per-object metadata, all stored inline. 
+ + + `lance-format/kitti-2d-detection-lance` — Lance-formatted version of the KITTI 2D Object Detection benchmark — 7,481 training images from the KITTI Vision Benchmark Suite with 2D bounding boxes plus the full 3D-box / observation-angle metadata. Sourced from nateraw/kitti so no manual… + + + +## Image Retrieval + + + + `lance-format/coco-captions-2017-lance` — Lance-formatted version of the COCO Captions 2017 corpus, redistributed via lmms-lab/COCO-Caption2017. Each row is one image with 5–7 human-written captions, CLIP image embedding, and CLIP text embedding of the canonical caption — all stored inline. + + + `lance-format/flickr30k-lance` — Lance-formatted version of Flickr30k (re-distributed via lmms-lab/flickr30k) — 31,783 images, each paired with 5 human-written captions, with CLIP image and text embeddings stored inline and pre-built ANN indices on both. + + + `lance-format/laion-1m` — A lance dataset of LAION image-text corpus (~1M rows) with inline JPEG bytes, CLIP embeddings (img_emb), and full metadata available directly from the Hub: hf://datasets/lance-format/laion-1m/data/train.lance. + + + +## Visual Question Answering + + + + `lance-format/chartqa-lance` — Lance-formatted version of ChartQA — VQA over scientific and business charts that combine logical and visual reasoning — sourced from lmms-lab/ChartQA. + + + `lance-format/docvqa-lance` — Lance-formatted version of DocVQA — VQA over document images (industry / government scans, multi-page reports, forms, receipts) — sourced from lmms-lab/DocVQA (DocVQA config). + + + `lance-format/textvqa-lance` — Lance-formatted version of TextVQA — VQA where the question requires reading text in the image — sourced from lmms-lab/textvqa. + + + `lance-format/vqav2-lance` — Lance-formatted version of VQAv2 — Visual Question Answering on COCO images, sourced from lmms-lab/VQAv2. 
Each row is a (image, question, 10 answers) triple with two CLIP embeddings (image + question text) so the same dataset supports both visual… + + + `lance-format/gqa-testdev-balanced-lance` — Lance-formatted version of the canonical GQA testdev_balanced slice — 12,578 compositional VQA questions joined with the matching 398 images — sourced from lmms-lab/GQA. + + + +## Text QA + + + + `lance-format/squad-v2-lance` — Lance-formatted version of SQuAD v2 — Stanford Question Answering Dataset, version 2 — with MiniLM sentence embeddings stored inline alongside the questions, contexts, and answers. + + + `lance-format/trivia-qa-lance` — Lance-formatted version of TriviaQA (rc.nocontext config) — a question-answering dataset of trivia questions paired with answer aliases — with MiniLM sentence embeddings stored inline. + + + `lance-format/hotpotqa-distractor-lance` — Lance-formatted version of HotpotQA — multi-hop reading-comprehension questions where each answer requires combining facts from two Wikipedia paragraphs — using the distractor config (10 candidate paragraphs per question, including gold + 8… + + + `lance-format/natural-questions-val-lance` — Lance-formatted version of the Natural Questions validation split — 7,830 real Google search queries with their full Wikipedia articles and 1–5 annotator labels per question. Sourced from google-research-datasets/natural_questions. + + + `lance-format/ms-marco-v2.1-lance` — Lance-formatted version of MS MARCO v2.1 — Microsoft's machine reading comprehension benchmark — with MiniLM query embeddings stored inline alongside the candidate passages and human-written answers. + + + +## Text Corpora + + + + `lance-format/fineweb-edu` — FineWeb-edu dataset with over 1.5 billion rows. Each passage ships with cleaned text, metadata, and 384-dim text embeddings for retrieval-heavy workloads. 
+ + + +## Speech + + + + `lance-format/librispeech-clean-lance` — Lance-formatted version of the LibriSpeech ASR clean configuration (sourced from openslr/librispeech_asr). Audio is stored inline as FLAC bytes (no re-encoding); transcripts are sentence-embedded so semantic transcript search works out of the box. + + + +## Video + + + + `lance-format/openvid-lance` — Lance format version of the OpenVid dataset with 937,957 high-quality videos stored with inline video blobs, embeddings, and rich metadata. + + + +## Robotics + + + + `lance-format/lerobot-pusht-lance` — Lance-formatted version of lerobot/pusht — the canonical PushT benchmark from the Diffusion Policy paper — packaged using the same three-table layout as the existing lance-format/lerobot-xvla-soft-fold so consumers can flip between datasets without… + + + `lance-format/lerobot-xvla-soft-fold` — This dataset was created using LeRobot. + + + +{/* HF_SYNC:END */} + +## Share your own dataset + +Got a multimodal dataset you want to publish? Convert it to Lance and push it to the Hub! +Anyone who opens it gets vector search, full-text search, and filtered scans on the data out of the box, +without recreating the embeddings or indexes on their end. + + +A step-by-step walkthrough on the LanceDB blog covering CLI setup, packaging your dataset, pushing to your namespace, and writing a dataset card. + + +Or browse the [latest trending Lance datasets](https://huggingface.co/datasets?format=format:lance&sort=trending) on Hugging Face. 
diff --git a/docs/datasets/kitti-2d-detection.mdx b/docs/datasets/kitti-2d-detection.mdx new file mode 100644 index 00000000..cce4c249 --- /dev/null +++ b/docs/datasets/kitti-2d-detection.mdx @@ -0,0 +1,190 @@ +--- +title: "KITTI 2D Detection" +sidebarTitle: "KITTI 2D Detection" +description: "Lance-formatted version of the KITTI 2D Object Detection benchmark — 7,481 training images from the KITTI Vision Benchmark Suite with 2D bounding boxes plus the full 3D-box / observation-angle metadata. Sourced from nateraw/kitti so no manual…" +--- + + +Source dataset card and downloadable files for `lance-format/kitti-2d-detection-lance`. + + +Lance-formatted version of the [KITTI 2D Object Detection benchmark](https://www.cvlibs.net/datasets/kitti/eval_object.php?obj_benchmark=2d) — 7,481 training images from the KITTI Vision Benchmark Suite with 2D bounding boxes plus the full 3D-box / observation-angle metadata. Sourced from [`nateraw/kitti`](https://huggingface.co/datasets/nateraw/kitti) so no manual signup or download from cvlibs.net is required. + +KITTI is the canonical autonomous-driving 2D / 3D detection benchmark — useful for AV perception research, robust real-world benchmarking, and as a small-scale companion to nuScenes / Waymo. + +## Splits + +| Split | Rows | +|-------|------| +| `train.lance` | 7,481 | + +(The `test` split has no labels published, so we omit it. Add it back via `--splits train test` if you want the unlabeled images as well.) 
+ +## Schema + +| Column | Type | Notes | +|---|---|---| +| `id` | `int64` | Row index within split | +| `image` | `large_binary` | Inline JPEG bytes (re-encoded from the source PNG) | +| `bboxes` | `list>` | 2D box per object — `[left, top, right, bottom]` in pixel coords | +| `alphas` | `list` | Observation angle (radians, KITTI convention) | +| `dimensions` | `list>` | 3D box `(h, w, l)` in metres | +| `locations` | `list>` | 3D centre `(x, y, z)` in camera coords (metres) | +| `rotation_y` | `list` | Yaw angle in camera coords (radians) | +| `occluded` | `list` | KITTI occlusion flag (0=visible, 1=partly, 2=largely, 3=unknown) | +| `truncated` | `list` | Truncation fraction (0.0-1.0) | +| `types` | `list` | Class name per object (e.g. `Car`, `Pedestrian`, `Cyclist`, `DontCare`) | +| `num_objects` | `int32` | Number of annotated objects | +| `types_present` | `list` | Deduped class names — feeds the LABEL_LIST index | +| `image_emb` | `fixed_size_list` | OpenCLIP `ViT-B-32` image embedding (cosine-normalized) | + +## Pre-built indices + +- `IVF_PQ` on `image_emb` — `metric=cosine` +- `BTREE` on `num_objects` +- `LABEL_LIST` on `types_present` + +## Quick start + +```python +import lance + +ds = lance.dataset("hf://datasets/lance-format/kitti-2d-detection-lance/data/train.lance") +print(ds.count_rows(), ds.schema.names, ds.list_indices()) +``` + +## Load with LanceDB + +These tables can also be consumed by [LanceDB](https://lancedb.github.io/lancedb/), the multimodal lakehouse and embedded search library built on top of Lance, for simplified vector search and other queries. 
+ +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/kitti-2d-detection-lance/data") +tbl = db.open_table("train") +print(f"LanceDB table opened with {len(tbl)} frames") +``` + +## Read a frame with annotations + +```python +import io +import lance +from PIL import Image, ImageDraw + +ds = lance.dataset("hf://datasets/lance-format/kitti-2d-detection-lance/data/train.lance") +row = ds.take([0], columns=["image", "bboxes", "types"]).to_pylist()[0] + +img = Image.open(io.BytesIO(row["image"])).convert("RGB") +draw = ImageDraw.Draw(img) +for (l, t, r, b), cls in zip(row["bboxes"], row["types"]): + if cls == "DontCare": + continue + draw.rectangle([l, t, r, b], outline="lime", width=2) + draw.text((l + 4, t + 2), cls, fill="lime") +img.save("kitti.jpg") +``` + +## Filter by classes + +```python +import lance +ds = lance.dataset("hf://datasets/lance-format/kitti-2d-detection-lance/data/train.lance") + +# Frames containing both a Car and a Cyclist (LABEL_LIST index makes this fast). +both = ds.scanner( + filter="array_has_all(types_present, ['Car', 'Cyclist'])", + columns=["id", "types_present"], + limit=10, +).to_table() + +# Frames with at least 10 objects (for crowded-scene experiments). 
+crowded = ds.scanner(filter="num_objects >= 10", columns=["id"], limit=10).to_table() +``` + +### Filter by classes with LanceDB + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/kitti-2d-detection-lance/data") +tbl = db.open_table("train") + +both = ( + tbl.search() + .where("array_has_all(types_present, ['Car', 'Cyclist'])") + .select(["id", "types_present"]) + .limit(10) + .to_list() +) + +crowded = ( + tbl.search() + .where("num_objects >= 10") + .select(["id"]) + .limit(10) + .to_list() +) +``` + +## Visual similarity search + +```python +import lance +import pyarrow as pa + +ds = lance.dataset("hf://datasets/lance-format/kitti-2d-detection-lance/data/train.lance") +emb_field = ds.schema.field("image_emb") +ref = ds.take([0], columns=["image_emb"]).to_pylist()[0]["image_emb"] +query = pa.array([ref], type=emb_field.type) + +neighbors = ds.scanner( + nearest={"column": "image_emb", "q": query[0], "k": 5, "nprobes": 16, "refine_factor": 30}, + columns=["id", "types_present"], +).to_table().to_pylist() +``` + +### LanceDB visual similarity search + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/kitti-2d-detection-lance/data") +tbl = db.open_table("train") + +ref = tbl.search().limit(1).select(["image_emb"]).to_list()[0] +query_embedding = ref["image_emb"] + +results = ( + tbl.search(query_embedding) + .metric("cosine") + .select(["id", "types_present"]) + .limit(5) + .to_list() +) +``` + +## Why Lance? + +- One dataset for images + 2D + 3D annotations + embeddings + indices — no parallel `image_2/` and `label_2/` folders. +- On-disk vector and label-list indices live next to the data, so search and class-based filtering work on local copies and on the Hub. +- Schema evolution: add columns (LIDAR features, alternative embeddings, model predictions) without rewriting the data. + +## Source & license + +Converted from [`nateraw/kitti`](https://huggingface.co/datasets/nateraw/kitti). 
KITTI is released under the [CC BY-NC-SA 3.0 license](https://creativecommons.org/licenses/by-nc-sa/3.0/) by Karlsruhe Institute of Technology and Toyota Technological Institute at Chicago — **non-commercial research use only**. See the [KITTI license page](https://www.cvlibs.net/datasets/kitti/) for details. + +## Citation + +``` +@inproceedings{geiger2012are, + title={Are we ready for autonomous driving? The KITTI vision benchmark suite}, + author={Geiger, Andreas and Lenz, Philip and Urtasun, Raquel}, + booktitle={IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + year={2012} +} +``` diff --git a/docs/datasets/laion-1m.mdx b/docs/datasets/laion-1m.mdx new file mode 100644 index 00000000..2f1d2e40 --- /dev/null +++ b/docs/datasets/laion-1m.mdx @@ -0,0 +1,284 @@ +--- +title: "LAION-1M" +sidebarTitle: "LAION-1M" +description: "A lance dataset of LAION image-text corpus (~1M rows) with inline JPEG bytes, CLIP embeddings (img_emb), and full metadata available directly from the Hub: hf://datasets/lance-format/laion-1m/data/train.lance." +--- + + +Source dataset card and downloadable files for `lance-format/laion-1m`. + + +A lance dataset of LAION image-text corpus (~1M rows) with inline JPEG bytes, CLIP embeddings (`img_emb`), and full metadata available directly from the Hub: `hf://datasets/lance-format/laion-1m/data/train.lance`. + + +## Key Features + +- **Images stored inline** – the `image` column is binary data, so sampling/exporting images never leaves Lance. +- **Prebuilt ANN index** – `img_emb` ships with IVF_PQ for instant similarity search. +- **Metadata rich** – captions, URLs, NSFW flags, EXIF, dimensions, similarity scores, etc. +- **Lance<>HF integration** – access via `datasets` or connect with Lance for ANN search, image export, and any operation that needs the vector index or binary blobs. 
+ +## Load with `datasets.load_dataset` + +```python +import datasets + +hf_ds = datasets.load_dataset( + "lance-format/laion-1m", + split="train", + streaming=True +) +# Take first three rows and print captions +for row in hf_ds.take(3): + print(row["caption"]) +``` + +## Load with Lance + +Use Lance for ANN search, image export, and any operation that needs the vector index or binary blobs: + +```python +import lance + +ds = lance.dataset("hf://datasets/lance-format/laion-1m/data/train.lance") +print(ds.count_rows()) +``` + +These tables can also be consumed by [LanceDB](https://lancedb.github.io/lancedb/), the multimodal lakehouse and embedded search library built on top of Lance, for simplified vector search and other queries. + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/laion-1m/data") +tbl = db.open_table("train") +print(f"LanceDB table opened with {len(tbl)} image-text pairs") +``` + +> **⚠️ HuggingFace Streaming Note** +> - Download the dataset locally (`huggingface-cli download lance-format/laion-1m --repo-type dataset --local-dir ./laion`) for heavy usage, then point Lance at `./laion/data/train.lance` to use the IVF_PQ index. + + +## Why Lance? + +- Optimized for AI workloads: Lance keeps multimodal data and vector search-ready storage in the same columnar format designed for accelerator-era retrieval (see [lance.org](https://lance.org)). +- Images + embeddings + metadata travel as one tabular dataset. +- On-disk, scalable ANN index +- Schema evolution lets you add new features/columns (moderation tags, embeddings, etc.) without rewriting the raw data. + +## Quick Start (Lance) + +### Inspecting Existing Indices + +This dataset comes with a built-in vector (IVF) index for image embeddings. 
You can inspect the prebuilt indices on the dataset: + +```python +import lance + +dataset = lance.dataset("hf://datasets/lance-format/laion-1m/data/train.lance") + +# List all indices +indices = dataset.list_indices() +print(indices) +``` + +While this dataset comes with pre-built indices, you can also create your own custom indices if needed. For example: + +```python +# ds is a local Lance dataset +ds.create_index( + "img_emb", + index_type="IVF_PQ", + num_partitions=256, + num_sub_vectors=96, + replace=True, +) +``` + +```python +# ds is a local Lance dataset +ds.create_fts_index("caption") +``` + +## Vector Search (Lance) + +```python +import lance +import pyarrow as pa + +lance_ds = lance.dataset("hf://datasets/lance-format/laion-1m/data/train.lance") + +# Vector search via img_emb IVF_PQ index +emb_field = lance_ds.schema.field("img_emb") +# Wrap the 768-dim vector in an outer list so the array holds one fixed-size-list element +query = pa.array([list(range(768))], type=emb_field.type) + +neighbors = lance_ds.scanner( + nearest={ + "column": emb_field.name, + "q": query[0], + "k": 6, + "nprobes": 16, + "refine_factor": 30, + }, + columns=["caption", "url", "similarity"], +).to_table().to_pylist() +``` + +## Storing & Retrieving Multimodal Data + +```python +from pathlib import Path + +rows = lance_ds.take([0, 1], columns=["image", "caption"]).to_pylist() +for idx, row in enumerate(rows): + Path("samples").mkdir(exist_ok=True) + with open(f"samples/{idx}.jpg", "wb") as f: + f.write(row["image"]) +``` + +Images are stored inline as binary columns (regular Lance binary, not the special blob handle used in OpenVid). They behave like any other column—scan captions without touching `image`, then `take()` when you want the bytes. + +## Dataset Schema + +Core fields: +- `image_path`, `image` +- `caption`, `url` +- `NSFW` (uppercase), `similarity`, `LICENSE`, `key`, `status`, `error_message` +- `width`, `height`, `original_width`, `original_height` +- `exif`, `md5` +- `img_emb` + + +## Usage Examples + +### 1. 
Browse metadata + +```python +scanner = ds.scanner(columns=["caption", "url", "similarity"], limit=5) +for row in scanner.to_table().to_pylist(): + print(row) +``` + +### 2. Export images + +```python +rows = ds.take(range(3), columns=["image", "caption"]).to_pylist() +for i, row in enumerate(rows): + with open(f"sample_{i}.jpg", "wb") as f: + f.write(row["image"]) +``` + +### 3. Vector similarity search + +```python +emb_field = ds.schema.field("img_emb") +ref = ds.take([123], columns=["img_emb"]).to_pylist()[0] +query = pa.array([ref["img_emb"]], type=emb_field.type) + +neighbors = ds.scanner( + nearest={ + "column": emb_field.name, + "q": query[0], + "k": 6, + "nprobes": 16, + "refine_factor": 30, + }, + columns=["caption", "url", "similarity"], +).to_table().to_pylist() +``` + +### LanceDB Vector Similarity Search + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/laion-1m/data") +tbl = db.open_table("train") +query_embedding = list(range(768)) + +results = tbl.search(query_embedding) \ + .limit(5) \ + .to_list() + +``` + +### LanceDB Full-Text Search + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/laion-1m/data") +tbl = db.open_table("train") + +results = tbl.search("dog running") \ + .select(["caption", "url", "similarity"]) \ + .limit(10) \ + .to_list() +``` + +## Dataset Evolution + +Lance supports flexible schema and data evolution ([docs](https://lance.org/guide/data_evolution/)). You can add/drop columns, backfill with SQL or Python, rename fields, or change data types without rewriting the whole dataset. In practice this lets you: +- Introduce fresh metadata (moderation labels, embeddings, quality scores) as new signals become available. +- Add new columns to existing datasets without re-exporting terabytes of images. +- Adjust column names or shrink storage (e.g., cast embeddings to float16) while keeping previous snapshots queryable for reproducibility. 
+ +```python +import lance +import pyarrow as pa +import numpy as np + +# Assumes you have a writable local copy of the data stored as a Lance dataset +dataset = lance.dataset("./laion_1m_local") + +# 1. Add a schema-only column (data to be added later) +dataset.add_columns(pa.field("moderation_label", pa.string())) + +# 2. Add a column with data backfill using a SQL expression +dataset.add_columns( + { + "moderation_label": "case WHEN \"NSFW\" > 0.5 THEN 'review' ELSE 'ok' END" + } +) + +# 3. Generate rich columns via Python batch UDFs +@lance.batch_udf() +def random_embedding(batch): + arr = np.random.rand(batch.num_rows, 128).astype("float32") + return pa.RecordBatch.from_arrays( + [pa.FixedSizeListArray.from_arrays(arr.ravel(), 128)], + names=["embedding"], + ) + +dataset.add_columns(random_embedding) + +# 4. Bring in offline annotations with merge +labels = pa.table({ + "id": pa.array([1, 2, 3]), + "label": pa.array(["horse", "rabbit", "cat"]), +}) +dataset.merge(labels, "id") + +# 5. Rename or cast columns as needs change +dataset.alter_columns({"path": "quality_bucket", "name": "quality_tier"}) +dataset.alter_columns({"path": "embedding", "data_type": pa.list_(pa.float16(), 128)}) +``` + +These operations are automatically versioned, so prior experiments can still point to earlier versions while the dataset keeps evolving. + +## Citation + +``` +@article{schuhmann2022laion5b, + title={LAION-5B: An open large-scale dataset for training next generation image-text models}, + author={Schuhmann, Christoph and others}, + journal={NeurIPS Datasets and Benchmarks Track}, + year={2022} +} +``` + +## License + +Content inherits LAION’s original licensing and safety guidelines. Review [LAION policy](https://laion.ai/blog/laion-5b/) before downstream use. 
diff --git a/docs/datasets/lerobot-pusht.mdx b/docs/datasets/lerobot-pusht.mdx new file mode 100644 index 00000000..2030efb5 --- /dev/null +++ b/docs/datasets/lerobot-pusht.mdx @@ -0,0 +1,123 @@ +--- +title: "LeRobot PushT" +sidebarTitle: "LeRobot PushT" +description: "Lance-formatted version of lerobot/pusht — the canonical PushT benchmark from the Diffusion Policy paper — packaged using the same three-table layout as the existing lance-format/lerobot-xvla-soft-fold so consumers can flip between datasets without…" +--- + + +Source dataset card and downloadable files for `lance-format/lerobot-pusht-lance`. + + +Lance-formatted version of [`lerobot/pusht`](https://huggingface.co/datasets/lerobot/pusht) — the canonical PushT benchmark from the [Diffusion Policy paper](https://diffusion-policy.cs.columbia.edu/) — packaged using the same three-table layout as the existing [`lance-format/lerobot-xvla-soft-fold`](https://huggingface.co/datasets/lance-format/lerobot-xvla-soft-fold) so consumers can flip between datasets without changing code. + +## Tables + +The dataset is published as three Lance tables under `data/`: + +| Table | Purpose | +|---|---| +| `frames.lance` | One row per frame — observations, actions, episode index, task index. | +| `videos.lance` | One row per source MP4 — full per-camera video stored as an inline blob. | +| `episodes.lance` | One row per episode — full timestamps + actions + per-camera video segment blobs. | + +Use `frames.lance` for low-level training (loss-per-timestep), `episodes.lance` when you need the full trajectory + matching video segments, and `videos.lance` when you want to pull entire raw videos by camera. 
+ +## Quick start + +```python +import lance + +frames = lance.dataset("hf://datasets/lance-format/lerobot-pusht-lance/data/frames.lance") +videos = lance.dataset("hf://datasets/lance-format/lerobot-pusht-lance/data/videos.lance") +episodes = lance.dataset("hf://datasets/lance-format/lerobot-pusht-lance/data/episodes.lance") + +print("frames:", frames.count_rows()) +print("videos:", videos.count_rows()) +print("episodes:", episodes.count_rows()) +``` + +## Load with LanceDB + +These tables can also be consumed by [LanceDB](https://lancedb.github.io/lancedb/), the multimodal lakehouse and embedded search library built on top of Lance, for simplified vector search and other queries. Each `.lance` file in `data/` is a table — open by name. + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/lerobot-pusht-lance/data") + +frames = db.open_table("frames") +videos = db.open_table("videos") +episodes = db.open_table("episodes") + +print("frames:", len(frames)) +print("videos:", len(videos)) +print("episodes:", len(episodes)) +``` + +### LanceDB query example + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/lerobot-pusht-lance/data") +tbl = db.open_table("frames") + +# Browse a few frames from the first episode +results = ( + tbl.search() + .where("episode_index = 0") + .select(["episode_index", "frame_index", "timestamp"]) + .limit(5) + .to_list() +) +for row in results: + print(row) +``` + +## Pull a video segment for one episode + +```python +from pathlib import Path +import lance + +episodes = lance.dataset("hf://datasets/lance-format/lerobot-pusht-lance/data/episodes.lance") +row = episodes.take([0]).to_pylist()[0] + +# The episode row carries one ``_video_blob`` per camera angle. +for col, value in row.items(): + if col.endswith("_video_blob") and value: + Path(f"{col}.mp4").write_bytes(value) + print(f"saved {col}.mp4 ({len(value)/1e6:.1f} MB)") +``` + +## Why Lance? 
+ +- One dataset bundles low-level frames + full-episode trajectories + raw video blobs — no scattered parquet shards or sidecar MP4 directories. +- Inline video blobs use Lance's blob encoding so metadata scans never load the bytes; you fetch them on demand via `take_blobs`. +- Schema evolution: add columns (alternate camera streams, language annotations, model predictions) without rewriting the data. + +## Source & license + +Converted from [`lerobot/pusht`](https://huggingface.co/datasets/lerobot/pusht) (LeRobot v3.0 dataset format). PushT is released under the Apache 2.0 license by the LeRobot project and the Diffusion Policy authors. + +## Citation + +``` +@misc{cadene2024lerobot, + title={LeRobot: State-of-the-art Machine Learning for Real-World Robotics in PyTorch}, + author={R{\'e}mi Cadene and Simon Alibert and Alexander Soare and Quentin Gallou{\'e}dec and Adil Zouitine and Steven Palma and Pepijn Kooijmans and Michel Aractingi and Mustafa Shukor and Martino Russi and Francesco Capuano and Caroline Pascal and Jade Choghari and Jess Moss and Thomas Wolf}, + year={2024}, + url={https://github.com/huggingface/lerobot} +} + +@inproceedings{chi2023diffusion, + title={Diffusion Policy: Visuomotor Policy Learning via Action Diffusion}, + author={Chi, Cheng and Feng, Siyuan and Du, Yilun and Xu, Zhenjia and Cousineau, Eric and Burchfiel, Benjamin and Song, Shuran}, + booktitle={Robotics: Science and Systems}, + year={2023} +} +``` diff --git a/docs/datasets/lerobot-xvla-soft-fold.mdx b/docs/datasets/lerobot-xvla-soft-fold.mdx new file mode 100644 index 00000000..6b55e7ed --- /dev/null +++ b/docs/datasets/lerobot-xvla-soft-fold.mdx @@ -0,0 +1,276 @@ +--- +title: "LeRobot X-VLA Soft-Fold" +sidebarTitle: "LeRobot X-VLA Soft-Fold" +description: "This dataset was created using LeRobot." +--- + + +Source dataset card and downloadable files for `lance-format/lerobot-xvla-soft-fold`. + + +This dataset was created using [LeRobot](https://github.com/huggingface/lerobot). 
+ +## Dataset Description + + **Repository:** [X-VLA](https://thu-air-dream.github.io/X-VLA/) + + **License:** Apache 2.0 + + **Paper:** *Zheng et al., 2025, “X-VLA: Soft-Prompted Transformer as Scalable Cross-Embodiment Vision-Language-Action Model”* ([arXiv:2510.10274](https://arxiv.org/pdf/2510.10274)) + + +## What this dataset contains + +This is the Lance-format version of [lerobot/xvla-soft-fold](https://huggingface.co/datasets/lerobot/xvla-soft-fold), designed for efficient frame-level sampling and sequential episode loading. + +- `1,542` episodes +- `2,852,512` frames +- `20` FPS +- 3 camera streams per episode (`cam_high`, `cam_left_wrist`, `cam_right_wrist`) +- robot state vectors and action vectors aligned to frame timestamps + +## Dataset structure + +The dataset is organized under `data/` with three Lance tables: + +### Frames table + +This is the main table for model training and analytics at frame granularity. Each row is one frame with aligned state/action metadata and indexing fields so you can filter by episode, iterate temporally, or build sampled batches directly. + +Schema: +- `observation_state` (`list`): robot state vector for that frame. +- `action` (`list`): action vector for that frame. +- `time_stamp` (`float`): original source timestamp field. +- `timestamp` (`float`): canonical frame timestamp. +- `frame_index` (`int64`): frame index within episode. +- `episode_index` (`int64`): parent episode id. +- `index` (`int64`): global frame index. +- `task_index` (`int64`): task id. + +### Episodes table + +This table is optimized for sequence-aware loading. Each row represents one complete episode and stores per-episode arrays (`timestamps`, `actions`, `observation_state`) plus per-camera video blobs and timestamp ranges. Use this table when you need contiguous windows, trajectory-level batching, or synchronized decoding from episode-level video chunks. + +Schema: +- `episode_index` (`int64`, required): episode id. 
+- `task_index` (`int64`, required): task id. +- `fps` (`int32`, required): frame rate. +- `timestamps` (`list`): per-frame timestamps for the episode. +- `actions` (`list>`): per-frame action vectors. +- `observation_state` (`list>`): per-frame robot state vectors. +- `observation_images_cam_high_video_blob` (`large_binary` blob): encoded video segment for `cam_high`. +- `observation_images_cam_high_from_timestamp` (`double`): segment start time for `cam_high`. +- `observation_images_cam_high_to_timestamp` (`double`): segment end time for `cam_high`. +- `observation_images_cam_left_wrist_video_blob` (`large_binary` blob): encoded video segment for `cam_left_wrist`. +- `observation_images_cam_left_wrist_from_timestamp` (`double`): segment start time for `cam_left_wrist`. +- `observation_images_cam_left_wrist_to_timestamp` (`double`): segment end time for `cam_left_wrist`. +- `observation_images_cam_right_wrist_video_blob` (`large_binary` blob): encoded video segment for `cam_right_wrist`. +- `observation_images_cam_right_wrist_from_timestamp` (`double`): segment start time for `cam_right_wrist`. +- `observation_images_cam_right_wrist_to_timestamp` (`double`): segment end time for `cam_right_wrist`. + +### Videos table + +This table stores raw MP4 payloads from the source and file-level provenance metadata. It is useful when you want direct access to original encoded video assets, integrity checks (`sha256`), or custom decoding pipelines that operate on the original video files themselves, rather than episode/frame abstractions. + +Schema: +- `camera_angle` (`string`, required): camera key. +- `chunk_index` (`int32`): chunk id parsed from path. +- `file_index` (`int32`): file id parsed from path. +- `relative_path` (`string`, required): original relative path in dataset. +- `filename` (`string`, required): MP4 filename. +- `file_size_bytes` (`int64`, required): file size. +- `sha256` (`string`, required): SHA256 digest. 
+- `video_blob` (`large_binary`, required blob): raw MP4 bytes. + +## Usage + +In the following sections, we'll show how to work with the dataset in Lance or LanceDB. + +### Read with Lance + +```python +import lance + +root_path = "hf://datasets/lance-format/lerobot-xvla-soft-fold/data" +frames_table_name = "frames.lance" +episodes_table_name = "episodes.lance" +videos_table_name = "videos.lance" + +ds = lance.dataset(f"{root_path}/{frames_table_name}") +print(ds.count_rows()) + +ds = lance.dataset(f"{root_path}/{episodes_table_name}") +print(ds.count_rows()) + +ds = lance.dataset(f"{root_path}/{videos_table_name}") +print(ds.count_rows()) + +# 2852512 +# 1542 +# 104 +``` + +### Inspect a few frames + +```python +import lance + +root_path = "hf://datasets/lance-format/lerobot-xvla-soft-fold/data" +frames_table_name = "frames.lance" + +frames = lance.dataset(f"{root_path}/{frames_table_name}") +print(f"There are {frames.count_rows()} frames in total") + +res = frames.scanner( + columns=["episode_index", "frame_index", "timestamp"], + limit=2, +).to_table() +print(res) + +# Returns +# There are 2852512 frames in total +# pyarrow.Table +# episode_index: int64 +# frame_index: int64 +# timestamp: float +# ---- +# episode_index: [[0,0]] +# frame_index: [[0,1]] +# timestamp: [[0,0.05]] +``` + +### Retrieving and saving video blobs + +```py +from pathlib import Path +import lance + +root_path = "hf://datasets/lance-format/lerobot-xvla-soft-fold/data" +episodes_table_name = "episodes.lance" +ds = lance.dataset(f"{root_path}/{episodes_table_name}") + +out = Path("video_blobs") +out.mkdir(exist_ok=True) + +# Retrieve first two videos from the episodes table, one row per iteration +for offset in range(0, 2): + row = ( + ds.scanner( + columns=["episode_index", "observation_images_cam_high_video_blob"], + blob_handling="all_binary", + limit=1, + offset=offset, + ) + .to_table() + .to_pylist()[0] + ) + # Write the video blob to a file + (out / 
f"episode_{row['episode_index']}.mp4").write_bytes( + row["observation_images_cam_high_video_blob"] + ) +``` +This outputs the retrieved blobs as MP4 files in a local directory. + +### Random seek on subsets of video + +The snippet shown below reads one episode’s video blob directly from HF Hub via Lance, computes a tiny time window inside that episode, opens the blob as a stream (without downloading full data into a local file), seeks to the start timestamp, and prints the blob size plus the exact seek positions in seconds and stream PTS units. + +```py +import av +import lance + +DATASET_URI = "hf://datasets/lance-format/lerobot-xvla-soft-fold/data/episodes.lance" +EPISODE_INDEX = 30 +START_OFFSET_S = 1.0 +WINDOW_S = 0.5 + +ds = lance.dataset(DATASET_URI) +row = ds.scanner( + columns=[ + "episode_index", + "observation_images_cam_high_from_timestamp", + "observation_images_cam_high_to_timestamp", + "_rowid", + ], + with_row_id=True, + filter=f"episode_index = {EPISODE_INDEX}", + limit=1, +).to_table().to_pylist()[0] + +start_s = row["observation_images_cam_high_from_timestamp"] + START_OFFSET_S +end_s = min( + start_s + WINDOW_S, + row["observation_images_cam_high_to_timestamp"], +) + +blob = ds.take_blobs("observation_images_cam_high_video_blob", ids=[row["_rowid"]])[0] +with av.open(blob) as container: + stream = container.streams.video[0] + stream.codec_context.skip_frame = "NONKEY" + + start_pts = int(start_s / stream.time_base) + end_pts = int(end_s / stream.time_base) + container.seek(start_pts, stream=stream) + + print(f"episode_index={row['episode_index']}") + print(f"blob_size_bytes={blob.size()}") + print(f"seek_start_seconds={start_s:.3f}") + print(f"seek_end_seconds={end_s:.3f}") + print(f"seek_start_pts={start_pts}") + print(f"seek_end_pts={end_pts}") + +blob.close() +``` + +### LanceDB search + +LanceDB users can also interface with the Lance dataset on the Hub. The key step is to +connect to the dataset repo and open the relevant table. 
+ +```py +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/lerobot-xvla-soft-fold/data") +tbl = db.open_table("episodes") + +# Search without any parameters +results = ( + tbl.search() + .select( + [ + "episode_index", + "observation_images_cam_high_from_timestamp", + "observation_images_cam_high_to_timestamp", + ] + ) + .limit(3) + .to_list() +) + +for result in results: + print( + f"{result['episode_index']} | {result['observation_images_cam_high_from_timestamp']} | {result['observation_images_cam_high_to_timestamp']}" + ) + +# Returns: +# 0 | 0.0 | 122.95 +# 1 | 122.95 | 230.65 +# 2 | 230.65 | 340.0 +``` + +### Download + +If you need to make modifications to the data or work with the raw files directly, you can do a +full download of the dataset locally. + +> **⚠️ Large dataset download** +> The full dataset is >50GB in size, so ensure you have sufficient disk space available. + +```bash +uv run hf download lance-format/lerobot-xvla-soft-fold --repo-type dataset --local-dir . +``` diff --git a/docs/datasets/librispeech-clean.mdx b/docs/datasets/librispeech-clean.mdx new file mode 100644 index 00000000..62217b91 --- /dev/null +++ b/docs/datasets/librispeech-clean.mdx @@ -0,0 +1,189 @@ +--- +title: "LibriSpeech clean" +sidebarTitle: "LibriSpeech clean" +description: "Lance-formatted version of the LibriSpeech ASR clean configuration (sourced from openslr/librispeech_asr). Audio is stored inline as FLAC bytes (no re-encoding); transcripts are sentence-embedded so semantic transcript search works out of the box." +--- + + +Source dataset card and downloadable files for `lance-format/librispeech-clean-lance`. + + +Lance-formatted version of the LibriSpeech ASR `clean` configuration (sourced from [`openslr/librispeech_asr`](https://huggingface.co/datasets/openslr/librispeech_asr)). Audio is stored inline as FLAC bytes (no re-encoding); transcripts are sentence-embedded so semantic transcript search works out of the box. 
+ +## Splits + +| Lance file | Split | Rows | Description | +|------------|-------|------|-------------| +| `dev_clean.lance` | dev.clean | 2,703 | Standard ASR validation set | +| `test_clean.lance` | test.clean | 2,620 | Standard ASR test set | +| `train_clean_100.lance` | train.clean.100 | 28,539 | 100-hour clean training subset | + +> The 360-hour and 500-hour LibriSpeech subsets (`train.360`, `train.other.500`) are **not** bundled here. To extend the dataset, point `librispeech/dataprep.py` at additional splits. + +## Schema + +| Column | Type | Notes | +|---|---|---| +| `id` | `string` | Utterance id (e.g. `1272-128104-0000`) | +| `audio` | `large_binary` | Inline FLAC bytes (16 kHz mono) | +| `sampling_rate` | `int32` | Always 16,000 | +| `text` | `string` | Reference transcript | +| `speaker_id` | `int64` | LibriVox speaker id | +| `chapter_id` | `int64` | LibriVox chapter id | +| `num_chars` | `int32` | Length of `text` in characters | +| `text_emb` | `fixed_size_list` | sentence-transformers `all-MiniLM-L6-v2` (cosine-normalized) | + +## Pre-built indices + +- `IVF_PQ` on `text_emb` — `metric=cosine` +- `INVERTED` (FTS) on `text` +- `BTREE` on `id`, `speaker_id`, `chapter_id` + +## Quick start + +```python +import lance + +ds = lance.dataset("hf://datasets/lance-format/librispeech-clean-lance/data/test_clean.lance") +print(ds.count_rows(), ds.schema.names, ds.list_indices()) +``` + +## Load with LanceDB + +These tables can also be consumed by [LanceDB](https://lancedb.github.io/lancedb/), the multimodal lakehouse and embedded search library built on top of Lance, for simplified vector search and other queries. Each `.lance` file in `data/` is a table — open by name (e.g., `test_clean`, `train_clean_100`). 
+ +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/librispeech-clean-lance/data") +tbl = db.open_table("test_clean") +print(f"LanceDB table opened with {len(tbl)} utterances") +``` + +## Read one utterance and play it + +```python +from pathlib import Path +import lance + +ds = lance.dataset("hf://datasets/lance-format/librispeech-clean-lance/data/test_clean.lance") +row = ds.take([0], columns=["id", "audio", "text", "speaker_id"]).to_pylist()[0] + +Path(f"{row['id']}.flac").write_bytes(row["audio"]) +print("speaker:", row["speaker_id"]) +print("transcript:", row["text"]) +``` + +You can decode the FLAC bytes in-memory with `soundfile` and feed them straight into a model: + +```python +import io +import soundfile as sf + +samples, sr = sf.read(io.BytesIO(row["audio"])) +print(samples.shape, sr) +``` + +## Semantic transcript retrieval + +```python +import lance +import pyarrow as pa +from sentence_transformers import SentenceTransformer + +encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device="cuda") +q = encoder.encode(["a person talking about astronomy"], normalize_embeddings=True)[0] + +ds = lance.dataset("hf://datasets/lance-format/librispeech-clean-lance/data/train_clean_100.lance") +emb_field = ds.schema.field("text_emb") +hits = ds.scanner( + nearest={"column": "text_emb", "q": pa.array([q.tolist()], type=emb_field.type)[0], "k": 5}, + columns=["id", "speaker_id", "text"], +).to_table().to_pylist() +for h in hits: + print(h) +``` + +### LanceDB semantic transcript retrieval + +```python +import lancedb +from sentence_transformers import SentenceTransformer + +encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device="cuda") +q = encoder.encode(["a person talking about astronomy"], normalize_embeddings=True)[0] + +db = lancedb.connect("hf://datasets/lance-format/librispeech-clean-lance/data") +tbl = db.open_table("train_clean_100") + +results = ( + tbl.search(q.tolist(), 
vector_column_name="text_emb") + .metric("cosine") + .select(["id", "speaker_id", "text"]) + .limit(5) + .to_list() +) +``` + +## Full-text and per-speaker filtering + +```python +ds = lance.dataset("hf://datasets/lance-format/librispeech-clean-lance/data/train_clean_100.lance") + +# Word search via the FTS index. +hits = ds.scanner(full_text_query="universe stars", columns=["id", "text"], limit=10).to_table() + +# All utterances by a given speaker. +sp = ds.scanner(filter="speaker_id = 1272", columns=["id", "chapter_id", "text"], limit=10).to_table() +``` + +### LanceDB full-text search and per-speaker filtering + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/librispeech-clean-lance/data") +tbl = db.open_table("train_clean_100") + +# Word search via the FTS index. +hits = ( + tbl.search("universe stars") + .select(["id", "text"]) + .limit(10) + .to_list() +) + +# All utterances by a given speaker. +sp = ( + tbl.search() + .where("speaker_id = 1272") + .select(["id", "chapter_id", "text"]) + .limit(10) + .to_list() +) +``` + +## Why Lance? + +- One dataset for audio + transcripts + embeddings + indices — no parallel folder of FLAC files plus a transcript JSON. +- On-disk vector and full-text indices live next to the data, so search works on local copies and on the Hub. +- Schema evolution: add columns (alternate transcripts, speaker embeddings, model predictions) without rewriting the data. + +## Source & license + +Converted from [`openslr/librispeech_asr`](https://huggingface.co/datasets/openslr/librispeech_asr). LibriSpeech is released under [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/) and is built from the public-domain LibriVox audiobook corpus. 
+ +## Citation + +``` +@inproceedings{panayotov2015librispeech, + title={LibriSpeech: An ASR corpus based on public domain audiobooks}, + author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev}, + booktitle={IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)}, + year={2015} +} +``` diff --git a/docs/datasets/mnist.mdx b/docs/datasets/mnist.mdx new file mode 100644 index 00000000..522daafc --- /dev/null +++ b/docs/datasets/mnist.mdx @@ -0,0 +1,181 @@ +--- +title: "MNIST" +sidebarTitle: "MNIST" +description: "A Lance-formatted version of the classic MNIST handwritten-digit dataset with 70,000 28×28 grayscale digits stored inline alongside CLIP image embeddings and a pre-built ANN index." +--- + + +Source dataset card and downloadable files for `lance-format/mnist-lance`. + + +A Lance-formatted version of the classic [MNIST handwritten-digit dataset](https://huggingface.co/datasets/ylecun/mnist) with **70,000 28×28 grayscale digits** stored inline alongside CLIP image embeddings and a pre-built ANN index. + +## Key features + +- All multimodal data (image bytes + embeddings) stored **inline** in the same Lance dataset — no sidecar files, no external image folders. +- **Pre-computed CLIP embeddings** (OpenCLIP `ViT-B-32` / `laion2b_s34b_b79k`, 512-dim, L2-normalized) shipped with an `IVF_PQ` index for instant similarity search. +- **BTREE index on `label`** and **BITMAP index on `label_name`** for sub-millisecond filtering. +- Standard train/test splits, ready to use with `lance.dataset(...)` or `datasets.load_dataset(...)`. 
+ +## Splits + +| Split | Rows | +|-------|------| +| `train` | 60,000 | +| `test` | 10,000 | + +## Schema + +| Column | Type | Notes | +|---|---|---| +| `id` | `int64` | Row index within the split | +| `image` | `large_binary` | Inline PNG bytes (28×28 grayscale) | +| `label` | `int32` | Digit class id (0-9) | +| `label_name` | `string` | Human-readable class (`"0".."9"`) | +| `image_emb` | `fixed_size_list` | CLIP image embedding (cosine-normalized) | + +## Pre-built indices + +- `IVF_PQ` on `image_emb` — vector similarity search (`metric=cosine`) +- `BTREE` on `label` — fast equality / range filters +- `BITMAP` on `label_name` — fast filters on the 10 class names + +## Load with `datasets.load_dataset` + +```python +import datasets + +hf_ds = datasets.load_dataset("lance-format/mnist-lance", split="train", streaming=True) +for row in hf_ds.take(3): + print(row["label"], row["label_name"]) +``` + +## Load directly with Lance (recommended) + +```python +import lance + +ds = lance.dataset("hf://datasets/lance-format/mnist-lance/data/train.lance") +print(ds.count_rows(), ds.schema.names) +print(ds.list_indices()) +``` + +## Load with LanceDB + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/mnist-lance/data") +tbl = db.open_table("train") +print(len(tbl)) +``` + +> **Tip — for production use, download locally first.** Streaming from the Hub works for exploration, but heavy random access and ANN search are far faster against a local copy: +> ```bash +> hf download lance-format/mnist-lance --repo-type dataset --local-dir ./mnist-lance +> ``` +> Then `lance.dataset("./mnist-lance/data/train.lance")`. 
+ +## Vector search example + +```python +import lance +import pyarrow as pa + +ds = lance.dataset("hf://datasets/lance-format/mnist-lance/data/train.lance") +emb_field = ds.schema.field("image_emb") +ref = ds.take([0], columns=["image_emb"]).to_pylist()[0]["image_emb"] +query = pa.array([ref], type=emb_field.type) + +neighbors = ds.scanner( + nearest={"column": "image_emb", "q": query[0], "k": 5, "nprobes": 16, "refine_factor": 30}, + columns=["id", "label", "label_name"], +).to_table().to_pylist() +print(neighbors) +``` + +### LanceDB vector search + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/mnist-lance/data") +tbl = db.open_table("train") + +ref = tbl.search().limit(1).select(["image_emb"]).to_list()[0] +query_embedding = ref["image_emb"] + +results = ( + tbl.search(query_embedding) + .metric("cosine") + .select(["id", "label", "label_name"]) + .limit(5) + .to_list() +) +for row in results: + print(row["id"], row["label"], row["label_name"]) +``` + +## Filter by class + +```python +ds = lance.dataset("hf://datasets/lance-format/mnist-lance/data/train.lance") +sevens = ds.scanner(filter="label = 7", columns=["id"], limit=10).to_table() +print(sevens) +``` + +### Filter by class with LanceDB + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/mnist-lance/data") +tbl = db.open_table("train") +sevens = ( + tbl.search() + .where("label = 7") + .select(["id"]) + .limit(10) + .to_list() +) +print(sevens) +``` + +## Working with images + +```python +from pathlib import Path +import lance + +ds = lance.dataset("hf://datasets/lance-format/mnist-lance/data/train.lance") +row = ds.take([0], columns=["image", "label"]).to_pylist()[0] +Path("digit_0.png").write_bytes(row["image"]) +print("label =", row["label"]) +``` + +Images are stored inline as PNG bytes; scanning columns like `label` does not pay the I/O cost of loading image bytes. + +## Why Lance? 
+ +- One dataset for images + embeddings + indices + metadata — no sidecar files to manage. +- On-disk vector and scalar indices live next to the data, so search works on both local copies and the Hub. +- Schema evolution lets you add new columns (fresh embeddings, augmentations, model predictions) without rewriting the data ([docs](https://lance.org/guide/data_evolution/)). + +## Source & license + +Converted from [`ylecun/mnist`](https://huggingface.co/datasets/ylecun/mnist). MNIST is released under the MIT license. The original dataset is by Yann LeCun, Corinna Cortes, and Christopher J.C. Burges. + +## Citation + +``` +@article{lecun1998mnist, + title={The MNIST database of handwritten digits}, + author={LeCun, Yann and Cortes, Corinna and Burges, CJ}, + url={http://yann.lecun.com/exdb/mnist/}, + year={1998} +} +``` diff --git a/docs/datasets/ms-marco-v2.1.mdx b/docs/datasets/ms-marco-v2.1.mdx new file mode 100644 index 00000000..c54701de --- /dev/null +++ b/docs/datasets/ms-marco-v2.1.mdx @@ -0,0 +1,184 @@ +--- +title: "MS MARCO v2.1" +sidebarTitle: "MS MARCO v2.1" +description: "Lance-formatted version of MS MARCO v2.1 — Microsoft's machine reading comprehension benchmark — with MiniLM query embeddings stored inline alongside the candidate passages and human-written answers." +--- + + +Source dataset card and downloadable files for `lance-format/ms-marco-v2.1-lance`. + + +Lance-formatted version of [MS MARCO v2.1](https://huggingface.co/datasets/microsoft/ms_marco) — Microsoft's machine reading comprehension benchmark — with **MiniLM query embeddings** stored inline alongside the candidate passages and human-written answers. + +## Why this version? + +- **One self-contained Lance dataset** with ~900 k queries; each row is a query, the 10 candidate passages retrieved by Bing, the relevance flags, and the human-written reference answers. 
+- **Pre-computed query embeddings** (`sentence-transformers/all-MiniLM-L6-v2`, 384-dim, L2-normalized) with an `IVF_PQ` index — semantic query lookup without re-embedding. +- **Full-text inverted indices** on the query and the first selected passage. +- Designed for both retrieval research (use the index) and RAG / answer eval (use the passage list + answers). + +## Splits + +| Split | Rows | +|-------|------| +| `train.lance` | 808,731 | +| `validation.lance` | 101,093 | + +## Schema + +| Column | Type | Notes | +|---|---|---| +| `query_id` | `int64` | MS MARCO query id | +| `query` | `string` | The user's natural-language query | +| `query_type` | `string` | One of `DESCRIPTION`, `NUMERIC`, `ENTITY`, `LOCATION`, `PERSON` | +| `answers` | `list` | Human-written reference answers | +| `well_formed_answers` | `list` | Reference answers re-written as full sentences | +| `passage_text` | `list` | Up to 10 candidate passages | +| `passage_url` | `list` | Source URLs for each candidate | +| `passage_is_selected` | `list` | `1` if Bing labelled the passage relevant | +| `selected_passage` | `string?` | First relevant passage (null if none) | +| `query_emb` | `fixed_size_list` | MiniLM embedding of `query` (cosine-normalized) | + +## Pre-built indices + +- `IVF_PQ` on `query_emb` — `metric=cosine` +- `INVERTED` on `query` and `selected_passage` +- `BTREE` on `query_id` +- `BITMAP` on `query_type` + +## Quick start + +```python +import lance + +ds = lance.dataset("hf://datasets/lance-format/ms-marco-v2.1-lance/data/validation.lance") +print(ds.count_rows(), ds.schema.names, ds.list_indices()) +``` + +## Load with LanceDB + +These tables can also be consumed by [LanceDB](https://lancedb.github.io/lancedb/), the multimodal lakehouse and embedded search library built on top of Lance, for simplified vector search and other queries. 
+ +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/ms-marco-v2.1-lance/data") +tbl = db.open_table("validation") +print(f"LanceDB table opened with {len(tbl)} queries") +``` + +## Semantic query lookup + +```python +import lance +import pyarrow as pa +from sentence_transformers import SentenceTransformer + +encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device="cuda") +q = encoder.encode(["how to compute determinant of a 3x3 matrix"], normalize_embeddings=True)[0] + +ds = lance.dataset("hf://datasets/lance-format/ms-marco-v2.1-lance/data/validation.lance") +emb_field = ds.schema.field("query_emb") +hits = ds.scanner( + nearest={"column": "query_emb", "q": pa.array([q.tolist()], type=emb_field.type)[0], "k": 5, "nprobes": 16, "refine_factor": 30}, + columns=["query_id", "query", "selected_passage", "answers"], +).to_table().to_pylist() +for h in hits: + print(h["query"]) + print(" selected:", (h.get("selected_passage") or "")[:120]) +``` + +### LanceDB semantic query lookup + +```python +import lancedb +from sentence_transformers import SentenceTransformer + +encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device="cuda") +q = encoder.encode(["how to compute determinant of a 3x3 matrix"], normalize_embeddings=True)[0] + +db = lancedb.connect("hf://datasets/lance-format/ms-marco-v2.1-lance/data") +tbl = db.open_table("validation") + +results = ( + tbl.search(q.tolist(), vector_column_name="query_emb") + .metric("cosine") + .select(["query_id", "query", "selected_passage", "answers"]) + .limit(5) + .to_list() +) +``` + +### LanceDB full-text search + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/ms-marco-v2.1-lance/data") +tbl = db.open_table("validation") + +results = ( + tbl.search("determinant matrix") + .select(["query", "selected_passage"]) + .limit(10) + .to_list() +) +``` + +## Get all candidate passages for a query + +```python +import lance +ds = 
lance.dataset("hf://datasets/lance-format/ms-marco-v2.1-lance/data/validation.lance") +row = ds.scanner(filter="query_id = 1185869", columns=["query", "passage_text", "passage_is_selected"]).to_table().to_pylist()[0] +for text, sel in zip(row["passage_text"], row["passage_is_selected"]): + print("[selected]" if sel else "[other]", text[:120]) +``` + +## Filter by query_type + +```python +ds = lance.dataset("hf://datasets/lance-format/ms-marco-v2.1-lance/data/train.lance") +numeric = ds.scanner(filter="query_type = 'NUMERIC'", columns=["query"], limit=5).to_table() +``` + +### Filter by query_type with LanceDB + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/ms-marco-v2.1-lance/data") +tbl = db.open_table("train") +numeric = ( + tbl.search() + .where("query_type = 'NUMERIC'") + .select(["query"]) + .limit(5) + .to_list() +) +``` + +## Why Lance? + +- One dataset carries queries + passages + answers + embeddings + indices — no sidecar files. +- On-disk vector and full-text indices live next to the data, so search works on local copies and on the Hub. +- Schema evolution: add columns (alternate embeddings, generated answers, model predictions) without rewriting the data. + +## Source & license + +Converted from [`microsoft/ms_marco`](https://huggingface.co/datasets/microsoft/ms_marco) (`v2.1`). MS MARCO is released under the MIT license. 
+ +## Citation + +``` +@article{nguyen2016ms, + title={MS MARCO: A Human Generated MAchine Reading COmprehension Dataset}, + author={Nguyen, Tri and Rosenberg, Mir and Song, Xia and Gao, Jianfeng and Tiwary, Saurabh and Majumder, Rangan and Deng, Li}, + journal={arXiv preprint arXiv:1611.09268}, + year={2016} +} +``` diff --git a/docs/datasets/natural-questions-val.mdx b/docs/datasets/natural-questions-val.mdx new file mode 100644 index 00000000..14c30fb4 --- /dev/null +++ b/docs/datasets/natural-questions-val.mdx @@ -0,0 +1,156 @@ +--- +title: "Natural Questions Validation" +sidebarTitle: "Natural Questions Validation" +description: "Lance-formatted version of the Natural Questions validation split — 7,830 real Google search queries with their full Wikipedia articles and 1–5 annotator labels per question. Sourced from google-research-datasets/natural_questions." +--- + + +Source dataset card and downloadable files for `lance-format/natural-questions-val-lance`. + + +Lance-formatted version of the [Natural Questions](https://ai.google.com/research/NaturalQuestions/) **validation split** — 7,830 real Google search queries with their full Wikipedia articles and 1–5 annotator labels per question. Sourced from [`google-research-datasets/natural_questions`](https://huggingface.co/datasets/google-research-datasets/natural_questions). + +> The NQ **train** split is 143 GB (307,373 rows); it is intentionally not bundled here. Add it via `natural_questions/dataprep.py --splits train` once disk + bandwidth allow. 
+ +## Splits + +| Split | Rows | +|-------|------| +| `validation.lance` | 7,830 | + +## Schema + +| Column | Type | Notes | +|---|---|---| +| `id` | `string` | NQ example id | +| `question` | `string` | Original Google search query | +| `document_title` | `string` | Wikipedia article title | +| `document_url` | `string` | Wikipedia article URL | +| `document_html` | `large_binary` | Full HTML of the article (inline; UTF-8 bytes) | +| `short_answers` | `list` | Deduped short-answer spans across all annotators | +| `num_short_answers` | `int32` | Total annotator spans (incl. duplicates) | +| `has_short_answer` | `bool` | At least one annotator provided a short-answer span | +| `has_long_answer` | `bool` | At least one annotator selected a long-answer candidate | +| `yes_no_answer` | `string` | `YES` / `NO` / `NONE` — majority vote across annotators | +| `question_emb` | `fixed_size_list` | sentence-transformers `all-MiniLM-L6-v2` (cosine-normalized) | + +## Pre-built indices + +- `IVF_PQ` on `question_emb` — `metric=cosine` +- `INVERTED` (FTS) on `question` +- `BTREE` on `id`, `document_title` +- `BITMAP` on `yes_no_answer`, `has_short_answer`, `has_long_answer` + +## Quick start + +```python +import lance +ds = lance.dataset("hf://datasets/lance-format/natural-questions-val-lance/data/validation.lance") +print(ds.count_rows(), ds.schema.names, ds.list_indices()) +``` + +## Load with LanceDB + +These tables can also be consumed by [LanceDB](https://lancedb.github.io/lancedb/), the multimodal lakehouse and embedded search library built on top of Lance, for simplified vector search and other queries. 
+ +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/natural-questions-val-lance/data") +tbl = db.open_table("validation") +print(f"LanceDB table opened with {len(tbl)} questions") +``` + +### LanceDB semantic question search + +```python +import lancedb +from sentence_transformers import SentenceTransformer + +encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device="cuda") +q = encoder.encode(["who wrote the declaration of independence"], normalize_embeddings=True)[0] + +db = lancedb.connect("hf://datasets/lance-format/natural-questions-val-lance/data") +tbl = db.open_table("validation") + +results = ( + tbl.search(q.tolist(), vector_column_name="question_emb") + .metric("cosine") + .select(["question", "short_answers", "document_title"]) + .limit(5) + .to_list() +) +``` + +### LanceDB full-text search + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/natural-questions-val-lance/data") +tbl = db.open_table("validation") + +results = ( + tbl.search("declaration of independence") + .select(["question", "document_title"]) + .limit(10) + .to_list() +) +``` + +## Get only questions with short-answer spans + +```python +import lance +ds = lance.dataset("hf://datasets/lance-format/natural-questions-val-lance/data/validation.lance") +short = ds.scanner( + filter="has_short_answer = true", + columns=["question", "short_answers", "document_title"], + limit=10, +).to_table().to_pylist() +``` + +### Filter with LanceDB + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/natural-questions-val-lance/data") +tbl = db.open_table("validation") +short = ( + tbl.search() + .where("has_short_answer = true") + .select(["question", "short_answers", "document_title"]) + .limit(10) + .to_list() +) +``` + +## Read the full Wikipedia HTML for one question + +```python +import lance +ds = lance.dataset("hf://datasets/lance-format/natural-questions-val-lance/data/validation.lance") 
+row = ds.take([0], columns=["question", "document_html", "document_url"]).to_pylist()[0] +print(row["question"], "->", row["document_url"]) +print(row["document_html"][:500].decode("utf-8", errors="replace")) +``` + +## Source & license + +Converted from [`google-research-datasets/natural_questions`](https://huggingface.co/datasets/google-research-datasets/natural_questions). NQ is released under [CC BY-SA 3.0](https://creativecommons.org/licenses/by-sa/3.0/) (matching the Wikipedia source). + +## Citation + +``` +@article{kwiatkowski2019natural, + title={Natural Questions: A Benchmark for Question Answering Research}, + author={Kwiatkowski, Tom and Palomaki, Jennimaria and Redfield, Olivia and Collins, Michael and Parikh, Ankur and Alberti, Chris and Epstein, Danielle and Polosukhin, Illia and Devlin, Jacob and Lee, Kenton and Toutanova, Kristina and Jones, Llion and Kelcey, Matthew and Chang, Ming-Wei and Dai, Andrew M. and Uszkoreit, Jakob and Le, Quoc and Petrov, Slav}, + journal={Transactions of the Association for Computational Linguistics}, + year={2019} +} +``` diff --git a/docs/datasets/openvid.mdx b/docs/datasets/openvid.mdx new file mode 100644 index 00000000..3816fe14 --- /dev/null +++ b/docs/datasets/openvid.mdx @@ -0,0 +1,382 @@ +--- +title: "OpenVid-1M" +sidebarTitle: "OpenVid-1M" +description: "Lance format version of the OpenVid dataset with 937,957 high-quality videos stored with inline video blobs, embeddings, and rich metadata." +--- + + +Source dataset card and downloadable files for `lance-format/openvid-lance`. + + +Lance format version of the [OpenVid dataset](https://huggingface.co/datasets/nkp37/OpenVid-1M) with **937,957 high-quality videos** stored with inline video blobs, embeddings, and rich metadata. + +![](https://huggingface.co/datasets/nkp37/OpenVid-1M/resolve/main/OpenVid-1M.png) + +**Key Features:** +The dataset is stored in lance format with inline video blobs, video embeddings, and rich metadata. 
+ +- **Videos stored inline as blobs** - No external files to manage +- **Efficient column access** - Load metadata without touching video data +- **Prebuilt indices available** - IVF_PQ index for similarity search, FTS index on captions +- **Fast random access** - Read any video instantly by index +- **HuggingFace integration** - Load directly from the Hub + +## Load lance dataset using `datasets.load_dataset` + +```python +import datasets + +hf_ds = datasets.load_dataset( + "lance-format/openvid-lance", + split="train", + streaming=True, +) +# Take first three rows and print captions +for row in hf_ds.take(3): + print(row["caption"]) +``` + +You can also load lance datasets from HF hub using native API when you want blob bytes or advanced indexing while still pointing at the same dataset on the Hub: + +```python +import lance + +lance_ds = lance.dataset("hf://datasets/lance-format/openvid-lance/data/train.lance") +blob_file = lance_ds.take_blobs("video_blob", ids=[0])[0] +video_bytes = blob_file.read() +``` + +These tables can also be consumed by [LanceDB](https://lancedb.github.io/lancedb/), the multimodal lakehouse and embedded search library built on top of Lance, for simplified vector search and other queries. + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/openvid-lance/data") +tbl = db.open_table("train") +print(f"LanceDB table opened with {len(tbl)} videos") +``` + + +## Why Lance? + +- Optimized for AI workloads: Lance keeps multimodal data and vector search-ready storage in the same columnar format designed for accelerator-era retrieval (see [lance.org](https://lance.org)). +- Videos + embeddings + metadata travel as one tabular dataset. +- On-disk, scalable ANN index means vector search works on both local copies and the Hub, with the index stored next to the data. +- Schema evolution lets you add new features/columns (moderation tags, embeddings, etc.) without rewriting the raw data. + + +## Lance Blob API + +Lance stores videos as **inline blobs** - binary data embedded directly in the dataset. 
This provides: + +- **Single source of truth** - Videos and metadata together in one dataset +- **Lazy loading** - Videos only loaded when you explicitly request them +- **Efficient storage** - Optimized encoding for large binary data +- **Transactional consistency** - Query and retrieve in one atomic operation + + +```python +import lance + +ds = lance.dataset("hf://datasets/lance-format/openvid-lance/data/train.lance") + +# 1. Browse metadata without loading video data +metadata = ds.scanner( + columns=["caption", "aesthetic_score"], # No video_blob column! + filter="aesthetic_score >= 4.5", + limit=10 +).to_table().to_pylist() + +# 2. User selects video to watch +selected_index = 3 + +# 3. Load only that video blob +blob_file = ds.take_blobs("video_blob", ids=[selected_index])[0] +video_bytes = blob_file.read() + +# 4. Save to disk +with open("video.mp4", "wb") as f: + f.write(video_bytes) +``` + +## Quick Start + +```python +import lance + +ds = lance.dataset("hf://datasets/lance-format/openvid-lance/data/train.lance") +print(f"Total videos: {ds.count_rows():,}") +``` + +> **⚠️ HuggingFace Streaming Note** +> +> When streaming from HuggingFace (as shown above), some operations use minimal parameters to avoid rate limits: +> - `nprobes=1` for vector search (lowest value) +> - Column selection to reduce I/O +> +> **You may still hit rate limits on HuggingFace's free tier.** For best performance and to avoid rate limits, **download the dataset locally**: +> +> ```bash +> # Download once +> huggingface-cli download lance-format/openvid-lance --repo-type dataset --local-dir ./openvid +> +> # Then load locally +> ds = lance.dataset("./openvid/data/train.lance") +> ``` +> +> Streaming is recommended only for quick exploration and testing. 
+ + +## Dataset Schema + +Each row contains: +- `video_blob` - Video file as binary blob (inline storage) +- `caption` - Text description of the video +- `embedding` - 1024-dim vector embedding +- `aesthetic_score` - Visual quality score (0-5+) +- `motion_score` - Amount of motion (0-1) +- `temporal_consistency_score` - Frame consistency (0-1) +- `camera_motion` - Camera movement type (pan, zoom, static, etc.) +- `fps`, `seconds`, `frame` - Video properties + +## Usage Examples + +### 1. Browse Metadata quickly (Fast - No Video Loading) + +```python +# Load only metadata without heavy video blobs +scanner = ds.scanner( + columns=["caption", "aesthetic_score", "motion_score"], + limit=10 +) +videos = scanner.to_table().to_pylist() + +for video in videos: + print(f"{video['caption']} - Quality: {video['aesthetic_score']:.2f}") +``` + +### 2. Export Videos from Blobs + +```python +# Load specific videos by index +indices = [0, 100, 500] +blob_files = ds.take_blobs("video_blob", ids=indices) + +# Save to disk +for i, blob_file in enumerate(blob_files): + with open(f"video_{i}.mp4", "wb") as f: + f.write(blob_file.read()) +``` + +### 3. Open inline videos with PyAV and run seeks directly on the blob file + +```python +import av + +selected_index = 123 +blob_file = ds.take_blobs("video_blob", ids=[selected_index])[0] + +with av.open(blob_file) as container: + stream = container.streams.video[0] + + for seconds in (0.0, 1.0, 2.5): + target_pts = int(seconds / stream.time_base) + container.seek(target_pts, stream=stream) + + frame = None + for candidate in container.decode(stream): + if candidate.time is None: + continue + frame = candidate + if frame.time >= seconds: + break + + print( + f"Seek {seconds:.1f}s -> {frame.width}x{frame.height} " + f"(pts={frame.pts}, time={frame.time:.2f}s)" + ) +``` + +### 3.5. 
Inspecting Existing Indices + +You can inspect the prebuilt indices on the dataset: + +```python +import lance + +# Open the dataset +dataset = lance.dataset("hf://datasets/lance-format/openvid-lance/data/train.lance") + +# List all indices +indices = dataset.list_indices() +print(indices) +``` + +While this dataset comes with pre-built indices, you can also create your own custom indices if needed. For example: + +```python +# ds is a local Lance dataset +ds.create_index( + "embedding", + index_type="IVF_PQ", + num_partitions=256, + num_sub_vectors=64, + replace=True, +) +``` + +### 4. Vector Similarity Search + +```python +import pyarrow as pa + +# Find similar videos +ref_video = ds.take([0], columns=["embedding"]).to_pylist()[0] +query_vector = pa.array([ref_video['embedding']], type=pa.list_(pa.float32(), 1024)) + +results = ds.scanner( + nearest={ + "column": "embedding", + "q": query_vector[0], + "k": 5, + "nprobes": 1, + "refine_factor": 1 + } +).to_table().to_pylist() + +for video in results[1:]: # Skip first (query itself) + print(video['caption']) +``` + +### LanceDB Vector Similarity Search + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/openvid-lance/data") +tbl = db.open_table("train") + +# Get a video to use as a query +ref_video = tbl.search().limit(1).select(["embedding", "caption"]).to_pandas().to_dict('records')[0] +query_embedding = ref_video["embedding"] + +results = tbl.search(query_embedding) \ + .metric("L2") \ + .nprobes(1) \ + .limit(5) \ + .to_list() + +for video in results[1:]: # Skip first (query itself) + print(f"{video['caption'][:60]}...") +``` + +### 5. 
Full-Text Search + +```python +# Search captions using FTS index +results = ds.scanner( + full_text_query="sunset beach", + columns=["caption", "aesthetic_score"], + limit=10, + fast_search=True +).to_table().to_pylist() + +for video in results: + print(f"{video['caption']} - {video['aesthetic_score']:.2f}") +``` + +### LanceDB Full-Text Search + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/openvid-lance/data") +tbl = db.open_table("train") + +results = tbl.search("sunset beach") \ + .select(["caption", "aesthetic_score"]) \ + .limit(10) \ + .to_list() + +for video in results: + print(f"{video['caption']} - {video['aesthetic_score']:.2f}") +``` + +### 6. Filter by Quality + +```python +# Get high-quality videos +high_quality = ds.scanner( + filter="aesthetic_score >= 4.5 AND motion_score >= 0.3", + columns=["caption", "aesthetic_score", "camera_motion"], + limit=20 +).to_table().to_pylist() +``` + +## Dataset Evolution + +Lance supports flexible schema and data evolution ([docs](https://lance.org/guide/data_evolution/?h=evol)). You can add/drop columns, backfill with SQL or Python, rename fields, or change data types without rewriting the whole dataset. In practice this lets you: +- Introduce fresh metadata (moderation labels, embeddings, quality scores) as new signals become available. +- Add new columns to existing datasets without re-exporting terabytes of video. +- Adjust column names or shrink storage (e.g., cast embeddings to float16) while keeping previous snapshots queryable for reproducibility. + +```python +import lance +import pyarrow as pa +import numpy as np + +base = pa.table({"id": pa.array([1, 2, 3])}) +dataset = lance.write_dataset(base, "openvid_evolution", mode="overwrite") + +# 1. Grow the schema instantly (metadata-only) +dataset.add_columns(pa.field("quality_bucket", pa.string())) + +# 2. Backfill with SQL expressions or constants +dataset.add_columns({"status": "'active'"}) + +# 3. 
Generate rich columns via Python batch UDFs +@lance.batch_udf() +def random_embedding(batch): + arr = np.random.rand(batch.num_rows, 128).astype("float32") + return pa.RecordBatch.from_arrays( + [pa.FixedSizeListArray.from_arrays(arr.ravel(), 128)], + names=["embedding"], + ) + +dataset.add_columns(random_embedding) + +# 4. Bring in offline annotations with merge +labels = pa.table({ + "id": pa.array([1, 2, 3]), + "label": pa.array(["horse", "rabbit", "cat"]), +}) +dataset.merge(labels, "id") + +# 5. Rename or cast columns as needs change +dataset.alter_columns({"path": "quality_bucket", "name": "quality_tier"}) +dataset.alter_columns({"path": "embedding", "data_type": pa.list_(pa.float16(), 128)}) +``` + +These operations are automatically versioned, so prior experiments can still point to earlier versions while OpenVid keeps evolving. + + + +## Citation + +```bibtex +@article{nan2024openvid, + title={OpenVid-1M: A Large-Scale High-Quality Dataset for Text-to-video Generation}, + author={Nan, Kepan and Xie, Rui and Zhou, Penghao and Fan, Tiehan and Yang, Zhenheng and Chen, Zhijie and Li, Xiang and Yang, Jian and Tai, Ying}, + journal={arXiv preprint arXiv:2407.02371}, + year={2024} +} +``` + + +## License + +Please check the original OpenVid dataset license for usage terms. diff --git a/docs/datasets/oxford-pets.mdx b/docs/datasets/oxford-pets.mdx new file mode 100644 index 00000000..e79c2262 --- /dev/null +++ b/docs/datasets/oxford-pets.mdx @@ -0,0 +1,119 @@ +--- +title: "Oxford-IIIT Pet" +sidebarTitle: "Oxford-IIIT Pet" +description: "Lance-formatted version of the Oxford-IIIT Pet dataset — 7,390 cat & dog photos across 37 breeds — sourced from pcuenq/oxford-pets." +--- + + +Source dataset card and downloadable files for `lance-format/oxford-pets-lance`. 
+ + +Lance-formatted version of the [Oxford-IIIT Pet dataset](https://www.robots.ox.ac.uk/~vgg/data/pets/) — 7,390 cat & dog photos across 37 breeds — sourced from [`pcuenq/oxford-pets`](https://huggingface.co/datasets/pcuenq/oxford-pets). + +## Schema + +| Column | Type | Notes | +|---|---|---| +| `id` | `int64` | Row index | +| `image` | `large_binary` | Inline JPEG bytes | +| `label_name` | `string` | One of 37 breeds, underscore-spaced (`british_shorthair`, `golden_retriever`, …) | +| `is_dog` | `bool` | `true` for dog breeds, `false` for cat breeds | +| `path` | `string?` | Original filename in the source dataset | +| `image_emb` | `fixed_size_list` | OpenCLIP `ViT-B-32` embedding (cosine-normalized) | + +## Pre-built indices + +- `IVF_PQ` on `image_emb` — `metric=cosine` +- `BITMAP` on `label_name` and `is_dog` + +## Quick start + +```python +import lance +ds = lance.dataset("hf://datasets/lance-format/oxford-pets-lance/data/train.lance") +print(ds.count_rows(), ds.schema.names, ds.list_indices()) +``` + +## Load with LanceDB + +These tables can also be consumed by [LanceDB](https://lancedb.github.io/lancedb/), the multimodal lakehouse and embedded search library built on top of Lance, for simplified vector search and other queries. + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/oxford-pets-lance/data") +tbl = db.open_table("train") +print(f"LanceDB table opened with {len(tbl)} images") +``` + +## Filter — only dogs, only golden retrievers, etc. 
+ +```python +import lance +ds = lance.dataset("hf://datasets/lance-format/oxford-pets-lance/data/train.lance") +dogs = ds.scanner(filter="is_dog = true", columns=["label_name"], limit=5).to_table() +goldens = ds.scanner(filter="label_name = 'golden_retriever'", columns=["id"], limit=5).to_table() +``` + +### Filter with LanceDB + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/oxford-pets-lance/data") +tbl = db.open_table("train") +dogs = tbl.search().where("is_dog = true").select(["label_name"]).limit(5).to_list() +goldens = tbl.search().where("label_name = 'golden_retriever'").select(["id"]).limit(5).to_list() +``` + +## Visual similarity search + +```python +import lance, pyarrow as pa +ds = lance.dataset("hf://datasets/lance-format/oxford-pets-lance/data/train.lance") +emb_field = ds.schema.field("image_emb") +ref = ds.take([0], columns=["image_emb", "label_name"]).to_pylist()[0] +neighbors = ds.scanner( + nearest={"column": "image_emb", "q": pa.array([ref["image_emb"]], type=emb_field.type)[0], "k": 5}, + columns=["id", "label_name"], +).to_table().to_pylist() +``` + +### LanceDB visual similarity search + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/oxford-pets-lance/data") +tbl = db.open_table("train") + +ref = tbl.search().limit(1).select(["image_emb", "label_name"]).to_list()[0] +query_embedding = ref["image_emb"] + +results = ( + tbl.search(query_embedding) + .metric("cosine") + .select(["id", "label_name"]) + .limit(5) + .to_list() +) +``` + +## Source & license + +Converted from [`pcuenq/oxford-pets`](https://huggingface.co/datasets/pcuenq/oxford-pets). Released under [CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/). + +## Citation + +``` +@inproceedings{parkhi2012cats, + title={Cats and Dogs}, + author={Parkhi, Omkar M. and Vedaldi, Andrea and Zisserman, Andrew and Jawahar, C. 
V.}, + booktitle={IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + year={2012} +} +``` diff --git a/docs/datasets/pascal-voc-2012-segmentation.mdx b/docs/datasets/pascal-voc-2012-segmentation.mdx new file mode 100644 index 00000000..eb907849 --- /dev/null +++ b/docs/datasets/pascal-voc-2012-segmentation.mdx @@ -0,0 +1,143 @@ +--- +title: "Pascal VOC 2012 Segmentation" +sidebarTitle: "Pascal VOC 2012 Segmentation" +description: "A Lance-formatted version of the Pascal VOC 2012 semantic segmentation split (sourced from nateraw/pascal-voc-2012) — 2,913 image / mask pairs with CLIP image embeddings stored inline and a pre-built IVF_PQ ANN index." +--- + + +Source dataset card and downloadable files for `lance-format/pascal-voc-2012-segmentation-lance`. + + +A Lance-formatted version of the [Pascal VOC 2012 semantic segmentation split](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/) (sourced from [`nateraw/pascal-voc-2012`](https://huggingface.co/datasets/nateraw/pascal-voc-2012)) — **2,913 image / mask pairs** with CLIP image embeddings stored inline and a pre-built `IVF_PQ` ANN index. + +## Why segmentation? + +VOC 2012 ships several tasks (classification, detection, segmentation, action). We focus on the **semantic segmentation** subset because every row carries a paired mask image and the dataset is small enough to convert quickly with full embeddings — useful as a smoke test or a small benchmark. 
+ +## Splits + +| Split | Rows | +|-------|------| +| `train.lance` | 1,464 | +| `validation.lance` | 1,449 | + +## Schema + +| Column | Type | Notes | +|---|---|---| +| `id` | `int64` | Row index within the split | +| `image` | `large_binary` | Inline JPEG bytes | +| `mask` | `large_binary` | Inline PNG bytes — class id per pixel (0=background, 1-20=VOC classes, 255=void) | +| `image_emb` | `fixed_size_list` | OpenCLIP `ViT-B-32` image embedding (cosine-normalized) | + +The 20 Pascal VOC classes are: `aeroplane`, `bicycle`, `bird`, `boat`, `bottle`, `bus`, `car`, `cat`, `chair`, `cow`, `diningtable`, `dog`, `horse`, `motorbike`, `person`, `pottedplant`, `sheep`, `sofa`, `train`, `tvmonitor`. + +## Pre-built indices + +- `IVF_PQ` on `image_emb` — `metric=cosine` + +> Note: the small dataset size (≤1,464 rows per split) is below Lance's +> default partition count, so the helper falls back to a smaller +> `num_partitions` automatically. For higher recall, build the index with +> `num_partitions=16` against a local copy. + +## Quick start + +```python +import lance + +ds = lance.dataset("hf://datasets/lance-format/pascal-voc-2012-segmentation-lance/data/train.lance") +print(ds.count_rows(), ds.schema.names, ds.list_indices()) +``` + +## Load with LanceDB + +These tables can also be consumed by [LanceDB](https://lancedb.github.io/lancedb/), the multimodal lakehouse and embedded search library built on top of Lance, for simplified vector search and other queries. 
+ +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/pascal-voc-2012-segmentation-lance/data") +tbl = db.open_table("train") +print(f"LanceDB table opened with {len(tbl)} image-mask pairs") +``` + +## Working with images and masks + +```python +from pathlib import Path +import lance +from PIL import Image +import io + +ds = lance.dataset("hf://datasets/lance-format/pascal-voc-2012-segmentation-lance/data/train.lance") +row = ds.take([0], columns=["image", "mask"]).to_pylist()[0] +Path("img.jpg").write_bytes(row["image"]) +Path("mask.png").write_bytes(row["mask"]) + +import numpy as np +mask = np.array(Image.open(io.BytesIO(row["mask"]))) +print("classes present:", np.unique(mask).tolist()) +``` + +## Vector search example + +```python +import lance +import pyarrow as pa + +ds = lance.dataset("hf://datasets/lance-format/pascal-voc-2012-segmentation-lance/data/train.lance") +emb_field = ds.schema.field("image_emb") +ref = ds.take([0], columns=["image_emb"]).to_pylist()[0]["image_emb"] +query = pa.array([ref], type=emb_field.type) + +neighbors = ds.scanner( + nearest={"column": "image_emb", "q": query[0], "k": 5}, + columns=["id"], +).to_table().to_pylist() +``` + +### LanceDB vector search + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/pascal-voc-2012-segmentation-lance/data") +tbl = db.open_table("train") + +ref = tbl.search().limit(1).select(["image_emb"]).to_list()[0] +query_embedding = ref["image_emb"] + +results = ( + tbl.search(query_embedding) + .metric("cosine") + .select(["id"]) + .limit(5) + .to_list() +) +``` + +## Why Lance? + +- One dataset carries images + masks + embeddings + indices — no sidecar files. +- On-disk vector and full-text indices live next to the data, so search works on local copies and on the Hub. +- Schema evolution: add columns (instance masks, alternate embeddings, model predictions) without rewriting the data. 
+ +## Source & license + +Converted from [`nateraw/pascal-voc-2012`](https://huggingface.co/datasets/nateraw/pascal-voc-2012). The Pascal VOC dataset is released under [its own custom license](http://host.robots.ox.ac.uk/pascal/VOC/) — please review before redistribution. + +## Citation + +``` +@article{everingham2012pascal, + title={The Pascal Visual Object Classes Challenge: A Retrospective}, + author={Everingham, Mark and Eslami, S. M. Ali and Van Gool, Luc and Williams, Christopher K. I. and Winn, John and Zisserman, Andrew}, + journal={International Journal of Computer Vision}, + year={2015} +} +``` diff --git a/docs/datasets/squad-v2.mdx b/docs/datasets/squad-v2.mdx new file mode 100644 index 00000000..6e9ee6fe --- /dev/null +++ b/docs/datasets/squad-v2.mdx @@ -0,0 +1,182 @@ +--- +title: "SQuAD v2" +sidebarTitle: "SQuAD v2" +description: "Lance-formatted version of SQuAD v2 — Stanford Question Answering Dataset, version 2 — with MiniLM sentence embeddings stored inline alongside the questions, contexts, and answers." +--- + + +Source dataset card and downloadable files for `lance-format/squad-v2-lance`. + + +Lance-formatted version of [SQuAD v2](https://huggingface.co/datasets/rajpurkar/squad_v2) — Stanford Question Answering Dataset, version 2 — with **MiniLM sentence embeddings** stored inline alongside the questions, contexts, and answers. + +## Why this version? + +- **One self-contained Lance dataset** with 130k+ Wikipedia-grounded questions and reference answers. +- **Pre-computed text embeddings** (`sentence-transformers/all-MiniLM-L6-v2`, 384-dim, L2-normalized) on the question column with an `IVF_PQ` index — instant semantic question retrieval. +- **Full-text inverted indices** on both `question` and `context` for keyword search. +- **BITMAP** on `is_impossible` for fast filtering between answerable and unanswerable questions. 
+ +## Splits + +| Split | Rows | +|-------|------| +| `train.lance` | 130,319 | +| `validation.lance` | 11,873 | + +## Schema + +| Column | Type | Notes | +|---|---|---| +| `id` | `string` | SQuAD question id | +| `title` | `string` | Wikipedia article title | +| `context` | `string` | Paragraph the question was generated from | +| `question` | `string` | The question text | +| `answers` | `list` | Accepted answer spans (empty for impossible questions) | +| `answer_starts` | `list` | Character offsets of each answer within `context` | +| `is_impossible` | `bool` | `true` for SQuAD 2.0 unanswerable questions | +| `question_emb` | `fixed_size_list` | MiniLM embedding of `question` (cosine-normalized) | + +## Pre-built indices + +- `IVF_PQ` on `question_emb` — `metric=cosine` +- `INVERTED` on `question` and `context` +- `BTREE` on `id` and `title` +- `BITMAP` on `is_impossible` + +## Quick start + +```python +import lance + +ds = lance.dataset("hf://datasets/lance-format/squad-v2-lance/data/validation.lance") +print(ds.count_rows(), ds.schema.names, ds.list_indices()) +``` + +## Load with LanceDB + +These tables can also be consumed by [LanceDB](https://lancedb.github.io/lancedb/), the multimodal lakehouse and embedded search library built on top of Lance, for simplified vector search and other queries. 
+ +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/squad-v2-lance/data") +tbl = db.open_table("validation") +print(f"LanceDB table opened with {len(tbl)} questions") +``` + +## Semantic question retrieval + +```python +import lance +import pyarrow as pa +from sentence_transformers import SentenceTransformer + +encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device="cuda") +q_vec = encoder.encode(["what year was the eiffel tower built?"], normalize_embeddings=True)[0] + +ds = lance.dataset("hf://datasets/lance-format/squad-v2-lance/data/train.lance") +emb_field = ds.schema.field("question_emb") +query = pa.array([q_vec.tolist()], type=emb_field.type) + +hits = ds.scanner( + nearest={"column": "question_emb", "q": query[0], "k": 10, "nprobes": 16, "refine_factor": 30}, + columns=["id", "title", "question", "answers"], +).to_table().to_pylist() +``` + +### LanceDB semantic question retrieval + +```python +import lancedb +from sentence_transformers import SentenceTransformer + +encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device="cuda") +q_vec = encoder.encode(["what year was the eiffel tower built?"], normalize_embeddings=True)[0] + +db = lancedb.connect("hf://datasets/lance-format/squad-v2-lance/data") +tbl = db.open_table("train") + +results = ( + tbl.search(q_vec.tolist(), vector_column_name="question_emb") + .metric("cosine") + .select(["id", "title", "question", "answers"]) + .limit(10) + .to_list() +) +``` + +## Full-text search on contexts + +```python +ds = lance.dataset("hf://datasets/lance-format/squad-v2-lance/data/train.lance") +hits = ds.scanner( + full_text_query="great pyramid of giza", + columns=["title", "question", "context"], + limit=5, +).to_table().to_pylist() +``` + +### LanceDB full-text search + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/squad-v2-lance/data") +tbl = db.open_table("train") + +results = ( + tbl.search("great 
pyramid of giza") + .select(["title", "question", "context"]) + .limit(5) + .to_list() +) +``` + +## Filter answerable vs impossible questions + +```python +ds = lance.dataset("hf://datasets/lance-format/squad-v2-lance/data/validation.lance") +impossible = ds.scanner(filter="is_impossible = true", columns=["question"], limit=5).to_table() +``` + +### Filter with LanceDB + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/squad-v2-lance/data") +tbl = db.open_table("validation") +impossible = ( + tbl.search() + .where("is_impossible = true") + .select(["question"]) + .limit(5) + .to_list() +) +``` + +## Why Lance? + +- One dataset carries questions + contexts + answers + embeddings + indices — no sidecar files. +- On-disk vector and full-text indices live next to the data, so search works on local copies and on the Hub. +- Schema evolution: add columns (alternate embeddings, model predictions, task labels) without rewriting the data. + +## Source & license + +Converted from [`rajpurkar/squad_v2`](https://huggingface.co/datasets/rajpurkar/squad_v2). SQuAD v2 is released under [CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/). + +## Citation + +``` +@article{rajpurkar2018know, + title={Know What You Don't Know: Unanswerable Questions for SQuAD}, + author={Rajpurkar, Pranav and Jia, Robin and Liang, Percy}, + journal={Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Short Papers)}, + year={2018}, +} +``` diff --git a/docs/datasets/stanford-cars.mdx b/docs/datasets/stanford-cars.mdx new file mode 100644 index 00000000..7701095c --- /dev/null +++ b/docs/datasets/stanford-cars.mdx @@ -0,0 +1,123 @@ +--- +title: "Stanford Cars" +sidebarTitle: "Stanford Cars" +description: "Lance-formatted version of the Stanford Cars dataset — 8,144 training images across 196 fine-grained car make/model/year classes — sourced from Multimodal-Fatima/StanfordCars_train." 
+--- + + +Source dataset card and downloadable files for `lance-format/stanford-cars-lance`. + + +Lance-formatted version of the [Stanford Cars dataset](https://web.archive.org/web/20210212183835/http://ai.stanford.edu/~jkrause/cars/car_dataset.html) — 8,144 training images across 196 fine-grained car make/model/year classes — sourced from [`Multimodal-Fatima/StanfordCars_train`](https://huggingface.co/datasets/Multimodal-Fatima/StanfordCars_train). + +## Schema + +| Column | Type | Notes | +|---|---|---| +| `id` | `int64` | Row index | +| `image` | `large_binary` | Inline JPEG bytes | +| `label` | `int32` | Class id (0-195) | +| `blip_caption` | `string?` | BLIP-generated caption (beam=5) carried through from the source mirror | +| `image_emb` | `fixed_size_list` | OpenCLIP `ViT-B-32` embedding (cosine-normalized) | + +## Pre-built indices + +- `IVF_PQ` on `image_emb` — `metric=cosine` +- `INVERTED` (FTS) on `blip_caption` +- `BTREE` on `label` + +## Quick start + +```python +import lance +ds = lance.dataset("hf://datasets/lance-format/stanford-cars-lance/data/train.lance") +print(ds.count_rows(), ds.schema.names, ds.list_indices()) +``` + +## Load with LanceDB + +These tables can also be consumed by [LanceDB](https://lancedb.github.io/lancedb/), the multimodal lakehouse and embedded search library built on top of Lance, for simplified vector search and other queries. 
+ +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/stanford-cars-lance/data") +tbl = db.open_table("train") +print(f"LanceDB table opened with {len(tbl)} car images") +``` + +## Caption-based filtering + +```python +import lance +ds = lance.dataset("hf://datasets/lance-format/stanford-cars-lance/data/train.lance") +hits = ds.scanner(full_text_query="red sports car", columns=["id", "blip_caption"], limit=10).to_table() +``` + +### LanceDB full-text search + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/stanford-cars-lance/data") +tbl = db.open_table("train") + +results = ( + tbl.search("red sports car") + .select(["id", "blip_caption"]) + .limit(10) + .to_list() +) +``` + +## Visual similarity search + +```python +import lance, pyarrow as pa +ds = lance.dataset("hf://datasets/lance-format/stanford-cars-lance/data/train.lance") +emb_field = ds.schema.field("image_emb") +ref = ds.take([0], columns=["image_emb", "blip_caption"]).to_pylist()[0] +neighbors = ds.scanner( + nearest={"column": "image_emb", "q": pa.array([ref["image_emb"]], type=emb_field.type)[0], "k": 5}, + columns=["id", "blip_caption"], +).to_table().to_pylist() +``` + +### LanceDB visual similarity search + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/stanford-cars-lance/data") +tbl = db.open_table("train") + +ref = tbl.search().limit(1).select(["image_emb", "blip_caption"]).to_list()[0] +query_embedding = ref["image_emb"] + +results = ( + tbl.search(query_embedding) + .metric("cosine") + .select(["id", "blip_caption"]) + .limit(5) + .to_list() +) +``` + +## Source & license + +Converted from [`Multimodal-Fatima/StanfordCars_train`](https://huggingface.co/datasets/Multimodal-Fatima/StanfordCars_train), itself a parquet redistribution of the Stanford Cars train split. 
The original dataset license is for non-commercial research use; review the [Stanford Cars terms](https://github.com/jhoffman/stanford-cars) before redistribution. + +## Citation + +``` +@inproceedings{krause2013collecting, + title={Collecting a large-scale dataset of fine-grained cars}, + author={Krause, Jonathan and Stark, Michael and Deng, Jia and Fei-Fei, Li}, + booktitle={Workshop on Fine-Grained Visual Categorization (CVPR)}, + year={2013} +} +``` diff --git a/docs/datasets/textvqa.mdx b/docs/datasets/textvqa.mdx new file mode 100644 index 00000000..f3f1e0c1 --- /dev/null +++ b/docs/datasets/textvqa.mdx @@ -0,0 +1,148 @@ +--- +title: "TextVQA" +sidebarTitle: "TextVQA" +description: "Lance-formatted version of TextVQA — VQA where the question requires reading text in the image — sourced from lmms-lab/textvqa." +--- + + +Source dataset card and downloadable files for `lance-format/textvqa-lance`. + + +Lance-formatted version of [TextVQA](https://textvqa.org/) — VQA where the question requires *reading* text in the image — sourced from [`lmms-lab/textvqa`](https://huggingface.co/datasets/lmms-lab/textvqa). + +Each row carries the image bytes, the question, the 10 reference answers, the OCR tokens detected by the dataset's pre-processing, and CLIP image + question embeddings. 
+ +## Splits + +| Split | Rows | +|-------|------| +| `validation.lance` | 5,000 | +| `train.lance` | 34,602 | + +## Schema + +| Column | Type | Notes | +|---|---|---| +| `id` | `int64` | Row index within split | +| `image` | `large_binary` | Inline JPEG bytes | +| `image_id` | `string?` | TextVQA image id | +| `question_id` | `string?` | TextVQA question id | +| `question` | `string` | The question text | +| `answers` | `list` | 10 annotator answers | +| `answer` | `string` | First answer — used as canonical / FTS target | +| `ocr_tokens` | `list` | OCR tokens detected on the image | +| `image_classes` | `list` | OpenImages-style scene tags from the source | +| `set_name` | `string?` | Source partition (`train`, `val`) | +| `image_emb` | `fixed_size_list` | OpenCLIP image embedding (cosine-normalized) | +| `question_emb` | `fixed_size_list` | OpenCLIP text embedding of the question | + +## Pre-built indices + +- `IVF_PQ` on `image_emb` and `question_emb` — `metric=cosine` +- `INVERTED` (FTS) on `question` and `answer` +- `BTREE` on `image_id`, `question_id`, `set_name` + +## Quick start + +```python +import lance +ds = lance.dataset("hf://datasets/lance-format/textvqa-lance/data/validation.lance") +print(ds.count_rows(), ds.schema.names, ds.list_indices()) +``` + +## Load with LanceDB + +These tables can also be consumed by [LanceDB](https://lancedb.github.io/lancedb/), the multimodal lakehouse and embedded search library built on top of Lance, for simplified vector search and other queries. 
+ +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/textvqa-lance/data") +tbl = db.open_table("validation") +print(f"LanceDB table opened with {len(tbl)} image-question pairs") +``` + +## Cross-modal text→image search + +```python +import lance, pyarrow as pa, open_clip, torch + +model, _, _ = open_clip.create_model_and_transforms("ViT-B-32", pretrained="laion2b_s34b_b79k") +tokenizer = open_clip.get_tokenizer("ViT-B-32") +model = model.eval().cuda().half() +with torch.no_grad(): + q = model.encode_text(tokenizer(["what brand is on this billboard?"]).cuda()) + q = (q / q.norm(dim=-1, keepdim=True)).float().cpu().numpy()[0] + +ds = lance.dataset("hf://datasets/lance-format/textvqa-lance/data/validation.lance") +emb_field = ds.schema.field("image_emb") +hits = ds.scanner( + nearest={"column": "image_emb", "q": pa.array([q.tolist()], type=emb_field.type)[0], "k": 10}, + columns=["question", "answer", "ocr_tokens"], +).to_table().to_pylist() +``` + +### LanceDB cross-modal text→image search + +```python +import lancedb, open_clip, torch + +model, _, _ = open_clip.create_model_and_transforms("ViT-B-32", pretrained="laion2b_s34b_b79k") +tokenizer = open_clip.get_tokenizer("ViT-B-32") +model = model.eval().cuda().half() +with torch.no_grad(): + q = model.encode_text(tokenizer(["what brand is on this billboard?"]).cuda()) + q = (q / q.norm(dim=-1, keepdim=True)).float().cpu().numpy()[0] + +db = lancedb.connect("hf://datasets/lance-format/textvqa-lance/data") +tbl = db.open_table("validation") + +results = ( + tbl.search(q.tolist(), vector_column_name="image_emb") + .metric("cosine") + .select(["question", "answer", "ocr_tokens"]) + .limit(10) + .to_list() +) +``` + +### LanceDB full-text search + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/textvqa-lance/data") +tbl = db.open_table("validation") + +results = ( + tbl.search("brand name") + .select(["question", "answer"]) + .limit(10) + .to_list() +) +``` + +## 
Why Lance? + +- One dataset for images + questions + answers + OCR + dual embeddings + indices — no JSON/feature folders. +- Cross-modal search and OCR-text filtering work on the same dataset on the Hub. +- Schema evolution: add columns (alternate OCR systems, model predictions) without rewriting the data. + +## Source & license + +Converted from [`lmms-lab/textvqa`](https://huggingface.co/datasets/lmms-lab/textvqa). TextVQA is released under [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/) by Singh et al. (Facebook AI Research). + +## Citation + +``` +@inproceedings{singh2019towards, + title={Towards VQA models that can read}, + author={Singh, Amanpreet and Natarajan, Vivek and Shah, Meet and Jiang, Yu and Chen, Xinlei and Batra, Dhruv and Parikh, Devi and Rohrbach, Marcus}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + year={2019} +} +``` diff --git a/docs/datasets/trivia-qa.mdx b/docs/datasets/trivia-qa.mdx new file mode 100644 index 00000000..4e5024e8 --- /dev/null +++ b/docs/datasets/trivia-qa.mdx @@ -0,0 +1,166 @@ +--- +title: "TriviaQA" +sidebarTitle: "TriviaQA" +description: "Lance-formatted version of TriviaQA (rc.nocontext config) — a question-answering dataset of trivia questions paired with answer aliases — with MiniLM sentence embeddings stored inline." +--- + + +Source dataset card and downloadable files for `lance-format/trivia-qa-lance`. + + +Lance-formatted version of [TriviaQA](https://nlp.cs.washington.edu/triviaqa/) (`rc.nocontext` config) — a question-answering dataset of trivia questions paired with answer aliases — with **MiniLM sentence embeddings** stored inline. + +## Why `rc.nocontext`? + +The full TriviaQA dataset bundles entire Wikipedia / web pages per question (`entity_pages`, `search_results`), which makes it tens of GB. 
The `rc.nocontext` slice keeps the question + answer + answer aliases in a compact form — ideal for closed-book QA, retrieval research, and as a search target. + +## Splits + +| Split | Rows | +|-------|------| +| `train.lance` | 138,384 | +| `validation.lance` | 17,944 | + +## Schema + +| Column | Type | Notes | +|---|---|---| +| `question_id` | `string` | TriviaQA question id (e.g. `tc_1`) | +| `question` | `string` | The trivia question | +| `question_source` | `string` | URL / source where the question came from | +| `answer_value` | `string` | Canonical answer | +| `answer_aliases` | `list` | Other accepted phrasings (e.g. `["Sinclair Lewis", "Harry Sinclair Lewis"]`) | +| `normalized_answer` | `string` | Lowercased / normalized form for exact-match scoring | +| `answer_type` | `string` | TriviaQA entity type (e.g. `WikipediaEntity`, `FreebaseEntity`) | +| `question_emb` | `fixed_size_list` | MiniLM embedding of `question` (cosine-normalized) | + +## Pre-built indices + +- `IVF_PQ` on `question_emb` — `metric=cosine` +- `INVERTED` on `question` +- `BTREE` on `question_id` and `answer_value` +- `BITMAP` on `answer_type` + +## Quick start + +```python +import lance + +ds = lance.dataset("hf://datasets/lance-format/trivia-qa-lance/data/train.lance") +print(ds.count_rows(), ds.schema.names, ds.list_indices()) +``` + +## Load with LanceDB + +These tables can also be consumed by [LanceDB](https://lancedb.github.io/lancedb/), the multimodal lakehouse and embedded search library built on top of Lance, for simplified vector search and other queries. 
+ +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/trivia-qa-lance/data") +tbl = db.open_table("train") +print(f"LanceDB table opened with {len(tbl)} trivia questions") +``` + +## Semantic search over questions + +```python +import lance +import pyarrow as pa +from sentence_transformers import SentenceTransformer + +encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device="cuda") +q = encoder.encode(["who painted the sistine chapel ceiling"], normalize_embeddings=True)[0] + +ds = lance.dataset("hf://datasets/lance-format/trivia-qa-lance/data/train.lance") +emb_field = ds.schema.field("question_emb") +hits = ds.scanner( + nearest={"column": "question_emb", "q": pa.array([q.tolist()], type=emb_field.type)[0], "k": 5}, + columns=["question", "answer_value", "answer_aliases"], +).to_table().to_pylist() +``` + +### LanceDB semantic search + +```python +import lancedb +from sentence_transformers import SentenceTransformer + +encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device="cuda") +q = encoder.encode(["who painted the sistine chapel ceiling"], normalize_embeddings=True)[0] + +db = lancedb.connect("hf://datasets/lance-format/trivia-qa-lance/data") +tbl = db.open_table("train") + +results = ( + tbl.search(q.tolist(), vector_column_name="question_emb") + .metric("cosine") + .select(["question", "answer_value", "answer_aliases"]) + .limit(5) + .to_list() +) +``` + +### LanceDB full-text search + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/trivia-qa-lance/data") +tbl = db.open_table("train") + +results = ( + tbl.search("sistine chapel") + .select(["question", "answer_value"]) + .limit(10) + .to_list() +) +``` + +## Filter by answer type + +```python +ds = lance.dataset("hf://datasets/lance-format/trivia-qa-lance/data/train.lance") +wiki = ds.scanner(filter="answer_type = 'WikipediaEntity'", columns=["question"], limit=5).to_table() +``` + +### Filter with 
LanceDB + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/trivia-qa-lance/data") +tbl = db.open_table("train") +wiki = ( + tbl.search() + .where("answer_type = 'WikipediaEntity'") + .select(["question"]) + .limit(5) + .to_list() +) +``` + +## Why Lance? + +- One dataset carries questions + answers + aliases + embeddings + indices — no sidecar files. +- On-disk vector and full-text indices live next to the data, so search works on local copies and on the Hub. +- Schema evolution: add columns (alternate embeddings, generated answers, task labels) without rewriting the data. + +## Source & license + +Converted from [`mandarjoshi/trivia_qa`](https://huggingface.co/datasets/mandarjoshi/trivia_qa) (`rc.nocontext`). TriviaQA is released under the Apache 2.0 license. + +## Citation + +``` +@article{joshi2017triviaqa, + title={TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension}, + author={Joshi, Mandar and Choi, Eunsol and Weld, Daniel S and Zettlemoyer, Luke}, + journal={arXiv preprint arXiv:1705.03551}, + year={2017} +} +``` diff --git a/docs/datasets/vqav2.mdx b/docs/datasets/vqav2.mdx new file mode 100644 index 00000000..09bdd60b --- /dev/null +++ b/docs/datasets/vqav2.mdx @@ -0,0 +1,210 @@ +--- +title: "VQAv2" +sidebarTitle: "VQAv2" +description: "Lance-formatted version of VQAv2 — Visual Question Answering on COCO images, sourced from lmms-lab/VQAv2. Each row is a (image, question, 10 answers) triple with two CLIP embeddings (image + question text) so the same dataset supports both visual…" +--- + + +Source dataset card and downloadable files for `lance-format/vqav2-lance`. + + +Lance-formatted version of [VQAv2](https://visualqa.org/) — Visual Question Answering on COCO images, sourced from [`lmms-lab/VQAv2`](https://huggingface.co/datasets/lmms-lab/VQAv2). 
Each row is an `(image, question, 10 answers)` triple with **two** CLIP embeddings (image + question text) so the same dataset supports both visual retrieval and question-similarity retrieval. + +## Splits + +| Split | Rows | +|-------|------| +| `validation.lance` | 214,354 | + +> **Train split note.** `lmms-lab/VQAv2` ships `train`, `validation`, `testdev`, +> and `test` parquet shards but only declares the eval splits in its +> `dataset_info`, so `datasets.load_dataset(..., split="train")` raises an error. The +> `vqav2/dataprep.py` script in the upstream `lance-format/lance-huggingface` repo builds the validation split today; +> the train split (444k rows) can be enabled in a follow-up by reading the +> `data/train-*.parquet` shards directly with PyArrow or by switching to +> `Multimodal-Fatima/VQAv2_train`. Track progress in that repo's `TRACKED_DATASETS.md`. + +## Schema + +| Column | Type | Notes | +|---|---|---| +| `id` | `int64` | Row index within split | +| `image` | `large_binary` | Inline JPEG bytes | +| `image_id` | `int64` | COCO image id | +| `question_id` | `int64` | VQAv2 question id | +| `question` | `string` | Natural-language question | +| `question_type` | `string` | First few tokens of the question (e.g. `what is`, `is the`) | +| `answer_type` | `string` | One of `yes/no`, `number`, `other` | +| `multiple_choice_answer` | `string` | Canonical (most-common) answer | +| `answers` | `list` | Raw answers from 10 annotators | +| `answer_confidences` | `list` | Parallel confidence list (`yes` / `maybe` / `no`) | +| `image_emb` | `fixed_size_list` | OpenCLIP `ViT-B-32` image embedding (cosine-normalized) | +| `question_emb` | `fixed_size_list` | OpenCLIP `ViT-B-32` text embedding of the question (cosine-normalized) | + +Because both embeddings come from the same CLIP model, they share an embedding space and cross-modal retrieval (image→question or question→image) works out of the box. 
+ +## Pre-built indices + +- `IVF_PQ` on `image_emb` and `question_emb` — `metric=cosine` +- `INVERTED` (FTS) on `question` +- `BTREE` on `image_id`, `question_id`, `multiple_choice_answer` +- `BITMAP` on `question_type`, `answer_type` + +## Quick start + +```python +import lance + +ds = lance.dataset("hf://datasets/lance-format/vqav2-lance/data/validation.lance") +print(ds.count_rows(), ds.schema.names, ds.list_indices()) +``` + +## Load with LanceDB + +These tables can also be consumed by [LanceDB](https://lancedb.github.io/lancedb/), the multimodal lakehouse and embedded search library built on top of Lance, for simplified vector search and other queries. + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/vqav2-lance/data") +tbl = db.open_table("validation") +print(f"LanceDB table opened with {len(tbl)} image-question pairs") +``` + +## Cross-modal: find an image for a free-form question + +```python +import lance +import pyarrow as pa +import open_clip +import torch + +model, _, _ = open_clip.create_model_and_transforms("ViT-B-32", pretrained="laion2b_s34b_b79k") +tokenizer = open_clip.get_tokenizer("ViT-B-32") +model = model.eval().cuda().half() +with torch.no_grad(): + q = model.encode_text(tokenizer(["what color is the dog?"]).cuda()) + q = (q / q.norm(dim=-1, keepdim=True)).float().cpu().numpy()[0] + +ds = lance.dataset("hf://datasets/lance-format/vqav2-lance/data/validation.lance") +emb_field = ds.schema.field("image_emb") +hits = ds.scanner( + nearest={"column": "image_emb", "q": pa.array([q.tolist()], type=emb_field.type)[0], "k": 5}, + columns=["image_id", "question", "multiple_choice_answer"], +).to_table().to_pylist() +``` + +### LanceDB cross-modal search + +```python +import lancedb, open_clip, torch + +model, _, _ = open_clip.create_model_and_transforms("ViT-B-32", pretrained="laion2b_s34b_b79k") +tokenizer = open_clip.get_tokenizer("ViT-B-32") +model = model.eval().cuda().half() +with torch.no_grad(): + q = 
model.encode_text(tokenizer(["what color is the dog?"]).cuda()) + q = (q / q.norm(dim=-1, keepdim=True)).float().cpu().numpy()[0] + +db = lancedb.connect("hf://datasets/lance-format/vqav2-lance/data") +tbl = db.open_table("validation") + +results = ( + tbl.search(q.tolist(), vector_column_name="image_emb") + .metric("cosine") + .select(["image_id", "question", "multiple_choice_answer"]) + .limit(5) + .to_list() +) +``` + +## Question similarity (text→text) + +```python +ds = lance.dataset("hf://datasets/lance-format/vqav2-lance/data/validation.lance") +ref = ds.take([0], columns=["question_emb", "question"]).to_pylist()[0] +emb_field = ds.schema.field("question_emb") +neighbors = ds.scanner( + nearest={"column": "question_emb", "q": pa.array([ref["question_emb"]], type=emb_field.type)[0], "k": 5}, + columns=["question", "multiple_choice_answer"], +).to_table().to_pylist() +print("query:", ref["question"]) +for n in neighbors: + print(n) +``` + +### LanceDB question similarity + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/vqav2-lance/data") +tbl = db.open_table("validation") + +ref = tbl.search().limit(1).select(["question_emb", "question"]).to_list()[0] +query_embedding = ref["question_emb"] + +results = ( + tbl.search(query_embedding, vector_column_name="question_emb") + .metric("cosine") + .select(["question", "multiple_choice_answer"]) + .limit(5) + .to_list() +) +``` + +## Filter by question / answer type + +```python +ds = lance.dataset("hf://datasets/lance-format/vqav2-lance/data/validation.lance") +yesno = ds.scanner(filter="answer_type = 'yes/no'", columns=["question", "multiple_choice_answer"], limit=5).to_table() +counts = ds.scanner(filter="answer_type = 'number'", columns=["question", "multiple_choice_answer"], limit=5).to_table() +``` + +### Filter with LanceDB + +```python +import lancedb + +db = lancedb.connect("hf://datasets/lance-format/vqav2-lance/data") +tbl = db.open_table("validation") +yesno = ( + 
tbl.search() + .where("answer_type = 'yes/no'") + .select(["question", "multiple_choice_answer"]) + .limit(5) + .to_list() +) +counts = ( + tbl.search() + .where("answer_type = 'number'") + .select(["question", "multiple_choice_answer"]) + .limit(5) + .to_list() +) +``` + +## Why Lance? + +- One dataset for images + questions + answers + dual embeddings + indices — no JSON/CSV sidecars. +- On-disk vector and FTS indices live next to the data, so search works on local copies and on the Hub. +- Schema evolution: add columns (alternate embeddings, model predictions, generated answers) without rewriting the data. + +## Source & license + +Converted from [`lmms-lab/VQAv2`](https://huggingface.co/datasets/lmms-lab/VQAv2). VQAv2 questions and annotations are released under [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/). The underlying images come from COCO and are subject to Flickr terms of service. See the [VQAv2 download page](https://visualqa.org/download.html) for details. + +## Citation + +``` +@inproceedings{goyal2017making, + title={Making the V in VQA Matter: Elevating the Role of Image Understanding in Visual Question Answering}, + author={Goyal, Yash and Khot, Tejas and Summers-Stay, Douglas and Batra, Dhruv and Parikh, Devi}, + booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + year={2017} +} +``` diff --git a/docs/docs.json b/docs/docs.json index fab2330a..0f69d23c 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -6,7 +6,7 @@ "theme": "mint", "name": "LanceDB", "banner": { - "content": "[Why Multimodal Data Needs a Better Lakehouse? — Download the Research Study](https://lancedb.com/download/)", + "content": "[Why Multimodal Data Needs a Better Lakehouse? 
\u2014 Download the Research Study](https://lancedb.com/download/)", "dismissible": true }, "colors": { @@ -276,6 +276,7 @@ "group": "AI Platforms & Frameworks", "pages": [ "integrations/ai/agno", + "integrations/ai/huggingface", "integrations/ai/langchain", "integrations/ai/llamaIndex", "integrations/ai/genkit", @@ -283,13 +284,6 @@ "integrations/ai/prompttools", "integrations/ai/synthetic-data-kit" ] - }, - { - "group": "Hugging Face Hub", - "pages": [ - "huggingface/overview", - "huggingface/datasets" - ] } ] } @@ -336,6 +330,92 @@ } ] }, + { + "tab": "Datasets", + "groups": [ + { + "group": "Overview", + "pages": [ + "datasets/index" + ] + }, + { + "group": "Image Classification", + "pages": [ + "datasets/mnist", + "datasets/cifar10", + "datasets/fashion-mnist", + "datasets/food101", + "datasets/oxford-pets", + "datasets/stanford-cars", + "datasets/imagenet-1k-val", + "datasets/eurosat" + ] + }, + { + "group": "Object Detection & Segmentation", + "pages": [ + "datasets/coco-detection-2017", + "datasets/pascal-voc-2012-segmentation", + "datasets/ade20k", + "datasets/kitti-2d-detection" + ] + }, + { + "group": "Image Retrieval", + "pages": [ + "datasets/coco-captions-2017", + "datasets/flickr30k", + "datasets/laion-1m" + ] + }, + { + "group": "Visual Question Answering", + "pages": [ + "datasets/chartqa", + "datasets/docvqa", + "datasets/textvqa", + "datasets/vqav2", + "datasets/gqa-testdev-balanced" + ] + }, + { + "group": "Text QA", + "pages": [ + "datasets/squad-v2", + "datasets/trivia-qa", + "datasets/hotpotqa-distractor", + "datasets/natural-questions-val", + "datasets/ms-marco-v2.1" + ] + }, + { + "group": "Text Corpora", + "pages": [ + "datasets/fineweb-edu" + ] + }, + { + "group": "Speech", + "pages": [ + "datasets/librispeech-clean" + ] + }, + { + "group": "Video", + "pages": [ + "datasets/openvid" + ] + }, + { + "group": "Robotics", + "pages": [ + "datasets/lerobot-pusht", + "datasets/lerobot-xvla-soft-fold" + ] + } + ] + }, { "tab": "API Reference", 
"groups": [ @@ -422,8 +502,12 @@ "destination": "/enterprise/benchmarks" }, { - "source": "/integrations/ai/huggingface", - "destination": "/huggingface/overview" + "source": "/huggingface/overview", + "destination": "/integrations/ai/huggingface" + }, + { + "source": "/huggingface/datasets", + "destination": "/datasets" } ] } diff --git a/docs/huggingface/datasets.mdx b/docs/huggingface/datasets.mdx deleted file mode 100644 index d0d42c91..00000000 --- a/docs/huggingface/datasets.mdx +++ /dev/null @@ -1,179 +0,0 @@ ---- -title: "Datasets" -sidebarTitle: "Datasets" -description: "Browse Lance-format datasets ready to query on the Hugging Face Hub." ---- - -The [`lance-format`](https://huggingface.co/lance-format) organization on Hugging Face publishes a growing -catalog of multimodal datasets in Lance format. Each one bundles the raw data (images, audio, video, or text), -pre-computed embeddings, and on-disk vector / full-text indices as first-class columns in the same dataset — -so vector search, full-text search, and filtered scans work directly via `hf://` URIs without downloading. - -This is powered under the hood by the [Lance format's native Hugging Face integration](https://lance.org/integrations/huggingface/) -(via the [`pylance`](https://pypi.org/project/pylance/) library). LanceDB sits on top of Lance and gives you a -convenient table-style interface to query these datasets straight from the Hub: - -```python -import lancedb - -db = lancedb.connect("hf://datasets/lance-format//data") -tbl = db.open_table("train") - -# Vector search, full-text search, or filtered scans — directly on the Hub -results = tbl.search(query).limit(10).to_list() -``` - -Click any card below to view a dataset on the Hub. For a complete walkthrough, -see the [Hugging Face Hub overview](/huggingface/overview). - -## Image Classification - - - - `lance-format/mnist-lance` — 70,000 28×28 grayscale handwritten digits. The classic image-classification benchmark. 
- - - `lance-format/cifar10-lance` — 60,000 32×32 RGB images across 10 everyday object classes. - - - `lance-format/fashion-mnist-lance` — 70,000 28×28 grayscale clothing images across 10 categories. A harder drop-in replacement for MNIST. - - - `lance-format/food101-lance` — 101,000 food photographs across 101 dish classes. Fine-grained image classification. - - - `lance-format/oxford-pets-lance` — 7,390 cat and dog photos across 37 breeds, with an `is_dog` filter column. - - - `lance-format/stanford-cars-lance` — 8,144 images across 196 fine-grained car make / model / year classes, with BLIP captions. - - - `lance-format/imagenet-1k-val-lance` — The canonical 50,000-image ILSVRC2012 validation split. Research use only. - - - `lance-format/eurosat-lance` — 27,000 Sentinel-2 satellite tiles across 10 land-cover classes. The canonical remote-sensing classification benchmark. - - - -## Object Detection & Segmentation - - - - `lance-format/coco-detection-2017-lance` — 123,287 images with per-image bounding boxes and category labels across 80 classes. - - - `lance-format/pascal-voc-2012-segmentation-lance` — 2,913 image / mask pairs across 20 semantic classes. Small-scale semantic-segmentation benchmark. - - - `lance-format/ade20k-lance` — 27,574 scene images with semantic and instance segmentation maps, scene labels, and per-object metadata. - - - `lance-format/kitti-2d-detection-lance` — 7,481 driving-scene images with 2D + 3D bounding-box annotations. The canonical autonomous-driving benchmark. - - - -## Image Retrieval - - - - `lance-format/coco-captions-2017-lance` — Each row is one image paired with 5–7 human-written captions. The standard image-captioning corpus. - - - `lance-format/flickr30k-lance` — 31,783 images each paired with 5 human-written captions. - - - `lance-format/laion-1m` — ~1 million image–caption pairs from the LAION-5B corpus with rich metadata (URL, similarity, NSFW flags). 
- - - -## Visual Question Answering - - - - `lance-format/chartqa-lance` — 2,500 VQA test questions over scientific and business charts that combine logical and visual reasoning. - - - `lance-format/docvqa-lance` — ~10.5K VQA pairs over document images: forms, receipts, scans, and multi-page reports. - - - `lance-format/textvqa-lance` — 39,602 VQA pairs where every answer requires reading text inside the image. Includes per-image OCR tokens. - - - `lance-format/vqav2-lance` — 214,354 (image, question, 10 answers) triples over COCO images. The standard VQA benchmark. - - - `lance-format/gqa-testdev-balanced-lance` — 12,578 compositional VQA questions over 398 scene-graph-grounded images, with reasoning-program tags. - - - -## Text QA - - - - `lance-format/squad-v2-lance` — 142,192 Wikipedia-grounded questions including unanswerable "is impossible" examples. Stanford's reading-comprehension benchmark. - - - `lance-format/trivia-qa-lance` — 156,328 trivia questions paired with canonical answers and all accepted answer aliases. - - - `lance-format/hotpotqa-distractor-lance` — 97,852 multi-hop QA examples where each answer requires combining facts from two Wikipedia paragraphs. - - - `lance-format/natural-questions-val-lance` — 7,830 real Google search queries paired with the full Wikipedia article each answer lives in. - - - `lance-format/ms-marco-v2.1-lance` — ~910K real Bing user queries, each with 10 candidate passages and human-written reference answers. - - - -## Text Corpora - - - - `lance-format/fineweb-edu` — A multi-million-passage educational web corpus with cleaned text and rich metadata, designed for retrieval-heavy workloads. - - - -## Speech - - - - `lance-format/librispeech-clean-lance` — ~34K English audiobook utterances (FLAC) with reference transcripts. The standard ASR benchmark. - - - -## Video - - - - `lance-format/openvid-lance` — ~938K high-quality short videos with captions and per-clip aesthetic, motion, and camera-motion scores. 
- - - -## Robotics - - - - `lance-format/lerobot-pusht-lance` — The canonical PushT benchmark from the Diffusion Policy paper, published as three Lance tables (`frames`, `episodes`, `videos`). - - - `lance-format/lerobot-xvla-soft-fold` — 1,542 episodes / 2.85M frames at 20 FPS across 3 camera streams, with aligned robot state and action vectors. - - - -## Share your own dataset - -Got a multimodal dataset you want to publish? Convert it to Lance and push it to the Hub! -Anyone who opens it gets vector search, full-text search, and filtered scans on the data out of the box, -without recreating the embeddings or indexes on their end. - - -A step-by-step walkthrough on the LanceDB blog covering CLI setup, packaging your dataset, pushing to your namespace, and writing a dataset card. - - -Or browse the [latest trending Lance datasets](https://huggingface.co/datasets?format=format:lance&sort=trending) on Hugging Face. - diff --git a/docs/huggingface/overview.mdx b/docs/integrations/ai/huggingface.mdx similarity index 99% rename from docs/huggingface/overview.mdx rename to docs/integrations/ai/huggingface.mdx index 7350a3ff..58be5d4d 100644 --- a/docs/huggingface/overview.mdx +++ b/docs/integrations/ai/huggingface.mdx @@ -1,6 +1,6 @@ --- title: "Hugging Face Hub" -sidebarTitle: "Overview" +sidebarTitle: "Hugging Face" description: "Use LanceDB directly on Lance datasets hosted on the Hugging Face Hub for multimodal search and retrieval." --- diff --git a/pyproject.toml b/pyproject.toml index c9f2739a..055e53f6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,4 +14,5 @@ dependencies = [ "pytest-asyncio>=1.3.0", "Pillow>=12.1.1", "geneva>=0.12.0", + "pyyaml>=6.0", ] diff --git a/scripts/hf_datasets.yaml b/scripts/hf_datasets.yaml new file mode 100644 index 00000000..c149b72c --- /dev/null +++ b/scripts/hf_datasets.yaml @@ -0,0 +1,70 @@ +# Lance datasets on Hugging Face — sync config for `make hf-sync`. 
+# +# Each entry maps three things explicitly because directory names in +# lance-format/lance-huggingface, the HF Hub repo slugs, and the URL slugs we +# want under /datasets/ do not have a derivable relationship (e.g. +# imagenet1k_val -> imagenet-1k-val-lance -> /datasets/imagenet-1k-val). +# +# Fields: +# dir directory inside lance-format/lance-huggingface containing HF_DATASET_CARD.md +# slug output filename and URL slug under /datasets/ (no .mdx extension) +# hf HF Hub repo name under lance-format/ (drives the "View on Hugging Face" link) +# title human-readable display name; used for page title, sidebar, and landing-page card + +categories: + - name: "Image Classification" + datasets: + - { dir: mnist, slug: mnist, hf: mnist-lance, title: "MNIST" } + - { dir: cifar10, slug: cifar10, hf: cifar10-lance, title: "CIFAR-10" } + - { dir: fashion_mnist, slug: fashion-mnist, hf: fashion-mnist-lance, title: "Fashion-MNIST" } + - { dir: food101, slug: food101, hf: food101-lance, title: "Food-101" } + - { dir: oxford_pets, slug: oxford-pets, hf: oxford-pets-lance, title: "Oxford-IIIT Pet" } + - { dir: stanford_cars, slug: stanford-cars, hf: stanford-cars-lance, title: "Stanford Cars" } + - { dir: imagenet1k_val, slug: imagenet-1k-val, hf: imagenet-1k-val-lance, title: "ImageNet-1k Validation" } + - { dir: eurosat, slug: eurosat, hf: eurosat-lance, title: "EuroSAT" } + + - name: "Object Detection & Segmentation" + datasets: + - { dir: coco_detection_2017, slug: coco-detection-2017, hf: coco-detection-2017-lance, title: "COCO 2017 Detection" } + - { dir: pascal_voc_2012, slug: pascal-voc-2012-segmentation, hf: pascal-voc-2012-segmentation-lance, title: "Pascal VOC 2012 Segmentation" } + - { dir: ade20k, slug: ade20k, hf: ade20k-lance, title: "ADE20K" } + - { dir: kitti, slug: kitti-2d-detection, hf: kitti-2d-detection-lance, title: "KITTI 2D Detection" } + + - name: "Image Retrieval" + datasets: + - { dir: coco_captions_2017, slug: coco-captions-2017, hf: 
coco-captions-2017-lance, title: "COCO Captions 2017" } + - { dir: flickr30k, slug: flickr30k, hf: flickr30k-lance, title: "Flickr30k" } + - { dir: laion-1M, slug: laion-1m, hf: laion-1m, title: "LAION-1M" } + + - name: "Visual Question Answering" + datasets: + - { dir: chartqa, slug: chartqa, hf: chartqa-lance, title: "ChartQA" } + - { dir: docvqa, slug: docvqa, hf: docvqa-lance, title: "DocVQA" } + - { dir: textvqa, slug: textvqa, hf: textvqa-lance, title: "TextVQA" } + - { dir: vqav2, slug: vqav2, hf: vqav2-lance, title: "VQAv2" } + - { dir: gqa, slug: gqa-testdev-balanced, hf: gqa-testdev-balanced-lance, title: "GQA testdev-balanced" } + + - name: "Text QA" + datasets: + - { dir: squad_v2, slug: squad-v2, hf: squad-v2-lance, title: "SQuAD v2" } + - { dir: triviaqa, slug: trivia-qa, hf: trivia-qa-lance, title: "TriviaQA" } + - { dir: hotpotqa, slug: hotpotqa-distractor, hf: hotpotqa-distractor-lance, title: "HotpotQA distractor" } + - { dir: natural_questions, slug: natural-questions-val, hf: natural-questions-val-lance, title: "Natural Questions Validation" } + - { dir: ms_marco, slug: ms-marco-v2.1, hf: ms-marco-v2.1-lance, title: "MS MARCO v2.1" } + + - name: "Text Corpora" + datasets: + - { dir: fineweb_edu, slug: fineweb-edu, hf: fineweb-edu, title: "FineWeb-Edu" } + + - name: "Speech" + datasets: + - { dir: librispeech, slug: librispeech-clean, hf: librispeech-clean-lance, title: "LibriSpeech clean" } + + - name: "Video" + datasets: + - { dir: openvid_hf, slug: openvid, hf: openvid-lance, title: "OpenVid-1M" } + + - name: "Robotics" + datasets: + - { dir: lerobot/pusht, slug: lerobot-pusht, hf: lerobot-pusht-lance, title: "LeRobot PushT" } + - { dir: lerobot/xvla-soft-fold, slug: lerobot-xvla-soft-fold, hf: lerobot-xvla-soft-fold, title: "LeRobot X-VLA Soft-Fold" } diff --git a/scripts/sync_hf_datasets.py b/scripts/sync_hf_datasets.py new file mode 100644 index 00000000..8537055b --- /dev/null +++ b/scripts/sync_hf_datasets.py @@ -0,0 +1,377 @@ +""" +Sync 
Lance dataset cards from `lance-format/lance-huggingface` into Mintlify pages. + +For each entry in `scripts/hf_datasets.yaml`, this script: + + 1. Fetches `HF_DATASET_CARD.md` from the GitHub raw URL. + 2. Transforms it into MDX (rewrites frontmatter, strips the H1, injects a + "View on Hugging Face" card). + 3. Writes `docs/datasets/<slug>.mdx`. + +It also keeps `docs/docs.json` and the auto-generated card grid in +`docs/datasets/index.mdx` in sync with the same config file, so adding a new +dataset is a single-line edit to `hf_datasets.yaml` plus `make hf-sync`. + +Hard-fails on any fetch error or unexpected card structure — partial +regeneration would be worse than a clear failure. +""" + +from __future__ import annotations + +import argparse +import json +import re +import sys +import urllib.error +import urllib.request +from dataclasses import dataclass +from pathlib import Path +from typing import Iterable + +import yaml + +REPO_ROOT = Path(__file__).resolve().parent.parent +CONFIG_PATH = REPO_ROOT / "scripts" / "hf_datasets.yaml" +DATASETS_DIR = REPO_ROOT / "docs" / "datasets" +INDEX_PATH = DATASETS_DIR / "index.mdx" +DOCS_JSON_PATH = REPO_ROOT / "docs" / "docs.json" + +RAW_URL = "https://raw.githubusercontent.com/lance-format/lance-huggingface/main/{dir}/HF_DATASET_CARD.md" +HF_URL = "https://huggingface.co/datasets/lance-format/{hf}" + +SYNC_START = "{/* HF_SYNC:START — content between these markers is generated by `make hf-sync`. Do not edit by hand. */}" +SYNC_END = "{/* HF_SYNC:END */}" + +DESCRIPTION_MAX_CHARS = 250 + + +@dataclass(frozen=True) +class Dataset: + dir: str + slug: str + hf: str + title: str + category: str + + +@dataclass(frozen=True) +class Category: + name: str + datasets: tuple[Dataset, ...]
+ + +# ----- config loading --------------------------------------------------------- + + +def load_config() -> list[Category]: + with CONFIG_PATH.open() as f: + raw = yaml.safe_load(f) + categories: list[Category] = [] + for cat in raw["categories"]: + datasets = tuple( + Dataset( + dir=d["dir"], + slug=d["slug"], + hf=d["hf"], + title=d["title"], + category=cat["name"], + ) + for d in cat["datasets"] + ) + categories.append(Category(name=cat["name"], datasets=datasets)) + return categories + + +# ----- fetching --------------------------------------------------------------- + + +def fetch_card(ds: Dataset) -> str: + url = RAW_URL.format(dir=ds.dir) + try: + with urllib.request.urlopen(url, timeout=30) as resp: + if resp.status != 200: + raise RuntimeError(f"{ds.slug}: HTTP {resp.status} from {url}") + return resp.read().decode("utf-8") + except urllib.error.HTTPError as exc: + raise RuntimeError(f"{ds.slug}: HTTP {exc.code} from {url}") from exc + except urllib.error.URLError as exc: + raise RuntimeError(f"{ds.slug}: network error fetching {url}: {exc}") from exc + + +# ----- card transform --------------------------------------------------------- + + +FRONTMATTER_RE = re.compile(r"\A---\n(.*?)\n---\n?", re.DOTALL) +H1_RE = re.compile(r"^# .+?\n", re.MULTILINE) +LINK_RE = re.compile(r"\[([^\]]+)\]\([^)]+\)") +INLINE_CODE_RE = re.compile(r"`([^`]+)`") +BOLD_ITALIC_RE = re.compile(r"\*+([^*]+)\*+") +BARE_URL_RE = re.compile(r"https?://\S+") +WHITESPACE_RE = re.compile(r"\s+") +BIBTEX_OPEN_RE = re.compile(r"^@\w+\{") +EMPTY_ANGLE_RE = re.compile(r"<>") + + +def split_frontmatter(text: str) -> tuple[dict, str]: + """Return (parsed YAML frontmatter, body without the frontmatter block).""" + m = FRONTMATTER_RE.match(text) + if not m: + return {}, text + fm = yaml.safe_load(m.group(1)) or {} + body = text[m.end() :] + return fm, body + + +def first_paragraph(body: str) -> str: + """Return the first non-empty prose paragraph, skipping headers and code fences.""" + 
in_code = False + para: list[str] = [] + for line in body.splitlines(): + stripped = line.strip() + if stripped.startswith("```"): + in_code = not in_code + if para: + break + continue + if in_code: + continue + if not stripped: + if para: + break + continue + if stripped.startswith("#"): + continue + para.append(stripped) + return " ".join(para) + + +def plain_text(s: str) -> str: + """Strip markdown formatting + URLs to a clean prose string.""" + s = LINK_RE.sub(r"\1", s) + s = BARE_URL_RE.sub("", s) + s = INLINE_CODE_RE.sub(r"\1", s) + s = BOLD_ITALIC_RE.sub(r"\1", s) + s = WHITESPACE_RE.sub(" ", s).strip() + return s + + +def truncate(s: str, limit: int) -> str: + if len(s) <= limit: + return s + cut = s[: limit - 1].rsplit(" ", 1)[0] + return cut.rstrip(",.;:") + "…" + + +def sanitize_for_mdx(body: str) -> str: + """Defensive fixes for known MDX hazards in upstream cards. + + Walks line-by-line so transforms apply only to prose, not code. + + - Wraps orphan bibtex entries (lines starting with `@article{`, `@inproceedings{`, + etc. outside a fenced block) in a ```bibtex fence. MDX otherwise parses + `{...}` as a JSX expression and rejects unbalanced citation contents. + - Escapes the empty JSX-fragment literal `<>` (e.g. `Lance<>HF` in prose) + that MDX treats as an unclosed React fragment. + """ + lines = body.splitlines() + out: list[str] = [] + in_fence = False + i = 0 + while i < len(lines): + line = lines[i] + stripped = line.lstrip() + if stripped.startswith("```"): + in_fence = not in_fence + out.append(line) + i += 1 + continue + if not in_fence and BIBTEX_OPEN_RE.match(stripped): + depth = 0 + j = i + while j < len(lines): + depth += lines[j].count("{") - lines[j].count("}") + if depth <= 0: + break + j += 1 + out.append("```bibtex") + out.extend(lines[i : j + 1]) + out.append("```") + i = j + 1 + continue + if not in_fence: + # Escape `<>` outside inline code spans. 
+ parts = line.split("`") + for k in range(0, len(parts), 2): + parts[k] = EMPTY_ANGLE_RE.sub("<>", parts[k]) + line = "`".join(parts) + out.append(line) + i += 1 + return "\n".join(out) + + +def transform_card(ds: Dataset, card_text: str) -> tuple[str, str]: + """Convert an upstream HF dataset card to a Mintlify-ready MDX page. + + Returns (mdx_page_content, description) — the description is reused by + the landing-page card grid so summaries stay in sync. + """ + _, body = split_frontmatter(card_text) + body = H1_RE.sub("", body, count=1).lstrip() + + description = truncate(plain_text(first_paragraph(body)), DESCRIPTION_MAX_CHARS) + body = sanitize_for_mdx(body) + hf_url = HF_URL.format(hf=ds.hf) + + frontmatter = ( + "---\n" + f'title: "{ds.title}"\n' + f'sidebarTitle: "{ds.title}"\n' + f'description: "{description}"\n' + "---\n\n" + ) + hf_card = ( + f'\n" + f"Source dataset card and downloadable files for `lance-format/{ds.hf}`.\n" + f"\n\n" + ) + return frontmatter + hf_card + body.rstrip() + "\n", description + + +# ----- index.mdx card grid ---------------------------------------------------- + + +def render_card_grid( + categories: Iterable[Category], descriptions: dict[str, str] +) -> str: + parts: list[str] = [] + for cat in categories: + parts.append(f"## {cat.name}\n") + parts.append("") + for ds in cat.datasets: + desc = descriptions[ds.slug] + parts.append( + f' \n' + f" `lance-format/{ds.hf}` — {desc}\n" + f" " + ) + parts.append("\n") + return "\n".join(parts) + + +def update_index_page( + categories: list[Category], descriptions: dict[str, str] +) -> None: + text = INDEX_PATH.read_text() + start = text.find(SYNC_START) + end = text.find(SYNC_END) + if start == -1 or end == -1 or end < start: + raise RuntimeError( + f"Could not find HF_SYNC markers in {INDEX_PATH.relative_to(REPO_ROOT)}. " + "Restore the start/end markers and re-run." 
+ ) + new_body = ( + text[: start + len(SYNC_START)] + + "\n\n" + + render_card_grid(categories, descriptions) + + "\n" + + text[end:] + ) + INDEX_PATH.write_text(new_body) + + +# ----- docs.json -------------------------------------------------------------- + + +def update_docs_json(categories: list[Category]) -> None: + with DOCS_JSON_PATH.open() as f: + docs = json.load(f) + + groups = [{"group": "Overview", "pages": ["datasets/index"]}] + for cat in categories: + groups.append( + { + "group": cat.name, + "pages": [f"datasets/{ds.slug}" for ds in cat.datasets], + } + ) + + tabs = docs["navigation"]["tabs"] + for tab in tabs: + if tab.get("tab") == "Datasets": + tab["groups"] = groups + break + else: + raise RuntimeError( + "No 'Datasets' tab found in docs.json — add the tab scaffold before running sync." + ) + + DOCS_JSON_PATH.write_text(json.dumps(docs, indent=2) + "\n") + + +# ----- driver ----------------------------------------------------------------- + + +def sync(dry_run: bool = False) -> None: + categories = load_config() + all_datasets = [ds for cat in categories for ds in cat.datasets] + print( + f"Syncing {len(all_datasets)} datasets across " + f"{len(categories)} categories…" + ) + + DATASETS_DIR.mkdir(parents=True, exist_ok=True) + + config_slugs = {ds.slug for ds in all_datasets} + existing_pages = { + p.stem for p in DATASETS_DIR.glob("*.mdx") if p.stem != "index" + } + stale = existing_pages - config_slugs + if stale: + print(f" Will remove stale pages: {sorted(stale)}") + + descriptions: dict[str, str] = {} + for ds in all_datasets: + print(f" • {ds.category} / {ds.slug} ← {ds.dir}") + card = fetch_card(ds) + mdx, description = transform_card(ds, card) + descriptions[ds.slug] = description + out_path = DATASETS_DIR / f"{ds.slug}.mdx" + if dry_run: + print(f" (dry run, would write {out_path.relative_to(REPO_ROOT)})") + else: + out_path.write_text(mdx) + + if not dry_run: + for slug in stale: + (DATASETS_DIR / f"{slug}.mdx").unlink() + 
update_index_page(categories, descriptions) + update_docs_json(categories) + print( + f"Wrote {len(all_datasets)} dataset pages, updated " + f"{INDEX_PATH.relative_to(REPO_ROOT)} and " + f"{DOCS_JSON_PATH.relative_to(REPO_ROOT)}." + ) + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--dry-run", + action="store_true", + help="Fetch and transform, but do not write any files.", + ) + args = parser.parse_args() + try: + sync(dry_run=args.dry_run) + except RuntimeError as exc: + print(f"error: {exc}", file=sys.stderr) + return 1 + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/uv.lock b/uv.lock index 6a271e9d..3e830566 100644 --- a/uv.lock +++ b/uv.lock @@ -374,6 +374,7 @@ dependencies = [ { name = "pyarrow" }, { name = "pytest" }, { name = "pytest-asyncio" }, + { name = "pyyaml" }, ] [package.metadata] @@ -387,6 +388,7 @@ requires-dist = [ { name = "pyarrow", specifier = ">=23.0.1" }, { name = "pytest", specifier = ">=9.0.1" }, { name = "pytest-asyncio", specifier = ">=1.3.0" }, + { name = "pyyaml", specifier = ">=6.0" }, ] [[package]] From 6b79fd868ccbbe5aad8c9e4fb0ac4630b2aeab55 Mon Sep 17 00:00:00 2001 From: prrao87 <35005448+prrao87@users.noreply.github.com> Date: Tue, 12 May 2026 10:38:40 -0700 Subject: [PATCH 2/3] fix(datasets): drop period from ms-marco slug so Mintlify resolves the page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The slug ms-marco-v2.1 contains a period that Mintlify can mis-parse as a file extension, which can silently drop the entire Datasets tab from the top nav. Use ms-marco-v2 instead — the v2.1 detail is still in the page title and the source repo name on Hugging Face. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/datasets/index.mdx | 2 +- docs/datasets/{ms-marco-v2.1.mdx => ms-marco-v2.mdx} | 0 docs/docs.json | 2 +- scripts/hf_datasets.yaml | 2 +- 4 files changed, 3 insertions(+), 3 deletions(-) rename docs/datasets/{ms-marco-v2.1.mdx => ms-marco-v2.mdx} (100%) diff --git a/docs/datasets/index.mdx b/docs/datasets/index.mdx index 7879f128..7f43b7ff 100644 --- a/docs/datasets/index.mdx +++ b/docs/datasets/index.mdx @@ -123,7 +123,7 @@ integration itself, see the [Hugging Face Hub integration page](/integrations/ai `lance-format/natural-questions-val-lance` — Lance-formatted version of the Natural Questions validation split — 7,830 real Google search queries with their full Wikipedia articles and 1–5 annotator labels per question. Sourced from google-research-datasets/natural_questions. - + `lance-format/ms-marco-v2.1-lance` — Lance-formatted version of MS MARCO v2.1 — Microsoft's machine reading comprehension benchmark — with MiniLM query embeddings stored inline alongside the candidate passages and human-written answers. 
diff --git a/docs/datasets/ms-marco-v2.1.mdx b/docs/datasets/ms-marco-v2.mdx similarity index 100% rename from docs/datasets/ms-marco-v2.1.mdx rename to docs/datasets/ms-marco-v2.mdx diff --git a/docs/docs.json b/docs/docs.json index 0f69d23c..a9f9c400 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -386,7 +386,7 @@ "datasets/trivia-qa", "datasets/hotpotqa-distractor", "datasets/natural-questions-val", - "datasets/ms-marco-v2.1" + "datasets/ms-marco-v2" ] }, { diff --git a/scripts/hf_datasets.yaml b/scripts/hf_datasets.yaml index c149b72c..3f7b7c82 100644 --- a/scripts/hf_datasets.yaml +++ b/scripts/hf_datasets.yaml @@ -50,7 +50,7 @@ categories: - { dir: triviaqa, slug: trivia-qa, hf: trivia-qa-lance, title: "TriviaQA" } - { dir: hotpotqa, slug: hotpotqa-distractor, hf: hotpotqa-distractor-lance, title: "HotpotQA distractor" } - { dir: natural_questions, slug: natural-questions-val, hf: natural-questions-val-lance, title: "Natural Questions Validation" } - - { dir: ms_marco, slug: ms-marco-v2.1, hf: ms-marco-v2.1-lance, title: "MS MARCO v2.1" } + - { dir: ms_marco, slug: ms-marco-v2, hf: ms-marco-v2.1-lance, title: "MS MARCO v2.1" } - name: "Text Corpora" datasets: From b0504d051ba7acc18cf527928962814843d21e6c Mon Sep 17 00:00:00 2001 From: prrao87 <35005448+prrao87@users.noreply.github.com> Date: Tue, 12 May 2026 11:04:47 -0700 Subject: [PATCH 3/3] Update snippets --- docs/snippets/search.mdx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/snippets/search.mdx b/docs/snippets/search.mdx index 2c61812e..591ef0b3 100644 --- a/docs/snippets/search.mdx +++ b/docs/snippets/search.mdx @@ -8,10 +8,10 @@ export const PyBasicHybridSearch = "data = [\n {\"text\": \"rebel spaceships export const PyBasicHybridSearchAsync = "uri = \"data/sample-lancedb\"\nasync_db = await lancedb.connect_async(uri)\ndata = [\n {\"text\": \"rebel spaceships striking from a hidden base\"},\n {\"text\": \"have won their first victory against the evil Galactic 
Empire\"},\n {\"text\": \"during the battle rebel spies managed to steal secret plans\"},\n {\"text\": \"to the Empire's ultimate weapon the Death Star\"},\n]\nasync_tbl = await async_db.create_table(\"documents_async\", schema=Documents)\n# ingest docs with auto-vectorization\nawait async_tbl.add(data)\n# Create a fts index before the hybrid search\nawait async_tbl.create_index(\"text\", config=FTS())\ntext_query = \"flower moon\"\n# hybrid search with default re-ranker\nawait (await async_tbl.search(\"flower moon\", query_type=\"hybrid\")).to_pandas()\n"; -export const PyClassDocuments = "class Documents(LanceModel):\n vector: Vector(embeddings.ndims()) = embeddings.VectorField()\n text: str = embeddings.SourceField()\n"; - export const PyClassDefinition = "class Metadata(BaseModel):\n source: str\n timestamp: datetime\n\n\nclass Document(BaseModel):\n content: str\n meta: Metadata\n\n\nclass LanceSchema(LanceModel):\n id: str\n vector: Vector(1536)\n payload: Document\n"; +export const PyClassDocuments = "class Documents(LanceModel):\n vector: Vector(embeddings.ndims()) = embeddings.VectorField()\n text: str = embeddings.SourceField()\n"; + export const PyCreateTableAsyncWithNestedSchema = "# Let's add 100 sample rows to our dataset\ndata = [\n LanceSchema(\n id=f\"id{i}\",\n vector=np.random.randn(1536),\n payload=Document(\n content=f\"document{i}\",\n meta=Metadata(source=f\"source{i % 10}\", timestamp=datetime.now()),\n ),\n )\n for i in range(100)\n]\n\nasync_tbl = await async_db.create_table(\n \"documents_async\", data=data, mode=\"overwrite\"\n)\n"; export const PyCreateTableWithNestedSchema = "# Let's add 100 sample rows to our dataset\ndata = [\n LanceSchema(\n id=f\"id{i}\",\n vector=np.random.randn(1536),\n payload=Document(\n content=f\"document{i}\",\n meta=Metadata(source=f\"source{i % 10}\", timestamp=datetime.now()),\n ),\n )\n for i in range(100)\n]\n\n# Synchronous client\ntbl = db.create_table(\"documents\", data=data, mode=\"overwrite\")\n";