diff --git a/docs/competition_creation.md b/docs/competition_creation.md index 79cec1c9..72b023a2 100644 --- a/docs/competition_creation.md +++ b/docs/competition_creation.md @@ -7,8 +7,9 @@ public competition-creation API endpoints (kagglesdk 0.1.31+): - [`kaggle competitions create`](#kaggle-competitions-create) - [`kaggle competitions pages create`](#kaggle-competitions-pages-create) - [`kaggle competitions launch`](#kaggle-competitions-launch) +- [`kaggle competitions data update`](#kaggle-competitions-data-update) -All four commands require an authenticated session +All of these commands require an authenticated session (`kaggle config set username/password` or an API token). A typical end-to-end host workflow looks like: @@ -27,11 +28,14 @@ kaggle competitions create -p ./my-comp kaggle competitions pages create my-comp-slug --name description -f ./description.md --publish kaggle competitions pages create my-comp-slug --name rules -f ./rules.md --publish -# 5. Launch the competition (now, or schedule a future UTC time). +# 5. Update the competition data (train.csv, test.csv, sample_submission.csv, ...). +kaggle competitions data update my-comp-slug -p ./data -m "Initial release" + +# 6. Launch the competition (now, or schedule a future UTC time). kaggle competitions launch my-comp-slug --at 2027-01-01T00:00:00Z ``` -The four commands are independent — for example, you can call `pages create` +These commands are independent — for example, you can call `pages create` on a competition that already exists, or use `launch` on a competition created via the host wizard. @@ -340,3 +344,73 @@ kaggle competitions launch my-comp --at 2027-01-01T00:00:00Z A competition can only be launched once. Subsequent calls will be rejected by the backend. + +--- + +## `kaggle competitions data update` + +Creates a new version of the data files for a competition you host. Uploads +via the standard blob-upload pipeline, then sends a single request bundling +the uploaded tokens. 
Each update **replaces the prior version's file set in +full** — there is no per-file "keep from previous" mode in v1, so list every +file you want in the new version. + +**Usage:** + +```bash +kaggle competitions data update <competition> -p <path> -m "<message>" \ + [--rerun] [--include-hidden] +``` + +**Arguments:** + +- `<competition>`: The competition slug. + +**Options:** + +- `-p, --path <path>` (required): Either a **directory** (walked recursively — + every file becomes an upload with its relative path preserved in the API's + `name` field, e.g. `train/images/img1.jpg`), or a **single archive file** + (e.g. a pre-packed `.zip` or `.tar`) uploaded as-is. Sub-directories are + always traversed; hidden entries (see `--include-hidden`) are the only files + skipped by default. +- `-m, --message "<message>"` (required): Notes describing this version + (e.g. `"Added test set"`). +- `--rerun` (optional): Update the RERUN databundle — the private host-only + data swapped in during rerun scoring. Requires Kaggle admin access for now. + Without this flag, the update targets the PUBLIC databundle (what + participants download). +- `--include-hidden` (optional): Upload hidden files and traverse hidden + sub-directories (names starting with `.` — e.g. `.DS_Store`, `.git/`, + `.gitignore`). Skipped by default so you don't accidentally publish OS + metadata or version-control detritus. + +**Examples:** + +```bash +# Update using a directory tree (recurses into sub-folders). +kaggle competitions data update my-comp -p ./data -m "Initial release" + +# Update using a pre-packed archive as a single file (useful when you already +# need a zip for other purposes, or for directory-shaped file formats like +# Zarr). +kaggle competitions data update my-comp -p ./data.zip -m "Initial release" + +# New version with a bug-fix. +kaggle competitions data update my-comp -p ./data -m "Fix label encoding in train.csv" + +# Update the private rerun-scoring data. 
+kaggle competitions data update my-comp -p ./rerun-data \ + -m "Held-out test set" --rerun +``` + +**A note on directory-shaped file formats:** some formats (Zarr, some +TensorFlow SavedModel layouts, etc.) are on-disk directories that are logically +a single unit. If you pass a directory containing such a format, the recursive +walk uploads each internal chunk as its own file — often what you want for +Zarr, since participants can then stream individual chunks. If you'd rather +keep the format as an opaque single upload, pre-pack it into a `.zip` or +`.tar` and pass that file to `-p` instead. + +The command prints the public URL plus the new `databundle_id` and +`databundle_version_id` on success. diff --git a/src/kaggle/api/kaggle_api_extended.py b/src/kaggle/api/kaggle_api_extended.py index af4ac439..99d40cf3 100644 --- a/src/kaggle/api/kaggle_api_extended.py +++ b/src/kaggle/api/kaggle_api_extended.py @@ -101,6 +101,9 @@ ApiCreateCompetitionPageRequest, ApiDeleteCompetitionPageRequest, ApiUpdateCompetitionPageRequest, + ApiCreateCompetitionDataRequest, + ApiCreateCompetitionDataResponse, + ApiCompetitionDataFile, ApiCompetitionPage, ApiCreateCompetitionRequest, ApiCreateCompetitionResponse, @@ -135,6 +138,7 @@ ) from kagglesdk.competitions.types.competition_enums import ( CompetitionListTab, + CompetitionDatabundleType, CompetitionPrivacy, HostSegment, CompetitionSortBy, @@ -2598,6 +2602,129 @@ def competition_delete_page_cli( if self.competition_delete_page(competition_name, page_name, no_confirm=no_confirm): print(f'Page "{page_name}" deleted from competition "{competition_name}".') + def competition_data_update( + self, + competition_name: str, + path: str, + version_notes: str, + rerun: bool = False, + quiet: bool = False, + include_hidden: bool = False, + ) -> ApiCreateCompetitionDataResponse: + """Update (version) the data files for a competition you host. 
+ + Uploads the files at ``path`` via the blob-upload pipeline and sends + a CreateCompetitionData request bundling the resulting tokens. Each + update replaces the prior version's file set in full. + + - If ``path`` is a single file (e.g. a pre-packed .zip or .tar), it is + uploaded as-is; the file's basename becomes its entry name. + - If ``path`` is a directory, it is walked recursively — every file + becomes its own upload with the path relative to ``path`` preserved + in the API's ``name`` field (e.g. ``train/images/img1.jpg``). + Sub-directories are always traversed. Hidden entries (names starting + with ``.``, including ``.DS_Store`` / ``.git`` / ``.gitignore``) are + skipped by default; pass ``include_hidden=True`` to upload them too. + + Args: + competition_name (str): The competition name (slug). + path (str): Path to a directory or a single archive file. + version_notes (str): Notes describing this version (required). + rerun (bool): If True, update the RERUN databundle (private + host-only data used during rerun scoring). + quiet (bool): Suppress per-file upload progress lines. + include_hidden (bool): If True, upload hidden files and traverse + hidden sub-directories. Default False. + + Returns: + ApiCreateCompetitionDataResponse: url, databundle_id, + databundle_version_id of the new version. + """ + if not version_notes or not version_notes.strip(): + raise ValueError("--message/-m version notes are required") + if not os.path.exists(path): + raise ValueError("Invalid path: " + path) + + # Collect (relative_name, full_path) tuples first so we can validate + # and then upload deterministically. + uploads: List[Tuple[str, str]] = [] + if os.path.isfile(path): + uploads.append((os.path.basename(path), path)) + else: + for dirpath, dirnames, filenames in os.walk(path): + if not include_hidden: + # Prune hidden sub-directories in place so os.walk skips them. 
+ dirnames[:] = [d for d in dirnames if not d.startswith(".")] + filenames = [n for n in filenames if not n.startswith(".")] + for name in filenames: + full = os.path.join(dirpath, name) + rel = os.path.relpath(full, path).replace(os.sep, "/") + uploads.append((rel, full)) + uploads.sort() + + if not uploads: + raise ValueError(f"No files found under {path} to upload") + + files: List[ApiCompetitionDataFile] = [] + # TODO: confirm with backend whether competition data should use + # ApiBlobType.INBOX (used here as the closest catch-all) or whether a + # dedicated COMPETITION_DATA blob type needs adding. + with ResumableUploadContext() as upload_context: + for rel_name, full_path in uploads: + upload_file = self._upload_file( + rel_name, full_path, ApiBlobType.INBOX, upload_context, quiet, resources=None + ) + if upload_file is not None: + f = ApiCompetitionDataFile() + f.name = rel_name + f.token = upload_file.token + files.append(f) + + if not files: + raise ValueError("All file uploads failed; nothing to update") + + with self.build_kaggle_client() as kaggle: + request = ApiCreateCompetitionDataRequest() + request.competition_name = competition_name + request.version_notes = version_notes + request.files = files + if rerun: + request.competition_databundle_type = CompetitionDatabundleType.COMPETITION_DATABUNDLE_TYPE_RERUN + return kaggle.competitions.competition_api_client.create_competition_data(request) + + def competition_data_update_cli( + self, + competition=None, + competition_opt=None, + path=None, + version_notes=None, + rerun=False, + quiet=False, + include_hidden=False, + ): + """CLI wrapper for competition_data_update.""" + competition_name = competition or competition_opt + if competition_name is None: + competition_name = self.get_config_value(self.CONFIG_NAME_COMPETITION) + if competition_name is not None and not quiet: + print("Using competition: " + competition_name) + if competition_name is None: + raise ValueError("No competition specified") + 
if not path: + raise ValueError("-p/--path is required (folder or archive file)") + if not version_notes: + raise ValueError("-m/--message version notes are required") + + response = self.competition_data_update( + competition_name=competition_name, + path=path, + version_notes=version_notes, + rerun=rerun, + quiet=quiet, + include_hidden=include_hidden, + ) + print(f'New data version created for "{competition_name}": {response.url}') + def competition_launch(self, competition_name: str, future_time: Optional[datetime] = None) -> None: """Launch a competition you host, optionally at a future UTC time. diff --git a/src/kaggle/cli.py b/src/kaggle/cli.py index 1390a752..a31c2189 100644 --- a/src/kaggle/cli.py +++ b/src/kaggle/cli.py @@ -558,6 +558,66 @@ def parse_competitions(subparsers) -> None: parser_competitions_pages_delete._action_groups.append(parser_competitions_pages_delete_optional) parser_competitions_pages_delete.set_defaults(func=api.competition_delete_page_cli) + # Competitions data (group: update) + parser_competitions_data = subparsers_competitions.add_parser( + "data", + formatter_class=argparse.RawTextHelpFormatter, + help=Help.command_competitions_data, + ) + subparsers_competitions_data = parser_competitions_data.add_subparsers(title="commands", dest="command") + subparsers_competitions_data.required = True + subparsers_competitions_data.choices = Help.entity_data_choices + + # Competitions data update + parser_competitions_data_update = subparsers_competitions_data.add_parser( + "update", + formatter_class=argparse.RawTextHelpFormatter, + help=Help.command_competitions_data_update, + ) + parser_competitions_data_update_optional = parser_competitions_data_update._action_groups.pop() + parser_competitions_data_update_optional.add_argument( + "competition", nargs="?", default=None, help=Help.param_competition + ) + parser_competitions_data_update_optional.add_argument( + "-c", "--competition", dest="competition_opt", required=False, 
help=argparse.SUPPRESS + ) + parser_competitions_data_update_optional.add_argument( + "-p", + "--path", + dest="path", + required=True, + help=( + "Path to upload. May be either a directory (walked recursively — " + "sub-directory paths are preserved in each file's name) or a " + "single archive file (e.g. a pre-packed .zip / .tar), which is " + "uploaded as-is." + ), + ) + parser_competitions_data_update_optional.add_argument( + "-m", + "--message", + dest="version_notes", + required=True, + help='Notes describing this version (e.g. "Added test set").', + ) + parser_competitions_data_update_optional.add_argument( + "--rerun", + dest="rerun", + action="store_true", + help="Update the RERUN databundle (private host-only data used during rerun scoring).", + ) + parser_competitions_data_update_optional.add_argument( + "--include-hidden", + dest="include_hidden", + action="store_true", + help="Include hidden files and directories (names starting with '.'). Skipped by default.", + ) + parser_competitions_data_update_optional.add_argument( + "-q", "--quiet", dest="quiet", action="store_true", help=Help.param_quiet + ) + parser_competitions_data_update._action_groups.append(parser_competitions_data_update_optional) + parser_competitions_data_update.set_defaults(func=api.competition_data_update_cli) + # Competitions launch (publish now, or schedule for a future UTC time) parser_competitions_launch = subparsers_competitions.add_parser( "launch", formatter_class=argparse.RawTextHelpFormatter, help=Help.command_competitions_launch @@ -2117,6 +2177,7 @@ class Help(object): "replay", "logs", "pages", + "data", "launch", "init", "create", @@ -2182,6 +2243,7 @@ class Help(object): forums_topics_choices = ["list", "show"] entity_topics_choices = ["list", "show"] entity_pages_choices = ["list", "create", "update", "delete"] + entity_data_choices = ["update"] config_choices = ["view", "set", "unset"] auth_choices = ["login", "print-access-token", "revoke"] @@ -2257,6 +2319,8 @@ 
class Help(object): command_competitions_pages_create = "Create a new page on a competition you host" command_competitions_pages_update = "Update fields on an existing competition page" command_competitions_pages_delete = "Delete a page from a competition you host" + command_competitions_data = "Manage a competition's data files" + command_competitions_data_update = "Update (version) the data files for a competition you host" command_competitions_launch = "Launch a competition you host, optionally at a future UTC time" command_competitions_init = "Initialize folder with a competition-metadata.json template" command_competitions_create = "Create a new competition from competition-metadata.json" diff --git a/tests/unit/test_competition_data_update.py b/tests/unit/test_competition_data_update.py new file mode 100644 index 00000000..a8653dc9 --- /dev/null +++ b/tests/unit/test_competition_data_update.py @@ -0,0 +1,280 @@ +# coding=utf-8 +import os +import sys +import tempfile +import unittest +from unittest.mock import MagicMock, patch + +sys.path.insert(0, "../..") + +from kaggle.api.kaggle_api_extended import KaggleApi +from kagglesdk.competitions.types.competition_enums import CompetitionDatabundleType + + +def _mock_upload_file(token): + uf = MagicMock() + uf.token = token + return uf + + +def _mock_response(url="https://kaggle.com/c/my-comp", db_id=1, dbv_id=42): + r = MagicMock() + r.url = url + r.databundle_id = db_id + r.databundle_version_id = dbv_id + return r + + +class TestCompetitionDataUpdate(unittest.TestCase): + """Tests for competition_data_update and its CLI wrapper.""" + + def setUp(self): + self.api = KaggleApi.__new__(KaggleApi) + self.tmp = tempfile.mkdtemp() + # Top-level files. + for name in ("train.csv", "test.csv"): + with open(os.path.join(self.tmp, name), "w") as f: + f.write("a,b\n1,2\n") + # Nested files (recursion target). 
+ os.makedirs(os.path.join(self.tmp, "images", "cats")) + with open(os.path.join(self.tmp, "images", "cats", "cat1.png"), "wb") as f: + f.write(b"\x89PNG\r\n\x1a\n") + with open(os.path.join(self.tmp, "images", "dog.png"), "wb") as f: + f.write(b"\x89PNG\r\n\x1a\n") + # Hidden entries (should be skipped by default). + with open(os.path.join(self.tmp, ".DS_Store"), "wb") as f: + f.write(b"\x00") + os.makedirs(os.path.join(self.tmp, ".git")) + with open(os.path.join(self.tmp, ".git", "config"), "w") as f: + f.write("[core]\n") + + def tearDown(self): + import shutil + + shutil.rmtree(self.tmp, ignore_errors=True) + + def _patch_client(self, mock_client, response=None): + mock_kaggle = MagicMock() + mock_kaggle.competitions.competition_api_client.create_competition_data.return_value = ( + response or _mock_response() + ) + mock_client.return_value.__enter__ = MagicMock(return_value=mock_kaggle) + mock_client.return_value.__exit__ = MagicMock(return_value=False) + return mock_kaggle + + @patch.object(KaggleApi, "_upload_file") + @patch.object(KaggleApi, "build_kaggle_client") + def test_update_directory_recurses_and_preserves_paths(self, mock_client, mock_upload): + # One token per file; order matches sorted rel path order. + mock_upload.side_effect = [_mock_upload_file(f"tok-{i}") for i in range(10)] + mock_kaggle = self._patch_client(mock_client) + + self.api.competition_data_update( + competition_name="my-comp", + path=self.tmp, + version_notes="first version", + ) + + request = mock_kaggle.competitions.competition_api_client.create_competition_data.call_args[0][0] + names = sorted(f.name for f in request.files) + self.assertEqual( + names, + ["images/cats/cat1.png", "images/dog.png", "test.csv", "train.csv"], + ) + # Names passed to _upload_file must be the same relative paths (with + # forward slashes) that appear in the API request. 
+ called_names = sorted(call.args[0] for call in mock_upload.call_args_list) + self.assertEqual(called_names, names) + + @patch.object(KaggleApi, "_upload_file") + @patch.object(KaggleApi, "build_kaggle_client") + def test_update_single_file_uploads_as_is(self, mock_client, mock_upload): + archive = os.path.join(self.tmp, "bundle.zip") + with open(archive, "wb") as f: + f.write(b"PK\x03\x04") + mock_upload.return_value = _mock_upload_file("tok-zip") + mock_kaggle = self._patch_client(mock_client) + + self.api.competition_data_update( + competition_name="my-comp", + path=archive, + version_notes="single archive", + ) + + mock_upload.assert_called_once() + # Basename becomes the API entry name; full path is uploaded. + self.assertEqual(mock_upload.call_args.args[0], "bundle.zip") + self.assertEqual(mock_upload.call_args.args[1], archive) + request = mock_kaggle.competitions.competition_api_client.create_competition_data.call_args[0][0] + self.assertEqual(len(request.files), 1) + self.assertEqual(request.files[0].name, "bundle.zip") + self.assertEqual(request.files[0].token, "tok-zip") + + @patch.object(KaggleApi, "_upload_file") + @patch.object(KaggleApi, "build_kaggle_client") + def test_update_skips_hidden_by_default(self, mock_client, mock_upload): + mock_upload.side_effect = [_mock_upload_file(f"tok-{i}") for i in range(10)] + mock_kaggle = self._patch_client(mock_client) + + self.api.competition_data_update( + competition_name="my-comp", + path=self.tmp, + version_notes="notes", + ) + + request = mock_kaggle.competitions.competition_api_client.create_competition_data.call_args[0][0] + names = sorted(f.name for f in request.files) + # Hidden files (.DS_Store) and hidden dirs (.git/) are not included. 
+ self.assertEqual( + names, + ["images/cats/cat1.png", "images/dog.png", "test.csv", "train.csv"], + ) + for n in names: + self.assertFalse(any(part.startswith(".") for part in n.split("/"))) + + @patch.object(KaggleApi, "_upload_file") + @patch.object(KaggleApi, "build_kaggle_client") + def test_update_include_hidden_uploads_them(self, mock_client, mock_upload): + mock_upload.side_effect = [_mock_upload_file(f"tok-{i}") for i in range(10)] + mock_kaggle = self._patch_client(mock_client) + + self.api.competition_data_update( + competition_name="my-comp", + path=self.tmp, + version_notes="notes", + include_hidden=True, + ) + + request = mock_kaggle.competitions.competition_api_client.create_competition_data.call_args[0][0] + names = sorted(f.name for f in request.files) + self.assertIn(".DS_Store", names) + self.assertIn(".git/config", names) + + @patch.object(KaggleApi, "_upload_file") + @patch.object(KaggleApi, "build_kaggle_client") + def test_update_rerun_sets_databundle_type(self, mock_client, mock_upload): + mock_upload.side_effect = [_mock_upload_file(f"tok-{i}") for i in range(10)] + mock_kaggle = self._patch_client(mock_client) + + self.api.competition_data_update( + competition_name="my-comp", + path=self.tmp, + version_notes="rerun data", + rerun=True, + ) + + request = mock_kaggle.competitions.competition_api_client.create_competition_data.call_args[0][0] + self.assertEqual( + request.competition_databundle_type, + CompetitionDatabundleType.COMPETITION_DATABUNDLE_TYPE_RERUN, + ) + + @patch.object(KaggleApi, "_upload_file") + @patch.object(KaggleApi, "build_kaggle_client") + def test_update_default_databundle_type_unspecified(self, mock_client, mock_upload): + mock_upload.side_effect = [_mock_upload_file(f"tok-{i}") for i in range(10)] + mock_kaggle = self._patch_client(mock_client) + + self.api.competition_data_update( + competition_name="my-comp", + path=self.tmp, + version_notes="notes", + ) + + request = 
mock_kaggle.competitions.competition_api_client.create_competition_data.call_args[0][0] + self.assertEqual( + request.competition_databundle_type, + CompetitionDatabundleType.COMPETITION_DATABUNDLE_TYPE_UNSPECIFIED, + ) + + @patch.object(KaggleApi, "_upload_file", return_value=None) + def test_update_all_uploads_fail_raises(self, mock_upload): + with self.assertRaises(ValueError) as ctx: + self.api.competition_data_update( + competition_name="my-comp", + path=self.tmp, + version_notes="notes", + ) + self.assertIn("All file uploads failed", str(ctx.exception)) + + def test_update_empty_directory_raises(self): + empty = tempfile.mkdtemp() + try: + with self.assertRaises(ValueError) as ctx: + self.api.competition_data_update( + competition_name="my-comp", + path=empty, + version_notes="notes", + ) + self.assertIn("No files found", str(ctx.exception)) + finally: + os.rmdir(empty) + + def test_update_missing_path_raises(self): + with self.assertRaises(ValueError) as ctx: + self.api.competition_data_update( + competition_name="my-comp", + path="/tmp/does-not-exist-9999", + version_notes="notes", + ) + self.assertIn("Invalid path", str(ctx.exception)) + + def test_update_blank_notes_raises(self): + with self.assertRaises(ValueError) as ctx: + self.api.competition_data_update( + competition_name="my-comp", + path=self.tmp, + version_notes=" ", + ) + self.assertIn("version notes are required", str(ctx.exception)) + + @patch.object(KaggleApi, "competition_data_update") + def test_cli_forwards_args(self, mock_update): + mock_update.return_value = _mock_response() + + self.api.competition_data_update_cli( + competition="my-comp", + path=self.tmp, + version_notes="notes", + rerun=True, + include_hidden=True, + ) + + kwargs = mock_update.call_args.kwargs + self.assertEqual(kwargs["competition_name"], "my-comp") + self.assertEqual(kwargs["path"], self.tmp) + self.assertEqual(kwargs["version_notes"], "notes") + self.assertTrue(kwargs["rerun"]) + 
self.assertTrue(kwargs["include_hidden"]) + + @patch.object(KaggleApi, "competition_data_update") + def test_cli_include_hidden_defaults_false(self, mock_update): + mock_update.return_value = _mock_response() + + self.api.competition_data_update_cli( + competition="my-comp", + path=self.tmp, + version_notes="notes", + ) + + self.assertFalse(mock_update.call_args.kwargs["include_hidden"]) + + def test_cli_missing_path_raises(self): + with self.assertRaises(ValueError) as ctx: + self.api.competition_data_update_cli(competition="my-comp", version_notes="notes") + self.assertIn("-p/--path is required", str(ctx.exception)) + + def test_cli_missing_notes_raises(self): + with self.assertRaises(ValueError) as ctx: + self.api.competition_data_update_cli(competition="my-comp", path=self.tmp) + self.assertIn("version notes are required", str(ctx.exception)) + + def test_cli_missing_competition_raises(self): + self.api.config_values = {} + with self.assertRaises(ValueError) as ctx: + self.api.competition_data_update_cli(path=self.tmp, version_notes="notes") + self.assertIn("No competition specified", str(ctx.exception)) + + +if __name__ == "__main__": + unittest.main()