diff --git a/XGBWW_OpenML_W1W2W7W8W9_for_openml_id.ipynb b/XGBWW_OpenML_W1W2W7W8W9_for_openml_id.ipynb
index 2910fcc..34ffc48 100644
--- a/XGBWW_OpenML_W1W2W7W8W9_for_openml_id.ipynb
+++ b/XGBWW_OpenML_W1W2W7W8W9_for_openml_id.ipynb
@@ -8,7 +8,6 @@
    },
    "source": "\"Open"
   },
-
   {
    "cell_type": "markdown",
    "metadata": {
@@ -48,7 +47,31 @@
      ]
     }
    ],
-   "source": "from pathlib import Path\nfrom datetime import datetime\nimport os\n\nfrom google.colab import drive\n\ndrive.mount('/content/drive', force_remount=False)\n\nRUN_NAME_BASE = \"openml_w1w2w7w8w9_for_openml_id\"\nRUNS_ROOT = Path('/content/drive/MyDrive/xgbww_runs') / RUN_NAME_BASE\nRUNS_ROOT.mkdir(parents=True, exist_ok=True)\n\nCHECKPOINT_EVERY_ROUNDS = 1 # evaluate/save every N rounds\nMAX_ROUNDS = 10000 # safety cap\nTARGET_ALPHA = 1.5 # stop when all tracked W-matrix alphas <= this value\nTEST_SIZE = 0.2\nRANDOM_STATE = 42\nFORCE_FRESH_START = True # True = ignore prior checkpoints and start over for selected dataset\nRESTART_RUNTIME_AFTER_INSTALL = False\nREUSE_LAST_MODEL = False # True = resume last selected OpenML dataset\nOPENML_MODEL_ID_INPUT = \"RANDOM\" # Set to RANDOM or an OpenML dataset/model id (e.g., 1049)\n\nprint('Runs root:', RUNS_ROOT)\nprint('Started at:', datetime.utcnow().isoformat() + 'Z')\n",
+   "source": [
+    "from pathlib import Path\n",
+    "from datetime import datetime\n",
+    "import os\n",
+    "\n",
+    "from google.colab import drive\n",
+    "\n",
+    "drive.mount('/content/drive', force_remount=False)\n",
+    "\n",
+    "RUNS_ROOT = Path('/content/drive/MyDrive/xgbww_runs')\n",
+    "RUNS_ROOT.mkdir(parents=True, exist_ok=True)\n",
+    "\n",
+    "CHECKPOINT_EVERY_ROUNDS = 1 # evaluate/save every N rounds\n",
+    "MAX_ROUNDS = 10000 # safety cap\n",
+    "TARGET_ALPHA = 1.5 # stop when all tracked W-matrix alphas <= this value\n",
+    "TEST_SIZE = 0.2\n",
+    "RANDOM_STATE = 42\n",
+    "FORCE_FRESH_START = True # True = ignore prior checkpoints and start over for selected dataset\n",
+    "RESTART_RUNTIME_AFTER_INSTALL = False\n",
+    "REUSE_LAST_MODEL = False # True = resume last selected OpenML dataset\n",
+    "OPENML_MODEL_ID_INPUT = \"RANDOM\" # Set to RANDOM or an OpenML dataset/model id (e.g., 1049)\n",
+    "\n",
+    "print('Runs root:', RUNS_ROOT)\n",
+    "print('Started at:', datetime.utcnow().isoformat() + 'Z')\n"
+   ],
    "id": "ujweiMJQcLj4"
   },
   {
@@ -193,7 +216,144 @@
    },
    "execution_count": null,
    "outputs": [],
-   "source": "import gc\nimport time\nimport json\nimport warnings\nfrom datetime import datetime\nfrom pathlib import Path\nwarnings.filterwarnings('ignore')\n\nimport numpy as np\nimport pandas as pd\nimport xgboost as xgb\nimport torch\nimport weightwatcher as ww\n\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import accuracy_score, log_loss\n\nfrom xgbwwdata import Filters, load_dataset\nfrom xgboost2ww import convert\n\nDRIVE_ROOT = None\nSTATE_PATH = None\nMETRICS_PATH = None\nMODEL_PATH = None\nSPLIT_PATH = None\nSUMMARY_PATH = None\nMODEL_META_PATH = None\n\n\ndef configure_checkpoint_paths(dataset_uid: str):\n    global DRIVE_ROOT, STATE_PATH, METRICS_PATH, MODEL_PATH, SPLIT_PATH, SUMMARY_PATH, MODEL_META_PATH\n    dataset_id = dataset_uid.split(':', 1)[1]\n    DRIVE_ROOT = RUNS_ROOT / f'openml_{dataset_id}_w1w2w7w8w9_alpha'\n    DRIVE_ROOT.mkdir(parents=True, exist_ok=True)\n\n    STATE_PATH = DRIVE_ROOT / f'state_openml_{dataset_id}.json'\n    METRICS_PATH = DRIVE_ROOT / f'metrics_openml_{dataset_id}.csv'\n    MODEL_PATH = DRIVE_ROOT / f'model_openml_{dataset_id}_latest.json'\n    SPLIT_PATH = DRIVE_ROOT / f'data_split_openml_{dataset_id}.npz'\n    SUMMARY_PATH = DRIVE_ROOT / f'summary_openml_{dataset_id}.json'\n    MODEL_META_PATH = DRIVE_ROOT / f'model_meta_openml_{dataset_id}.json'\n\n\ndef _extract_weight_shape(layer):\n    candidates = []\n\n    if hasattr(layer, 'weight'):\n        candidates.append(layer.weight)\n\n    if hasattr(layer, 'modules'):\n        for sublayer in layer.modules():\n            if sublayer is layer:\n                continue\n            if hasattr(sublayer, 'weight'):\n                candidates.append(sublayer.weight)\n\n    for w in candidates:\n        if hasattr(w, 'detach'):\n            shape = tuple(w.detach().cpu().shape)\n        else:\n            shape = tuple(getattr(w, 'shape', ()))\n        if len(shape) == 2:\n            return shape\n\n    return None\n\n\ndef layer_min_matrix_dim(layer):\n    shape = _extract_weight_shape(layer)\n    if shape is None:\n        return 0\n    return int(min(shape))\n\n\ndef ww_stats_for_matrix(layer, matrix_name):\n    watcher = ww.WeightWatcher(model=layer)\n    details = watcher.analyze(randomize=True, detX=True, ERG=True, plot=False)\n    if 'alpha' not in details.columns:\n        raise RuntimeError(f\"WeightWatcher output missing alpha for {matrix_name}: columns={list(details.columns)}\")\n\n    row = details.iloc[0]\n    return {\n        'alpha': float(row['alpha']),\n        'D': float(row['D']) if 'D' in details.columns and pd.notna(row.get('D')) else np.nan,\n        'num_traps': float(row['num_traps']) if 'num_traps' in details.columns and pd.notna(row.get('num_traps')) else np.nan,\n        'ERG_gap': float(row['ERG_gap']) if 'ERG_gap' in details.columns and pd.notna(row.get('ERG_gap')) else np.nan,\n    }\n\n\ndef convert_matrix_layer(model, Xtr, ytr, matrix_name, train_params=None, num_boost_round=None):\n    return convert(\n        model,\n        Xtr,\n        ytr,\n        W=matrix_name,\n        return_type='torch',\n        train_params=train_params,\n        num_boost_round=num_boost_round,\n    )\n\n\ndef is_small_matrix_error(err):\n    msg = str(err)\n    return ('minimum of 2 is required by TruncatedSVD' in msg) or ('Found array with 1 feature(s)' in msg)\n\n\ndef evaluate_model(booster, Xtr, ytr, Xte, yte, task_type, n_classes):\n    dtr = xgb.DMatrix(Xtr, label=ytr)\n    dte = xgb.DMatrix(Xte, label=yte)\n\n    if task_type == 'classification' and int(n_classes) > 2:\n        p_tr = booster.predict(dtr).astype(np.float32)\n        p_te = booster.predict(dte).astype(np.float32)\n\n        yhat_tr = np.argmax(p_tr, axis=1)\n        yhat_te = np.argmax(p_te, axis=1)\n        tr_acc = float(accuracy_score(ytr, yhat_tr))\n        te_acc = float(accuracy_score(yte, yhat_te))\n        te_loss = float(log_loss(yte, p_te, labels=list(range(int(n_classes)))))\n        return tr_acc, te_acc, te_loss\n\n    m_tr = booster.predict(dtr, output_margin=True).astype(np.float32)\n    p_tr = 1.0 / (1.0 + np.exp(-m_tr))\n    tr_acc = float(accuracy_score(ytr, (p_tr >= 0.5).astype(int)))\n\n    m_te = booster.predict(dte, output_margin=True).astype(np.float32)\n    p_te = 1.0 / (1.0 + np.exp(-m_te))\n    te_acc = float(accuracy_score(yte, (p_te >= 0.5).astype(int)))\n    te_loss = float(log_loss(yte, np.vstack([1 - p_te, p_te]).T, labels=[0, 1]))\n    return tr_acc, te_acc, te_loss\n\n\ndef save_checkpoint(state, metrics_df, booster):\n    STATE_PATH.write_text(json.dumps(state, indent=2))\n    metrics_df.to_csv(METRICS_PATH, index=False)\n    booster.save_model(str(MODEL_PATH))\n",
+   "source": [
+    "import gc\n",
+    "import time\n",
+    "import json\n",
+    "import warnings\n",
+    "from datetime import datetime\n",
+    "from pathlib import Path\n",
+    "warnings.filterwarnings('ignore')\n",
+    "\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import xgboost as xgb\n",
+    "import torch\n",
+    "import weightwatcher as ww\n",
+    "\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.metrics import accuracy_score, log_loss\n",
+    "\n",
+    "from xgbwwdata import Filters, load_dataset\n",
+    "from xgboost2ww import convert\n",
+    "\n",
+    "DRIVE_ROOT = None\n",
+    "STATE_PATH = None\n",
+    "METRICS_PATH = None\n",
+    "MODEL_PATH = None\n",
+    "SPLIT_PATH = None\n",
+    "SUMMARY_PATH = None\n",
+    "MODEL_META_PATH = None\n",
+    "\n",
+    "\n",
+    "def configure_checkpoint_paths(dataset_uid: str):\n",
+    "    global DRIVE_ROOT, STATE_PATH, METRICS_PATH, MODEL_PATH, SPLIT_PATH, SUMMARY_PATH, MODEL_META_PATH\n",
+    "    dataset_id = dataset_uid.split(':', 1)[1]\n",
+    "    DRIVE_ROOT = RUNS_ROOT / f'openml_{dataset_id}_w1w2w7w8w9'\n",
+    "    DRIVE_ROOT.mkdir(parents=True, exist_ok=True)\n",
+    "\n",
+    "    STATE_PATH = DRIVE_ROOT / 'state.json'\n",
+    "    METRICS_PATH = DRIVE_ROOT / 'metrics.csv'\n",
+    "    MODEL_PATH = DRIVE_ROOT / 'model_latest.json'\n",
+    "    SPLIT_PATH = DRIVE_ROOT / 'data_split.npz'\n",
+    "    SUMMARY_PATH = DRIVE_ROOT / 'summary.json'\n",
+    "    MODEL_META_PATH = DRIVE_ROOT / 'model_meta.json'\n",
+    "\n",
+    "\n",
+    "def _extract_weight_shape(layer):\n",
+    "    candidates = []\n",
+    "\n",
+    "    if hasattr(layer, 'weight'):\n",
+    "        candidates.append(layer.weight)\n",
+    "\n",
+    "    if hasattr(layer, 'modules'):\n",
+    "        for sublayer in layer.modules():\n",
+    "            if sublayer is layer:\n",
+    "                continue\n",
+    "            if hasattr(sublayer, 'weight'):\n",
+    "                candidates.append(sublayer.weight)\n",
+    "\n",
+    "    for w in candidates:\n",
+    "        if hasattr(w, 'detach'):\n",
+    "            shape = tuple(w.detach().cpu().shape)\n",
+    "        else:\n",
+    "            shape = tuple(getattr(w, 'shape', ()))\n",
+    "        if len(shape) == 2:\n",
+    "            return shape\n",
+    "\n",
+    "    return None\n",
+    "\n",
+    "\n",
+    "def layer_min_matrix_dim(layer):\n",
+    "    shape = _extract_weight_shape(layer)\n",
+    "    if shape is None:\n",
+    "        return 0\n",
+    "    return int(min(shape))\n",
+    "\n",
+    "\n",
+    "def ww_stats_for_matrix(layer, matrix_name):\n",
+    "    watcher = ww.WeightWatcher(model=layer)\n",
+    "    details = watcher.analyze(randomize=True, detX=True, ERG=True, plot=False)\n",
+    "    if 'alpha' not in details.columns:\n",
+    "        raise RuntimeError(f\"WeightWatcher output missing alpha for {matrix_name}: columns={list(details.columns)}\")\n",
+    "\n",
+    "    row = details.iloc[0]\n",
+    "    return {\n",
+    "        'alpha': float(row['alpha']),\n",
+    "        'D': float(row['D']) if 'D' in details.columns and pd.notna(row.get('D')) else np.nan,\n",
+    "        'num_traps': float(row['num_traps']) if 'num_traps' in details.columns and pd.notna(row.get('num_traps')) else np.nan,\n",
+    "        'ERG_gap': float(row['ERG_gap']) if 'ERG_gap' in details.columns and pd.notna(row.get('ERG_gap')) else np.nan,\n",
+    "    }\n",
+    "\n",
+    "\n",
+    "def convert_matrix_layer(model, Xtr, ytr, matrix_name, train_params=None, num_boost_round=None):\n",
+    "    return convert(\n",
+    "        model,\n",
+    "        Xtr,\n",
+    "        ytr,\n",
+    "        W=matrix_name,\n",
+    "        return_type='torch',\n",
+    "        train_params=train_params,\n",
+    "        num_boost_round=num_boost_round,\n",
+    "    )\n",
+    "\n",
+    "\n",
+    "def is_small_matrix_error(err):\n",
+    "    msg = str(err)\n",
+    "    return ('minimum of 2 is required by TruncatedSVD' in msg) or ('Found array with 1 feature(s)' in msg)\n",
+    "\n",
+    "\n",
+    "def evaluate_model(booster, Xtr, ytr, Xte, yte, task_type, n_classes):\n",
+    "    dtr = xgb.DMatrix(Xtr, label=ytr)\n",
+    "    dte = xgb.DMatrix(Xte, label=yte)\n",
+    "\n",
+    "    if task_type == 'classification' and int(n_classes) > 2:\n",
+    "        p_tr = booster.predict(dtr).astype(np.float32)\n",
+    "        p_te = booster.predict(dte).astype(np.float32)\n",
+    "\n",
+    "        yhat_tr = np.argmax(p_tr, axis=1)\n",
+    "        yhat_te = np.argmax(p_te, axis=1)\n",
+    "        tr_acc = float(accuracy_score(ytr, yhat_tr))\n",
+    "        te_acc = float(accuracy_score(yte, yhat_te))\n",
+    "        te_loss = float(log_loss(yte, p_te, labels=list(range(int(n_classes)))))\n",
+    "        return tr_acc, te_acc, te_loss\n",
+    "\n",
+    "    m_tr = booster.predict(dtr, output_margin=True).astype(np.float32)\n",
+    "    p_tr = 1.0 / (1.0 + np.exp(-m_tr))\n",
+    "    tr_acc = float(accuracy_score(ytr, (p_tr >= 0.5).astype(int)))\n",
+    "\n",
+    "    m_te = booster.predict(dte, output_margin=True).astype(np.float32)\n",
+    "    p_te = 1.0 / (1.0 + np.exp(-m_te))\n",
+    "    te_acc = float(accuracy_score(yte, (p_te >= 0.5).astype(int)))\n",
+    "    te_loss = float(log_loss(yte, np.vstack([1 - p_te, p_te]).T, labels=[0, 1]))\n",
+    "    return tr_acc, te_acc, te_loss\n",
+    "\n",
+    "\n",
+    "def save_checkpoint(state, metrics_df, booster):\n",
+    "    STATE_PATH.write_text(json.dumps(state, indent=2))\n",
+    "    metrics_df.to_csv(METRICS_PATH, index=False)\n",
+    "    booster.save_model(str(MODEL_PATH))\n"
+   ],
    "id": "ucDV8sskcLj5"
   },
   {