166 changes: 163 additions & 3 deletions XGBWW_OpenML_W1W2W7W8W9_for_openml_id.ipynb
@@ -8,7 +8,6 @@
},
"source": "<a href=\"https://colab.research.google.com/github/CalculatedContent/xgbwwdata/blob/main/XGBWW_OpenML_W1W2W7W8W9_for_openml_id.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
},

{
"cell_type": "markdown",
"metadata": {
@@ -48,7 +47,31 @@
]
}
],
"source": "from pathlib import Path\nfrom datetime import datetime\nimport os\n\nfrom google.colab import drive\n\ndrive.mount('/content/drive', force_remount=False)\n\nRUN_NAME_BASE = \"openml_w1w2w7w8w9_for_openml_id\"\nRUNS_ROOT = Path('/content/drive/MyDrive/xgbww_runs') / RUN_NAME_BASE\nRUNS_ROOT.mkdir(parents=True, exist_ok=True)\n\nCHECKPOINT_EVERY_ROUNDS = 1 # evaluate/save every N rounds\nMAX_ROUNDS = 10000 # safety cap\nTARGET_ALPHA = 1.5 # stop when all tracked W-matrix alphas <= this value\nTEST_SIZE = 0.2\nRANDOM_STATE = 42\nFORCE_FRESH_START = True # True = ignore prior checkpoints and start over for selected dataset\nRESTART_RUNTIME_AFTER_INSTALL = False\nREUSE_LAST_MODEL = False # True = resume last selected OpenML dataset\nOPENML_MODEL_ID_INPUT = \"RANDOM\" # Set to RANDOM or an OpenML dataset/model id (e.g., 1049)\n\nprint('Runs root:', RUNS_ROOT)\nprint('Started at:', datetime.utcnow().isoformat() + 'Z')\n",
"source": [
"from pathlib import Path\n",
"from datetime import datetime\n",
"import os\n",
"\n",
"from google.colab import drive\n",
"\n",
"drive.mount('/content/drive', force_remount=False)\n",
"\n",
"RUNS_ROOT = Path('/content/drive/MyDrive/xgbww_runs')\n",
"RUNS_ROOT.mkdir(parents=True, exist_ok=True)\n",
"\n",
"CHECKPOINT_EVERY_ROUNDS = 1 # evaluate/save every N rounds\n",
"MAX_ROUNDS = 10000 # safety cap\n",
"TARGET_ALPHA = 1.5 # stop when all tracked W-matrix alphas <= this value\n",
"TEST_SIZE = 0.2\n",
"RANDOM_STATE = 42\n",
"FORCE_FRESH_START = True # True = ignore prior checkpoints and start over for selected dataset\n",
"RESTART_RUNTIME_AFTER_INSTALL = False\n",
"REUSE_LAST_MODEL = False # True = resume last selected OpenML dataset\n",
"OPENML_MODEL_ID_INPUT = \"RANDOM\" # Set to RANDOM or an OpenML dataset/model id (e.g., 1049)\n",
"\n",
"print('Runs root:', RUNS_ROOT)\n",
"print('Started at:', datetime.utcnow().isoformat() + 'Z')\n"
],
"id": "ujweiMJQcLj4"
},
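The configuration flags in the cell above are consumed by the training loop defined later in the notebook, outside this diff. A minimal sketch of the intended checkpoint-and-stop behavior, assuming the loop adds one boosting round at a time; `train_one_round`, `tracked_alphas`, and `checkpoint` are hypothetical placeholders, not functions from this PR:

```python
# Hypothetical sketch only: shows how CHECKPOINT_EVERY_ROUNDS, MAX_ROUNDS,
# and TARGET_ALPHA are meant to interact. train_one_round(), tracked_alphas(),
# and checkpoint() are illustrative placeholders, not code from this diff.
for round_idx in range(1, MAX_ROUNDS + 1):
    train_one_round()                               # one xgboost boosting round
    if round_idx % CHECKPOINT_EVERY_ROUNDS == 0:
        alphas = tracked_alphas()                   # WeightWatcher alphas for W1, W2, W7, W8, W9
        checkpoint(round_idx, alphas)               # persisted under RUNS_ROOT on Drive
        if all(a <= TARGET_ALPHA for a in alphas):  # stop once every tracked alpha is <= 1.5
            break
```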
{
@@ -193,7 +216,144 @@
},
"execution_count": null,
"outputs": [],
"source": "import gc\nimport time\nimport json\nimport warnings\nfrom datetime import datetime\nfrom pathlib import Path\nwarnings.filterwarnings('ignore')\n\nimport numpy as np\nimport pandas as pd\nimport xgboost as xgb\nimport torch\nimport weightwatcher as ww\n\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import accuracy_score, log_loss\n\nfrom xgbwwdata import Filters, load_dataset\nfrom xgboost2ww import convert\n\nDRIVE_ROOT = None\nSTATE_PATH = None\nMETRICS_PATH = None\nMODEL_PATH = None\nSPLIT_PATH = None\nSUMMARY_PATH = None\nMODEL_META_PATH = None\n\n\ndef configure_checkpoint_paths(dataset_uid: str):\n global DRIVE_ROOT, STATE_PATH, METRICS_PATH, MODEL_PATH, SPLIT_PATH, SUMMARY_PATH, MODEL_META_PATH\n dataset_id = dataset_uid.split(':', 1)[1]\n DRIVE_ROOT = RUNS_ROOT / f'openml_{dataset_id}_w1w2w7w8w9_alpha'\n DRIVE_ROOT.mkdir(parents=True, exist_ok=True)\n\n STATE_PATH = DRIVE_ROOT / f'state_openml_{dataset_id}.json'\n METRICS_PATH = DRIVE_ROOT / f'metrics_openml_{dataset_id}.csv'\n MODEL_PATH = DRIVE_ROOT / f'model_openml_{dataset_id}_latest.json'\n SPLIT_PATH = DRIVE_ROOT / f'data_split_openml_{dataset_id}.npz'\n SUMMARY_PATH = DRIVE_ROOT / f'summary_openml_{dataset_id}.json'\n MODEL_META_PATH = DRIVE_ROOT / f'model_meta_openml_{dataset_id}.json'\n\n\ndef _extract_weight_shape(layer):\n candidates = []\n\n if hasattr(layer, 'weight'):\n candidates.append(layer.weight)\n\n if hasattr(layer, 'modules'):\n for sublayer in layer.modules():\n if sublayer is layer:\n continue\n if hasattr(sublayer, 'weight'):\n candidates.append(sublayer.weight)\n\n for w in candidates:\n if hasattr(w, 'detach'):\n shape = tuple(w.detach().cpu().shape)\n else:\n shape = tuple(getattr(w, 'shape', ()))\n if len(shape) == 2:\n return shape\n\n return None\n\n\ndef layer_min_matrix_dim(layer):\n shape = _extract_weight_shape(layer)\n if shape is None:\n return 0\n return int(min(shape))\n\n\ndef ww_stats_for_matrix(layer, matrix_name):\n watcher = ww.WeightWatcher(model=layer)\n details = watcher.analyze(randomize=True, detX=True, ERG=True, plot=False)\n if 'alpha' not in details.columns:\n raise RuntimeError(f\"WeightWatcher output missing alpha for {matrix_name}: columns={list(details.columns)}\")\n\n row = details.iloc[0]\n return {\n 'alpha': float(row['alpha']),\n 'D': float(row['D']) if 'D' in details.columns and pd.notna(row.get('D')) else np.nan,\n 'num_traps': float(row['num_traps']) if 'num_traps' in details.columns and pd.notna(row.get('num_traps')) else np.nan,\n 'ERG_gap': float(row['ERG_gap']) if 'ERG_gap' in details.columns and pd.notna(row.get('ERG_gap')) else np.nan,\n }\n\n\ndef convert_matrix_layer(model, Xtr, ytr, matrix_name, train_params=None, num_boost_round=None):\n return convert(\n model,\n Xtr,\n ytr,\n W=matrix_name,\n return_type='torch',\n train_params=train_params,\n num_boost_round=num_boost_round,\n )\n\n\ndef is_small_matrix_error(err):\n msg = str(err)\n return ('minimum of 2 is required by TruncatedSVD' in msg) or ('Found array with 1 feature(s)' in msg)\n\n\ndef evaluate_model(booster, Xtr, ytr, Xte, yte, task_type, n_classes):\n dtr = xgb.DMatrix(Xtr, label=ytr)\n dte = xgb.DMatrix(Xte, label=yte)\n\n if task_type == 'classification' and int(n_classes) > 2:\n p_tr = booster.predict(dtr).astype(np.float32)\n p_te = booster.predict(dte).astype(np.float32)\n\n yhat_tr = np.argmax(p_tr, axis=1)\n yhat_te = np.argmax(p_te, axis=1)\n tr_acc = float(accuracy_score(ytr, yhat_tr))\n te_acc = float(accuracy_score(yte, yhat_te))\n te_loss = 
float(log_loss(yte, p_te, labels=list(range(int(n_classes)))))\n return tr_acc, te_acc, te_loss\n\n m_tr = booster.predict(dtr, output_margin=True).astype(np.float32)\n p_tr = 1.0 / (1.0 + np.exp(-m_tr))\n tr_acc = float(accuracy_score(ytr, (p_tr >= 0.5).astype(int)))\n\n m_te = booster.predict(dte, output_margin=True).astype(np.float32)\n p_te = 1.0 / (1.0 + np.exp(-m_te))\n te_acc = float(accuracy_score(yte, (p_te >= 0.5).astype(int)))\n te_loss = float(log_loss(yte, np.vstack([1 - p_te, p_te]).T, labels=[0, 1]))\n return tr_acc, te_acc, te_loss\n\n\ndef save_checkpoint(state, metrics_df, booster):\n STATE_PATH.write_text(json.dumps(state, indent=2))\n metrics_df.to_csv(METRICS_PATH, index=False)\n booster.save_model(str(MODEL_PATH))\n",
"source": [
"import gc\n",
"import time\n",
"import json\n",
"import warnings\n",
"from datetime import datetime\n",
"from pathlib import Path\n",
"warnings.filterwarnings('ignore')\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import xgboost as xgb\n",
"import torch\n",
"import weightwatcher as ww\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import accuracy_score, log_loss\n",
"\n",
"from xgbwwdata import Filters, load_dataset\n",
"from xgboost2ww import convert\n",
"\n",
"DRIVE_ROOT = None\n",
"STATE_PATH = None\n",
"METRICS_PATH = None\n",
"MODEL_PATH = None\n",
"SPLIT_PATH = None\n",
"SUMMARY_PATH = None\n",
"MODEL_META_PATH = None\n",
"\n",
"\n",
"def configure_checkpoint_paths(dataset_uid: str):\n",
" global DRIVE_ROOT, STATE_PATH, METRICS_PATH, MODEL_PATH, SPLIT_PATH, SUMMARY_PATH, MODEL_META_PATH\n",
" dataset_id = dataset_uid.split(':', 1)[1]\n",
" DRIVE_ROOT = RUNS_ROOT / f'openml_{dataset_id}_w1w2w7w8w9'\n",
" DRIVE_ROOT.mkdir(parents=True, exist_ok=True)\n",
"\n",
" STATE_PATH = DRIVE_ROOT / 'state.json'\n",
" METRICS_PATH = DRIVE_ROOT / 'metrics.csv'\n",
" MODEL_PATH = DRIVE_ROOT / 'model_latest.json'\n",
" SPLIT_PATH = DRIVE_ROOT / 'data_split.npz'\n",
" SUMMARY_PATH = DRIVE_ROOT / 'summary.json'\n",
" MODEL_META_PATH = DRIVE_ROOT / 'model_meta.json'\n",
"\n",
"\n",
"def _extract_weight_shape(layer):\n",
" candidates = []\n",
"\n",
" if hasattr(layer, 'weight'):\n",
" candidates.append(layer.weight)\n",
"\n",
" if hasattr(layer, 'modules'):\n",
" for sublayer in layer.modules():\n",
" if sublayer is layer:\n",
" continue\n",
" if hasattr(sublayer, 'weight'):\n",
" candidates.append(sublayer.weight)\n",
"\n",
" for w in candidates:\n",
" if hasattr(w, 'detach'):\n",
" shape = tuple(w.detach().cpu().shape)\n",
" else:\n",
" shape = tuple(getattr(w, 'shape', ()))\n",
" if len(shape) == 2:\n",
" return shape\n",
"\n",
" return None\n",
"\n",
"\n",
"def layer_min_matrix_dim(layer):\n",
" shape = _extract_weight_shape(layer)\n",
" if shape is None:\n",
" return 0\n",
" return int(min(shape))\n",
"\n",
"\n",
"def ww_stats_for_matrix(layer, matrix_name):\n",
" watcher = ww.WeightWatcher(model=layer)\n",
" details = watcher.analyze(randomize=True, detX=True, ERG=True, plot=False)\n",
" if 'alpha' not in details.columns:\n",
" raise RuntimeError(f\"WeightWatcher output missing alpha for {matrix_name}: columns={list(details.columns)}\")\n",
"\n",
" row = details.iloc[0]\n",
" return {\n",
" 'alpha': float(row['alpha']),\n",
" 'D': float(row['D']) if 'D' in details.columns and pd.notna(row.get('D')) else np.nan,\n",
" 'num_traps': float(row['num_traps']) if 'num_traps' in details.columns and pd.notna(row.get('num_traps')) else np.nan,\n",
" 'ERG_gap': float(row['ERG_gap']) if 'ERG_gap' in details.columns and pd.notna(row.get('ERG_gap')) else np.nan,\n",
" }\n",
"\n",
"\n",
"def convert_matrix_layer(model, Xtr, ytr, matrix_name, train_params=None, num_boost_round=None):\n",
" return convert(\n",
" model,\n",
" Xtr,\n",
" ytr,\n",
" W=matrix_name,\n",
" return_type='torch',\n",
" train_params=train_params,\n",
" num_boost_round=num_boost_round,\n",
" )\n",
"\n",
"\n",
"def is_small_matrix_error(err):\n",
" msg = str(err)\n",
" return ('minimum of 2 is required by TruncatedSVD' in msg) or ('Found array with 1 feature(s)' in msg)\n",
"\n",
"\n",
"def evaluate_model(booster, Xtr, ytr, Xte, yte, task_type, n_classes):\n",
" dtr = xgb.DMatrix(Xtr, label=ytr)\n",
" dte = xgb.DMatrix(Xte, label=yte)\n",
"\n",
" if task_type == 'classification' and int(n_classes) > 2:\n",
" p_tr = booster.predict(dtr).astype(np.float32)\n",
" p_te = booster.predict(dte).astype(np.float32)\n",
"\n",
" yhat_tr = np.argmax(p_tr, axis=1)\n",
" yhat_te = np.argmax(p_te, axis=1)\n",
" tr_acc = float(accuracy_score(ytr, yhat_tr))\n",
" te_acc = float(accuracy_score(yte, yhat_te))\n",
" te_loss = float(log_loss(yte, p_te, labels=list(range(int(n_classes)))))\n",
" return tr_acc, te_acc, te_loss\n",
"\n",
" m_tr = booster.predict(dtr, output_margin=True).astype(np.float32)\n",
" p_tr = 1.0 / (1.0 + np.exp(-m_tr))\n",
" tr_acc = float(accuracy_score(ytr, (p_tr >= 0.5).astype(int)))\n",
"\n",
" m_te = booster.predict(dte, output_margin=True).astype(np.float32)\n",
" p_te = 1.0 / (1.0 + np.exp(-m_te))\n",
" te_acc = float(accuracy_score(yte, (p_te >= 0.5).astype(int)))\n",
" te_loss = float(log_loss(yte, np.vstack([1 - p_te, p_te]).T, labels=[0, 1]))\n",
" return tr_acc, te_acc, te_loss\n",
"\n",
"\n",
"def save_checkpoint(state, metrics_df, booster):\n",
" STATE_PATH.write_text(json.dumps(state, indent=2))\n",
" metrics_df.to_csv(METRICS_PATH, index=False)\n",
" booster.save_model(str(MODEL_PATH))\n"
],
"id": "ucDV8sskcLj5"
},
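A usage sketch for the helpers defined in this cell, assuming a trained binary booster and a train/test split already exist; `booster`, `Xtr`, `ytr`, `Xte`, and `yte` are placeholders, and `'openml:1049'` is only an example uid:

```python
# Hypothetical single round of analysis using the helpers above; the
# variables booster, Xtr, ytr, Xte, yte are assumed to exist already.
configure_checkpoint_paths('openml:1049')              # derives Drive paths from the dataset uid

layer = convert_matrix_layer(booster, Xtr, ytr, 'W1')  # torch layer for the W1 matrix
if layer_min_matrix_dim(layer) >= 2:                   # skip matrices too small to analyze
    stats = ww_stats_for_matrix(layer, 'W1')           # {'alpha': ..., 'D': ..., 'num_traps': ..., 'ERG_gap': ...}
    print('W1 alpha:', stats['alpha'])

tr_acc, te_acc, te_loss = evaluate_model(
    booster, Xtr, ytr, Xte, yte, task_type='classification', n_classes=2)
```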
{