From c6d599e39b60dc9b991bd7757123e6cbe377e75b Mon Sep 17 00:00:00 2001
From: xuanchen-liu-97 <xuanchen.liu.97@gmail.com>
Date: Tue, 18 Nov 2025 12:32:11 +0000
Subject: [PATCH 01/14] Cleaning chords data

1. Split each chord string into segments using the structural labels (e.g. <verse_1>).
2. Infer the key of each segment.
3. Convert the chords to Roman numerals relative to the inferred key.
---
 TDL-chords-data-cleaning.ipynb | 902 +++++++++++++++++++++++++++++++++
 1 file changed, 902 insertions(+)
 create mode 100644 TDL-chords-data-cleaning.ipynb
diff --git a/TDL-chords-data-cleaning.ipynb b/TDL-chords-data-cleaning.ipynb
new file mode 100644
index 00000000..43b7fc87
--- /dev/null
+++ b/TDL-chords-data-cleaning.ipynb
@@ -0,0 +1,902 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "cc404196-bd66-43b2-8f3b-9602f6ccf58f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import re\n",
+    "from collections import defaultdict"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "aeaa2ed4-6d9e-4cd7-90f3-402cd3e92440",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## Run this code when using it for the first time for loading the dataset\n",
+    "# pip install huggingface_hub"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "94d6aced-7399-4917-bd60-a3997d5831af",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\liu.xuanc\\AppData\\Local\\Temp\\ipykernel_17284\\2761146993.py:1: DtypeWarning: Columns (2,3,5,6,7,8,9) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+      "  df = pd.read_csv(\"hf://datasets/ailsntua/Chordonomicon/chordonomicon_v2.csv\")\n"
+     ]
+    }
+   ],
+   "source": [
+    "df = pd.read_csv(\"hf://datasets/ailsntua/Chordonomicon/chordonomicon_v2.csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "c37fa55f-b1b2-42f1-b827-dfe8dc5fb04a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# def clean_chords(chord_string):\n",
+    "#     \"\"\"\n",
+    "#     Clean chord strings, remove structural labels (such as<intro_1>,<verse_1>, etc.),，\n",
+    "    \n",
+    "#     Parameters:\n",
+    "#         Chord_string: A string containing labels and chords\n",
+    "        \n",
+    "#     return:\n",
+    "#         Chords progression: List\n",
+    "#     \"\"\"\n",
+    "#     if pd.isna(chord_string) or chord_string == '':\n",
+    "#         return []\n",
+    "    \n",
+    "#     # delete <...> lable\n",
+    "#     cleaned = re.sub(r'<[^>]+>', '', chord_string)\n",
+    "    \n",
+    "#     # delete other string\n",
+    "#     chords = [chord.strip() for chord in cleaned.split() if chord.strip()]\n",
+    "    \n",
+    "#     return chords"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "id": "13de4c2d-7941-4f47-bb9a-4c5e2774eeb2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "NOTES = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']\n",
+    "\n",
+    "# A simple enharmonic conversion dictionary (flats -> sharps, for standardized input)\n",
+    "FLAT_TO_SHARP = {\n",
+    "    'Db': 'C#', 'Eb': 'D#', 'Gb': 'F#', 'Ab': 'G#', 'Bb': 'A#',\n",
+    "    'db': 'C#', 'eb': 'D#', 'gb': 'F#', 'ab': 'G#', 'bb': 'A#'\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "a674a0eb-9cc9-46fd-aa74-6d9ce1575e84",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ==========================================\n",
+    "# Global configuration and auxiliary functions\n",
+    "# ==========================================\n",
+    "\n",
+    "def parse_chord_root_quality(chord_str):\n",
+    "    \"\"\"\n",
+    "    Auxiliary function: parses a single chord string and returns (Root, Quality)\n",
+    "    For example: \"Am\" ->(\"A\", \"min\"), \"G7\" ->(\"G\", \"7\")\n",
+    "    \"\"\"\n",
+    "    chord_str = chord_str.strip()\n",
+    "    if not chord_str: return None, None\n",
+    "    \n",
+    "    for flat, sharp in FLAT_TO_SHARP.items():\n",
+    "        if chord_str.startswith(flat):\n",
+    "            chord_str = sharp + chord_str[len(flat):]\n",
+    "            break\n",
+    "    if len(chord_str) > 1 and chord_str[1] == 's': # 's' encodes a sharp, e.g. Fs7 -> F#7. NOTE(review): this also turns 'Csus4' into 'C#us4'\n",
+    "         chord_str = chord_str[0] + '#' + chord_str[2:]\n",
+    "         \n",
+    "    match = re.match(r'([A-G]#?)(.*)', chord_str)\n",
+    "    if not match: return None, None\n",
+    "    \n",
+    "    root = match.group(1)\n",
+    "    rest = match.group(2).lower()\n",
+    "    \n",
+    "    # Chord quality\n",
+    "    if 'dim' in rest: quality = 'dim'\n",
+    "    elif 'min' in rest or 'm' == rest: quality = 'min'\n",
+    "    elif '7' in rest and 'maj' not in rest and 'min' not in rest: quality = '7'\n",
+    "    else: quality = 'maj' # Default to major triad/major seventh chord\n",
+    "    \n",
+    "    return root, quality"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "4cc94eec-4f9d-43dd-ac98-5a852ad494c7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ==========================================\n",
+    "# Segmentation\n",
+    "# ==========================================\n",
+    "\n",
+    "def segment_music_string(raw_string):\n",
+    "    \"\"\"\n",
+    "    Input: '<intro_1> C D <verse_1> F G'\n",
+    "    Output: [{'label': 'intro_1', 'chords': ['C', 'D']}, {'label': 'verse_1', 'chords': ['F', 'G']}]\n",
+    "    \"\"\"\n",
+    "    parts = re.split(r'(<[^>]+>)', raw_string)\n",
+    "    \n",
+    "    segments = []\n",
+    "    current_label = \"Unknown\"\n",
+    "    \n",
+    "    for part in parts:\n",
+    "        part = part.strip()\n",
+    "        if not part: continue\n",
+    "        \n",
+    "        # Label (<xxx>)\n",
+    "        if part.startswith('<') and part.endswith('>'):\n",
+    "            current_label = part.strip('<>')\n",
+    "        else:\n",
+    "            # Content (chords)\n",
+    "            chord_list = part.split()\n",
+    "            if chord_list:\n",
+    "                segments.append({\n",
+    "                    'label': current_label,\n",
+    "                    'chords': chord_list\n",
+    "                })\n",
+    "                \n",
+    "    return segments"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "f94b2dca-8469-4ec3-bcf3-b4a7fd0886ce",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ==========================================\n",
+    "# Key Inference\n",
+    "# ==========================================\n",
+    "\n",
+    "KEY_TEMPLATES = {}\n",
+    "def _build_templates():\n",
+    "    for i in range(12):\n",
+    "        root = NOTES[i]\n",
+    "        # Major key template\n",
+    "        s_maj = [NOTES[(i + n) % 12] for n in [0, 2, 4, 5, 7, 9, 11]]\n",
+    "        maj_chords = {s_maj[0]:['maj'], s_maj[1]:['min'], s_maj[2]:['min'], s_maj[3]:['maj'], s_maj[4]:['maj','7'], s_maj[5]:['min'], s_maj[6]:['dim']}\n",
+    "        KEY_TEMPLATES[f\"{root} Major\"] = (maj_chords, s_maj[0], s_maj[4]) # (template, tonic, dominant)\n",
+    "        \n",
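+    "        # Minor key template (natural + harmonic minor). NOTE(review): s_min[4] and dom_root are the same note, so the ['min'] entry below is overwritten by ['maj','7']\n",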
+    "        s_min = [NOTES[(i + n) % 12] for n in [0, 2, 3, 5, 7, 8, 10]]\n",
+    "        dom_root = NOTES[(i + 7) % 12]\n",
+    "        min_chords = {s_min[0]:['min'], s_min[1]:['dim'], s_min[2]:['maj'], s_min[3]:['min'], s_min[4]:['min'], dom_root:['maj','7'], s_min[5]:['maj'], s_min[6]:['maj','7']}\n",
+    "        KEY_TEMPLATES[f\"{root} Minor\"] = (min_chords, s_min[0], dom_root)\n",
+    "_build_templates()\n",
+    "\n",
+    "def infer_key_from_list(chord_list):\n",
+    "    \"\"\"\n",
+    "    input: ['C', 'F', 'G7', 'C']\n",
+    "    output: 'C Major'\n",
+    "    \"\"\"\n",
+    "    if not chord_list: return \"Unknown\"\n",
+    "    \n",
+    "    parsed_data = []\n",
+    "    for idx, c_str in enumerate(chord_list):\n",
+    "        r, q = parse_chord_root_quality(c_str)\n",
+    "        if r:\n",
+    "            next_r = None\n",
+    "            if idx + 1 < len(chord_list):\n",
+    "                next_r, _ = parse_chord_root_quality(chord_list[idx+1])\n",
+    "            parsed_data.append((r, q, next_r))\n",
+    "            \n",
+    "    scores = defaultdict(int)\n",
+    "    \n",
+    "    # calculate the score for each\n",
+    "    for key_name, (template, tonic, dom) in KEY_TEMPLATES.items():\n",
+    "        score = 0\n",
+    "        for root, quality, next_root in parsed_data:\n",
+    "            # Basic match\n",
+    "            if root in template:\n",
+    "                score += 1\n",
+    "                if quality in template[root]:\n",
+    "                    score += 2\n",
+    "            else:\n",
+    "                score -= 1\n",
+    "            \n",
+    "            if root == dom and quality == '7' and next_root == tonic:\n",
+    "                score += 5\n",
+    "                \n",
+    "            if root == tonic:\n",
+    "                score += 1\n",
+    "                \n",
+    "        scores[key_name] = score\n",
+    "        \n",
+    "    if not scores: return \"Unknown\"\n",
+    "    return max(scores, key=scores.get)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "681f297a-1520-4f5f-bf33-7a9af7b5ecf8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def patch_short_segments(analyzed_segments, min_chords=2):\n",
+    "    \"\"\"\n",
+    "    Parameters:\n",
+    "    analyzed_segments: list, {'label':..., 'chords':..., 'key':...}\n",
+    "    min_chords: threshold; a segment with fewer chords than this borrows its key from an adjacent segment whose key is known (default is 2)\n",
+    "    \"\"\"\n",
+    "    \n",
+    "    # Scan twice\n",
+    "    \n",
+    "    # Forward pass\n",
+    "    for i in range(1, len(analyzed_segments)):\n",
+    "        current_seg = analyzed_segments[i]\n",
+    "        prev_seg = analyzed_segments[i-1]\n",
+    "        \n",
+    "        if len(current_seg['chords']) < min_chords:\n",
+    "            if prev_seg['key'] != \"Unknown\":\n",
+    "                current_seg['key'] = prev_seg['key']\n",
+    "                current_seg['key_source'] = 'borrowed_prev' \n",
+    "\n",
+    "    # Backward Pass\n",
+    "    for i in range(len(analyzed_segments) - 2, -1, -1):\n",
+    "        current_seg = analyzed_segments[i]\n",
+    "        next_seg = analyzed_segments[i+1]\n",
+    "        \n",
+    "        if len(current_seg['chords']) < min_chords:\n",
+    "            # (Usually Intro should follow Verse's tone)\n",
+    "            if next_seg['key'] != \"Unknown\":\n",
+    "                 current_seg['key'] = next_seg['key']\n",
+    "                 current_seg['key_source'] = 'borrowed_next'\n",
+    "                 \n",
+    "    return analyzed_segments"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "795661aa-e36a-40b3-8d4e-6bdd5e216566",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ==========================================\n",
+    "# Roman Numeral Conversion\n",
+    "# ==========================================\n",
+    "ROMAN_MAP = {\n",
+    "    0: 'I', 1: 'bII', 2: 'II', 3: 'bIII', 4: 'III', 5: 'IV', \n",
+    "    6: 'bV', 7: 'V', 8: 'bVI', 9: 'VI', 10: 'bVII', 11: 'VII'\n",
+    "}\n",
+    "\n",
+    "def convert_to_roman(chord_list, key_str):\n",
+    "    \"\"\"\n",
+    "    input: chord_list=['C', 'F', 'G'], key_str='C Major'\n",
+    "    output: ['I', 'IV', 'V']\n",
+    "    \"\"\"\n",
+    "    if key_str == \"Unknown\":\n",
+    "        return chord_list\n",
+    "    \n",
+    "    # Extract the tonic of the key\n",
+    "    key_root_str = key_str.split()[0] # \"C Major\" -> \"C\"\n",
+    "    is_minor_key = \"Minor\" in key_str\n",
+    "    \n",
+    "    if key_root_str not in NOTES: return chord_list\n",
+    "    key_root_idx = NOTES.index(key_root_str)\n",
+    "    \n",
+    "    roman_output = []\n",
+    "    \n",
+    "    for chord_str in chord_list:\n",
+    "        root, quality = parse_chord_root_quality(chord_str)\n",
+    "        if not root:\n",
+    "            roman_output.append(\"?\")\n",
+    "            continue\n",
+    "            \n",
+    "        # Calculate Interval\n",
+    "        # (chord root - key tonic) % 12\n",
+    "        root_idx = NOTES.index(root)\n",
+    "        interval = (root_idx - key_root_idx) % 12\n",
+    "        \n",
+    "        base_roman = ROMAN_MAP.get(interval, \"?\")\n",
+    "        \n",
+    "        # Minor triad (min) or diminished triad (dim) -> lowercase\n",
+    "        if quality == 'min' or quality == 'dim':\n",
+    "            final_roman = base_roman.lower()\n",
+    "        else:\n",
+    "            final_roman = base_roman\n",
+    "            \n",
+    "        # Suffix\n",
+    "        if quality == '7':\n",
+    "            final_roman += '7'\n",
+    "        elif quality == 'dim':\n",
+    "            final_roman += '°'\n",
+    "            \n",
+    "        roman_output.append(final_roman)\n",
+    "        \n",
+    "    return roman_output"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9866344c-1635-4210-ab52-273b47da3af1",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "258f2c66-854d-4eef-9289-8025ac837ad4",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'<intro_1> C <verse_1> F C E7 Amin C F C G7 C F C E7 Amin C F G7 C <verse_2> F C E7 Amin C F C G7 C F C E7 Amin C F G7 C <chorus_1> F C F C G C F C E7 Amin C F G7 C <solo_1> D <chorus_2> G D G D A D G D Fs7 Bmin D G A7 D G A7 D'"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df['chords'][0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "be20964f-09f4-4d5c-8376-d2c86b3eb90b",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[{'label': 'intro_1', 'chords': ['C']},\n",
+       " {'label': 'verse_1',\n",
+       "  'chords': ['F',\n",
+       "   'C',\n",
+       "   'E7',\n",
+       "   'Amin',\n",
+       "   'C',\n",
+       "   'F',\n",
+       "   'C',\n",
+       "   'G7',\n",
+       "   'C',\n",
+       "   'F',\n",
+       "   'C',\n",
+       "   'E7',\n",
+       "   'Amin',\n",
+       "   'C',\n",
+       "   'F',\n",
+       "   'G7',\n",
+       "   'C']},\n",
+       " {'label': 'verse_2',\n",
+       "  'chords': ['F',\n",
+       "   'C',\n",
+       "   'E7',\n",
+       "   'Amin',\n",
+       "   'C',\n",
+       "   'F',\n",
+       "   'C',\n",
+       "   'G7',\n",
+       "   'C',\n",
+       "   'F',\n",
+       "   'C',\n",
+       "   'E7',\n",
+       "   'Amin',\n",
+       "   'C',\n",
+       "   'F',\n",
+       "   'G7',\n",
+       "   'C']},\n",
+       " {'label': 'chorus_1',\n",
+       "  'chords': ['F',\n",
+       "   'C',\n",
+       "   'F',\n",
+       "   'C',\n",
+       "   'G',\n",
+       "   'C',\n",
+       "   'F',\n",
+       "   'C',\n",
+       "   'E7',\n",
+       "   'Amin',\n",
+       "   'C',\n",
+       "   'F',\n",
+       "   'G7',\n",
+       "   'C']},\n",
+       " {'label': 'solo_1', 'chords': ['D']},\n",
+       " {'label': 'chorus_2',\n",
+       "  'chords': ['G',\n",
+       "   'D',\n",
+       "   'G',\n",
+       "   'D',\n",
+       "   'A',\n",
+       "   'D',\n",
+       "   'G',\n",
+       "   'D',\n",
+       "   'Fs7',\n",
+       "   'Bmin',\n",
+       "   'D',\n",
+       "   'G',\n",
+       "   'A7',\n",
+       "   'D',\n",
+       "   'G',\n",
+       "   'A7',\n",
+       "   'D']}]"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "segments = segment_music_string(df['chords'][0])\n",
+    "segments"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "df725e56-0714-40d6-93e1-8f61b983e85d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "analyzed_records = []\n",
+    "for seg in segments:\n",
+    "    inferred_key = infer_key_from_list(seg['chords'])\n",
+    "    analyzed_records.append({\n",
+    "        'label': seg['label'],\n",
+    "        'chords': seg['chords'],\n",
+    "        'key': inferred_key\n",
+    "    })"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "578b5a47-b5b3-4b6c-a66e-9dc754b0c686",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[{'label': 'intro_1', 'chords': ['C'], 'key': 'C Major'},\n",
+       " {'label': 'verse_1',\n",
+       "  'chords': ['F',\n",
+       "   'C',\n",
+       "   'E7',\n",
+       "   'Amin',\n",
+       "   'C',\n",
+       "   'F',\n",
+       "   'C',\n",
+       "   'G7',\n",
+       "   'C',\n",
+       "   'F',\n",
+       "   'C',\n",
+       "   'E7',\n",
+       "   'Amin',\n",
+       "   'C',\n",
+       "   'F',\n",
+       "   'G7',\n",
+       "   'C'],\n",
+       "  'key': 'C Major'},\n",
+       " {'label': 'verse_2',\n",
+       "  'chords': ['F',\n",
+       "   'C',\n",
+       "   'E7',\n",
+       "   'Amin',\n",
+       "   'C',\n",
+       "   'F',\n",
+       "   'C',\n",
+       "   'G7',\n",
+       "   'C',\n",
+       "   'F',\n",
+       "   'C',\n",
+       "   'E7',\n",
+       "   'Amin',\n",
+       "   'C',\n",
+       "   'F',\n",
+       "   'G7',\n",
+       "   'C'],\n",
+       "  'key': 'C Major'},\n",
+       " {'label': 'chorus_1',\n",
+       "  'chords': ['F',\n",
+       "   'C',\n",
+       "   'F',\n",
+       "   'C',\n",
+       "   'G',\n",
+       "   'C',\n",
+       "   'F',\n",
+       "   'C',\n",
+       "   'E7',\n",
+       "   'Amin',\n",
+       "   'C',\n",
+       "   'F',\n",
+       "   'G7',\n",
+       "   'C'],\n",
+       "  'key': 'C Major'},\n",
+       " {'label': 'solo_1', 'chords': ['D'], 'key': 'D Major'},\n",
+       " {'label': 'chorus_2',\n",
+       "  'chords': ['G',\n",
+       "   'D',\n",
+       "   'G',\n",
+       "   'D',\n",
+       "   'A',\n",
+       "   'D',\n",
+       "   'G',\n",
+       "   'D',\n",
+       "   'Fs7',\n",
+       "   'Bmin',\n",
+       "   'D',\n",
+       "   'G',\n",
+       "   'A7',\n",
+       "   'D',\n",
+       "   'G',\n",
+       "   'A7',\n",
+       "   'D'],\n",
+       "  'key': 'D Major'}]"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "analyzed_records"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "64760bf8-2949-4eee-b23d-d87b1b52ec4d",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[{'label': 'intro_1',\n",
+       "  'chords': ['C'],\n",
+       "  'key': 'C Major',\n",
+       "  'key_source': 'borrowed_next'},\n",
+       " {'label': 'verse_1',\n",
+       "  'chords': ['F',\n",
+       "   'C',\n",
+       "   'E7',\n",
+       "   'Amin',\n",
+       "   'C',\n",
+       "   'F',\n",
+       "   'C',\n",
+       "   'G7',\n",
+       "   'C',\n",
+       "   'F',\n",
+       "   'C',\n",
+       "   'E7',\n",
+       "   'Amin',\n",
+       "   'C',\n",
+       "   'F',\n",
+       "   'G7',\n",
+       "   'C'],\n",
+       "  'key': 'C Major'},\n",
+       " {'label': 'verse_2',\n",
+       "  'chords': ['F',\n",
+       "   'C',\n",
+       "   'E7',\n",
+       "   'Amin',\n",
+       "   'C',\n",
+       "   'F',\n",
+       "   'C',\n",
+       "   'G7',\n",
+       "   'C',\n",
+       "   'F',\n",
+       "   'C',\n",
+       "   'E7',\n",
+       "   'Amin',\n",
+       "   'C',\n",
+       "   'F',\n",
+       "   'G7',\n",
+       "   'C'],\n",
+       "  'key': 'C Major'},\n",
+       " {'label': 'chorus_1',\n",
+       "  'chords': ['F',\n",
+       "   'C',\n",
+       "   'F',\n",
+       "   'C',\n",
+       "   'G',\n",
+       "   'C',\n",
+       "   'F',\n",
+       "   'C',\n",
+       "   'E7',\n",
+       "   'Amin',\n",
+       "   'C',\n",
+       "   'F',\n",
+       "   'G7',\n",
+       "   'C'],\n",
+       "  'key': 'C Major'},\n",
+       " {'label': 'solo_1',\n",
+       "  'chords': ['D'],\n",
+       "  'key': 'D Major',\n",
+       "  'key_source': 'borrowed_next'},\n",
+       " {'label': 'chorus_2',\n",
+       "  'chords': ['G',\n",
+       "   'D',\n",
+       "   'G',\n",
+       "   'D',\n",
+       "   'A',\n",
+       "   'D',\n",
+       "   'G',\n",
+       "   'D',\n",
+       "   'Fs7',\n",
+       "   'Bmin',\n",
+       "   'D',\n",
+       "   'G',\n",
+       "   'A7',\n",
+       "   'D',\n",
+       "   'G',\n",
+       "   'A7',\n",
+       "   'D'],\n",
+       "  'key': 'D Major'}]"
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "analyzed_records = patch_short_segments(analyzed_records, min_chords=2)\n",
+    "analyzed_records"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "id": "12151628-8b0f-4206-bf37-630e40f87d57",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "chords_output = []\n",
+    "for record in analyzed_records:\n",
+    "    roman = convert_to_roman(record['chords'], record['key'])\n",
+    "\n",
+    "    chords_output.append({\n",
+    "        'section': record['label'],\n",
+    "        'key': record['key'],\n",
+    "        'roman': roman\n",
+    "    })"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "id": "c5413c18-a6a0-4daa-a607-7aa607c43010",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[{'section': 'intro_1', 'key': 'C Major', 'roman': ['I']},\n",
+       " {'section': 'verse_1',\n",
+       "  'key': 'C Major',\n",
+       "  'roman': ['IV',\n",
+       "   'I',\n",
+       "   'III7',\n",
+       "   'vi',\n",
+       "   'I',\n",
+       "   'IV',\n",
+       "   'I',\n",
+       "   'V7',\n",
+       "   'I',\n",
+       "   'IV',\n",
+       "   'I',\n",
+       "   'III7',\n",
+       "   'vi',\n",
+       "   'I',\n",
+       "   'IV',\n",
+       "   'V7',\n",
+       "   'I']},\n",
+       " {'section': 'verse_2',\n",
+       "  'key': 'C Major',\n",
+       "  'roman': ['IV',\n",
+       "   'I',\n",
+       "   'III7',\n",
+       "   'vi',\n",
+       "   'I',\n",
+       "   'IV',\n",
+       "   'I',\n",
+       "   'V7',\n",
+       "   'I',\n",
+       "   'IV',\n",
+       "   'I',\n",
+       "   'III7',\n",
+       "   'vi',\n",
+       "   'I',\n",
+       "   'IV',\n",
+       "   'V7',\n",
+       "   'I']},\n",
+       " {'section': 'chorus_1',\n",
+       "  'key': 'C Major',\n",
+       "  'roman': ['IV',\n",
+       "   'I',\n",
+       "   'IV',\n",
+       "   'I',\n",
+       "   'V',\n",
+       "   'I',\n",
+       "   'IV',\n",
+       "   'I',\n",
+       "   'III7',\n",
+       "   'vi',\n",
+       "   'I',\n",
+       "   'IV',\n",
+       "   'V7',\n",
+       "   'I']},\n",
+       " {'section': 'solo_1', 'key': 'D Major', 'roman': ['I']},\n",
+       " {'section': 'chorus_2',\n",
+       "  'key': 'D Major',\n",
+       "  'roman': ['IV',\n",
+       "   'I',\n",
+       "   'IV',\n",
+       "   'I',\n",
+       "   'V',\n",
+       "   'I',\n",
+       "   'IV',\n",
+       "   'I',\n",
+       "   'III7',\n",
+       "   'vi',\n",
+       "   'I',\n",
+       "   'IV',\n",
+       "   'V7',\n",
+       "   'I',\n",
+       "   'IV',\n",
+       "   'V7',\n",
+       "   'I']}]"
+      ]
+     },
+     "execution_count": 40,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "chords_output"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "id": "96efd397-4835-41db-9692-73a86d371f2d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def extract_and_format_roman(analyzed_records, include_tags=True):\n",
+    "    \"\"\"\n",
+    "    Parameters:\n",
+    "    analyzed_records: chords_output in the former step\n",
+    "    include_tags: \n",
+    "                  - True: Keep \"<intro> I IV <verse> I V\" (Structure + Sequence)\n",
+    "                  - False: Only \"I IV I V\" (Sequence)\n",
+    "    \n",
+    "    Return:\n",
+    "    str: Roman numerals string\n",
+    "    \"\"\"\n",
+    "    output_parts = []\n",
+    "    \n",
+    "    for record in analyzed_records:\n",
+    "        roman_seq = \" \".join(record['roman'])\n",
+    "        \n",
+    "        if include_tags:\n",
+    "            # \"<label> roman_seq\"\n",
+    "            segment_str = f\"<{record['section']}> {roman_seq}\"\n",
+    "            output_parts.append(segment_str)\n",
+    "        else:\n",
+    "            # Roman numerals\n",
+    "            output_parts.append(roman_seq)\n",
+    "            \n",
+    "    return \" \".join(output_parts)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "id": "d52c8fe2-9c51-4379-9705-dc3352e6128d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "final_string_tagged = extract_and_format_roman(chords_output, include_tags=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "id": "cb3754f8-a2e0-4472-9fa3-4025e6c33127",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "I IV I III7 vi I IV I V7 I IV I III7 vi I IV V7 I IV I III7 vi I IV I V7 I IV I III7 vi I IV V7 I IV I IV I V I IV I III7 vi I IV V7 I I IV I IV I V I IV I III7 vi I IV V7 I IV V7 I\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(extract_and_format_roman(chords_output, include_tags=False))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "id": "abb1c572-56d5-4454-a218-140767a3b79f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<intro_1> I <verse_1> IV I III7 vi I IV I V7 I IV I III7 vi I IV V7 I <verse_2> IV I III7 vi I IV I V7 I IV I III7 vi I IV V7 I <chorus_1> IV I IV I V I IV I III7 vi I IV V7 I <solo_1> I <chorus_2> IV I IV I V I IV I III7 vi I IV V7 I IV V7 I\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(extract_and_format_roman(chords_output, include_tags=True))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6d084835-4b26-4921-b12d-6c16a978b7c1",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

From 9a92db5f914d3451243653d61e046cc8ab528d5b Mon Sep 17 00:00:00 2001
From: xuanchen-liu-97 <xuanchen.liu.97@gmail.com>
Date: Tue, 18 Nov 2025 18:17:29 +0000
Subject: [PATCH 02/14] update data cleaning

The notebook now includes a systematic analysis.
It has not yet been applied to every song in the dataframe.
It also includes a dictionary for looking up the mapping between chords and scales (see the last part).
---
 TDL-chords-data-cleaning.ipynb | 969 ++++++++++++++++++++++-----------
 1 file changed, 639 insertions(+), 330 deletions(-)

diff --git a/TDL-chords-data-cleaning.ipynb b/TDL-chords-data-cleaning.ipynb
index 43b7fc87..78a60266 100644
--- a/TDL-chords-data-cleaning.ipynb
+++ b/TDL-chords-data-cleaning.ipynb
@@ -2,11 +2,12 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 1,
    "id": "cc404196-bd66-43b2-8f3b-9602f6ccf58f",
    "metadata": {},
    "outputs": [],
    "source": [
+    "import numpy as np\n",
     "import pandas as pd\n",
     "import re\n",
     "from collections import defaultdict"
@@ -33,7 +34,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "C:\\Users\\liu.xuanc\\AppData\\Local\\Temp\\ipykernel_17284\\2761146993.py:1: DtypeWarning: Columns (2,3,5,6,7,8,9) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+      "C:\\Users\\liu.xuanc\\AppData\\Local\\Temp\\ipykernel_11624\\2761146993.py:1: DtypeWarning: Columns (2,3,5,6,7,8,9) have mixed types. Specify dtype option on import or set low_memory=False.\n",
       "  df = pd.read_csv(\"hf://datasets/ailsntua/Chordonomicon/chordonomicon_v2.csv\")\n"
      ]
     }
@@ -44,36 +45,55 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
-   "id": "c37fa55f-b1b2-42f1-b827-dfe8dc5fb04a",
+   "execution_count": null,
+   "id": "2ea0931b-cc27-4715-9773-ec765838c3e6",
    "metadata": {},
    "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "72ba2952-93f9-4961-a67f-159a27cdd9e2",
+   "metadata": {},
    "source": [
-    "# def clean_chords(chord_string):\n",
-    "#     \"\"\"\n",
-    "#     Clean chord strings, remove structural labels (such as<intro_1>,<verse_1>, etc.),，\n",
-    "    \n",
-    "#     Parameters:\n",
-    "#         Chord_string: A string containing labels and chords\n",
-    "        \n",
-    "#     return:\n",
-    "#         Chords progression: List\n",
-    "#     \"\"\"\n",
-    "#     if pd.isna(chord_string) or chord_string == '':\n",
-    "#         return []\n",
-    "    \n",
-    "#     # delete <...> lable\n",
-    "#     cleaned = re.sub(r'<[^>]+>', '', chord_string)\n",
-    "    \n",
-    "#     # delete other string\n",
-    "#     chords = [chord.strip() for chord in cleaned.split() if chord.strip()]\n",
-    "    \n",
-    "#     return chords"
+    "First step analysis of chords"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
+   "execution_count": 3,
+   "id": "48ff08d8-f8d2-472e-94c2-4f2b0fe4b2c9",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0         <intro_1> C <verse_1> F C E7 Amin C F C G7 C F...\n",
+       "1         <intro_1> E D A/Cs E D A/Cs <verse_1> E D A/Cs...\n",
+       "2         <intro_1> Csmin <verse_1> A Csmin A Csmin A Cs...\n",
+       "3         <intro_1> D Dmaj7 D Dmaj7 <verse_1> Emin A D G...\n",
+       "4         <intro_1> C <verse_1> G C G C <chorus_1> F Dmi...\n",
+       "                                ...                        \n",
+       "679802    D G D A G D A D G D A G D A D G D A G D A D G ...\n",
+       "679803    G Gadd13 G7 Gadd13 G Emin A7 Emin A7 Emin A7 E...\n",
+       "679804    E Fs E Fs E Fs E Fs E Fs E Fs B Cs Fs B Cs Fs ...\n",
+       "679805    E Csmin Fsmin B E Csmin Fsmin B E Csmin Fsmin ...\n",
+       "679806    A B7 E7 A Fs7 A E7 A D A D B7 A B7 E7 A Fs7 A ...\n",
+       "Name: chords, Length: 679807, dtype: object"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df['chords']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
    "id": "13de4c2d-7941-4f47-bb9a-4c5e2774eeb2",
    "metadata": {},
    "outputs": [],
@@ -87,6 +107,249 @@
     "}"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "4dbf80f4-1144-4c47-bfdc-351b05aebdda",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def analyze_pandas_series(series_data, top_n=50):\n",
+    "    print(\"Removing labels...\")\n",
+    "    clean_series = series_data.str.replace(r'<[^>]+>', ' ', regex=True)\n",
+    "\n",
+    "    print(\"Expliciting and counting...\")\n",
+    "    # 1. .str.split(): to list ['C', 'F', 'G']\n",
+    "    # 2. .explode(): flattens the per-song lists into one row per chord\n",
+    "    # 3. .value_counts(): count\n",
+    "    chord_counts = clean_series.str.split().explode().value_counts()\n",
+    "    \n",
+    "    print(f\"=== Finish ===\")\n",
+    "    print(f\"Chords amount: {chord_counts.sum()}\")\n",
+    "    print(f\"Chords type amount: {len(chord_counts)}\")\n",
+    "    print(\"-\" * 30)\n",
+    "    \n",
+    "    print(f\"Top {top_n} commonly used chords:\")\n",
+    "    print(chord_counts.head(top_n))\n",
+    "\n",
+    "    print(\"-\" * 30)\n",
+    "    print(\"Examples of long tail data (with fewer occurrences):\")\n",
+    "    print(chord_counts.tail(20).index.tolist())\n",
+    "    \n",
+    "    return chord_counts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "632ac7d5-4936-483b-be8b-d327dcc133ad",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dealing with slash chords...\n"
+     ]
+    }
+   ],
+   "source": [
+    "def simplify_slash_chord(text):\n",
+    "    if not isinstance(text, str): return \"\"\n",
+    "    \n",
+    "    chords = text.split()\n",
+    "    simplified = [c.split('/')[0] for c in chords]\n",
+    "    return \" \".join(simplified)\n",
+    "\n",
+    "print(\"Dealing with slash chords...\")\n",
+    "df['chords_simplified'] = df['chords'].apply(simplify_slash_chord)\n",
+    "print(\"Finish!\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "4c3336c4-e86e-4765-8c46-cb7dc3b733b0",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Removing labels...\n",
+      "Expliciting and counting...\n",
+      "=== Finish ===\n",
+      "Chords amount: 51994634\n",
+      "Chords type amount: 749\n",
+      "------------------------------\n",
+      "Top 50 commonly used chords:\n",
+      "chords_simplified\n",
+      "G        7161058\n",
+      "C        5945092\n",
+      "D        5413274\n",
+      "A        3952888\n",
+      "F        3389952\n",
+      "Amin     2958979\n",
+      "E        2719857\n",
+      "Emin     2662799\n",
+      "Bmin     1446212\n",
+      "B        1384402\n",
+      "Dmin     1265370\n",
+      "Bb       1033272\n",
+      "Fsmin     887691\n",
+      "Fs        692455\n",
+      "Csmin     613688\n",
+      "Gmin      556347\n",
+      "Eb        508770\n",
+      "Cmin      409476\n",
+      "Ab        365022\n",
+      "Cs        328877\n",
+      "Gsmin     324517\n",
+      "A7        302780\n",
+      "Fmin      294103\n",
+      "D7        288826\n",
+      "Amin7     287833\n",
+      "Emin7     284727\n",
+      "E7        271749\n",
+      "Gs        267858\n",
+      "B7        253029\n",
+      "G7        248206\n",
+      "Cadd9     235057\n",
+      "Db        194678\n",
+      "As        166455\n",
+      "Bmin7     164123\n",
+      "Fmaj7     162679\n",
+      "Dmin7     158330\n",
+      "Cmaj7     155171\n",
+      "Ds        148442\n",
+      "C7        147180\n",
+      "Bbmin     141925\n",
+      "Dsmin     115335\n",
+      "Ano3d     105842\n",
+      "Dsus4     103025\n",
+      "Dno3d     102338\n",
+      "Gb         99587\n",
+      "Gno3d      98220\n",
+      "Eno3d      90866\n",
+      "Dsus2      89184\n",
+      "Dmaj7      88411\n",
+      "Asus4      88285\n",
+      "Name: count, dtype: int64\n",
+      "------------------------------\n",
+      "Examples of long tail data (with fewer occurrences):\n",
+      "['Cs13b9', 'Edimb7', 'E11s', 'Dbmaj11', 'Bminmaj9', 'Gs13b9', 'sC', 'Fsminmaj13', 'Fsaugmaj11', 'Bbdim13b9', 'Eminmaj13', 'A11s', 'Abdim9', 'Eb11b9', 'Fsdim11', 'Bbdimadd13', 'Gdim11b9', 'Dsdim9', 'Bdim11b9', 'Fminmaj13']\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Analyze simplified chords\n",
+    "counts = analyze_pandas_series(df['chords_simplified'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "168d4085-71a8-43b0-8492-0d12ac4b32e9",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Top 50 coverage ratie: 0.9458\n",
+      "Number of long tail chords type: 699\n",
+      "\n",
+      " Long tail data example (Rank 51-70):\n",
+      "chords_simplified\n",
+      "Fsmin7    83258\n",
+      "Asus2     80833\n",
+      "Gmin7     79634\n",
+      "Gmaj7     79039\n",
+      "Ebmin     78919\n",
+      "Csmin7    72221\n",
+      "Cno3d     72148\n",
+      "Bno3d     71021\n",
+      "F7        66986\n",
+      "Asmin     64547\n",
+      "Amaj7     60432\n",
+      "Gadd13    59584\n",
+      "Fs7       59092\n",
+      "Cmin7     58856\n",
+      "Gsus4     46604\n",
+      "Esus4     46589\n",
+      "Fno3d     44630\n",
+      "Abmin     39087\n",
+      "Fmin7     38177\n",
+      "Fsno3d    36772\n",
+      "Name: count, dtype: int64\n",
+      "\n",
+      " Rare chords example:\n",
+      "chords_simplified\n",
+      "Cs13b9        1\n",
+      "Edimb7        1\n",
+      "E11s          1\n",
+      "Dbmaj11       1\n",
+      "Bminmaj9      1\n",
+      "Gs13b9        1\n",
+      "sC            1\n",
+      "Fsminmaj13    1\n",
+      "Fsaugmaj11    1\n",
+      "Bbdim13b9     1\n",
+      "Eminmaj13     1\n",
+      "A11s          1\n",
+      "Abdim9        1\n",
+      "Eb11b9        1\n",
+      "Fsdim11       1\n",
+      "Bbdimadd13    1\n",
+      "Gdim11b9      1\n",
+      "Dsdim9        1\n",
+      "Bdim11b9      1\n",
+      "Fminmaj13     1\n",
+      "Name: count, dtype: int64\n"
+     ]
+    }
+   ],
+   "source": [
+    "## Check the long tail data\n",
+    "total_count = counts.sum()\n",
+    "top_50_count = counts.head(50).sum()\n",
+    "\n",
+    "top_50_chords = set(counts.head(50).index)\n",
+    "\n",
+    "# Long tail: every chord type outside the top 50 (roughly 5% of all occurrences)\n",
+    "tail_chords = [c for c in counts.index if c not in top_50_chords]\n",
+    "\n",
+    "print(f\"Top 50 coverage ratie: {top_50_count / total_count:.4f}\")\n",
+    "print(f\"Number of long tail chords type: {len(tail_chords)}\")\n",
+    "\n",
+    "print(\"\\n Long tail data example (Rank 51-70):\")\n",
+    "print(counts.iloc[50:70])\n",
+    "\n",
+    "print(\"\\n Rare chords example:\")\n",
+    "print(counts.tail(20))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "862eff6d-8e41-4574-b6ff-7e49590403e1",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a6c82e59-af50-47c0-8e69-1b84fae2728b",
+   "metadata": {},
+   "source": [
+    "Clean chords data:\n",
+    "1. Separate each song into several segments;\n",
+    "2. Infer the key of each segment;\n",
+    "3. Transpose chords into Roman numerals."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 11,
@@ -239,11 +502,78 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 20,
+   "id": "d16ad9ab-ebe1-4cfc-86f5-0c38ece01323",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ==========================================\n",
+    "# Transpose the chords to Roman numerals\n",
+    "# ==========================================\n",
+    "\n",
+    "ROMAN_MAP = {\n",
+    "    0: 'I', 1: 'bII', 2: 'II', 3: 'bIII', 4: 'III', 5: 'IV', \n",
+    "    6: 'bV', 7: 'V', 8: 'bVI', 9: 'VI', 10: 'bVII', 11: 'VII'\n",
+    "}\n",
+    "\n",
+    "def convert_to_roman(chord_list, key_str):\n",
+    "    \"\"\"\n",
+    "    input: chord_list=['C', 'F', 'G'], key_str='C Major'\n",
+    "    output: ['I', 'IV', 'V']\n",
+    "    \"\"\"\n",
+    "    if key_str == \"Unknown\":\n",
+    "        return chord_list\n",
+    "    \n",
+    "    # Main tone\n",
+    "    key_root_str = key_str.split()[0] # \"C Major\" -> \"C\"\n",
+    "    is_minor_key = \"Minor\" in key_str\n",
+    "    \n",
+    "    if key_root_str not in NOTES: return chord_list\n",
+    "    key_root_idx = NOTES.index(key_root_str)\n",
+    "    \n",
+    "    roman_output = []\n",
+    "    \n",
+    "    for chord_str in chord_list:\n",
+    "        root, quality = parse_chord_root_quality(chord_str)\n",
+    "        if not root:\n",
+    "            roman_output.append(\"?\")\n",
+    "            continue\n",
+    "            \n",
+    "        # Calculate interval  (Root note of chord - tonic) % 12\n",
+    "        root_idx = NOTES.index(root)\n",
+    "        interval = (root_idx - key_root_idx) % 12\n",
+    "        \n",
+    "        base_roman = ROMAN_MAP.get(interval, \"?\")\n",
+    "        \n",
+    "        # min or dim -> lowercase numeral\n",
+    "        if quality == 'min' or quality == 'dim':\n",
+    "            final_roman = base_roman.lower()\n",
+    "        else:\n",
+    "            final_roman = base_roman\n",
+    "            \n",
+    "            \n",
+    "        # suffix\n",
+    "        if quality == '7':\n",
+    "            final_roman += '7'\n",
+    "        elif quality == 'dim':\n",
+    "            final_roman += '°'\n",
+    "            \n",
+    "        roman_output.append(final_roman)\n",
+    "        \n",
+    "    return roman_output"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
    "id": "681f297a-1520-4f5f-bf33-7a9af7b5ecf8",
    "metadata": {},
    "outputs": [],
    "source": [
+    "# ==========================================\n",
+    "# Fix short chords progression\n",
+    "# ==========================================\n",
+    "\n",
     "def patch_short_segments(analyzed_segments, min_chords=2):\n",
     "    \"\"\"\n",
     "    Parameters:\n",
@@ -279,78 +609,54 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
-   "id": "795661aa-e36a-40b3-8d4e-6bdd5e216566",
+   "execution_count": 21,
+   "id": "96efd397-4835-41db-9692-73a86d371f2d",
    "metadata": {},
    "outputs": [],
    "source": [
     "# ==========================================\n",
-    "# Roman Numeral Conversion\n",
+    "# Combine Roman numerals\n",
     "# ==========================================\n",
-    "ROMAN_MAP = {\n",
-    "    0: 'I', 1: 'bII', 2: 'II', 3: 'bIII', 4: 'III', 5: 'IV', \n",
-    "    6: 'bV', 7: 'V', 8: 'bVI', 9: 'VI', 10: 'bVII', 11: 'VII'\n",
-    "}\n",
     "\n",
-    "def convert_to_roman(chord_list, key_str):\n",
-    "    \"\"\"\n",
-    "    input: chord_list=['C', 'F', 'G'], key_str='C Major'\n",
-    "    output: ['I', 'IV', 'V']\n",
+    "def extract_and_format_roman(analyzed_records, include_tags=True):\n",
     "    \"\"\"\n",
-    "    if key_str == \"Unknown\":\n",
-    "        return chord_list\n",
-    "    \n",
-    "    # Analyze the main tone of tonality\n",
-    "    key_root_str = key_str.split()[0] # \"C Major\" -> \"C\"\n",
-    "    is_minor_key = \"Minor\" in key_str\n",
-    "    \n",
-    "    if key_root_str not in NOTES: return chord_list\n",
-    "    key_root_idx = NOTES.index(key_root_str)\n",
+    "    Parameters:\n",
+    "    analyzed_records: list of segment dicts, each with 'section' and 'roman' keys\n",
+    "    include_tags: \n",
+    "                  - True: Keep \"<intro> I IV <verse> I V\" (Structure + Sequence)\n",
+    "                  - False: Only \"I IV I V\" (Sequence)\n",
     "    \n",
-    "    roman_output = []\n",
+    "    Return:\n",
+    "    str: Roman numerals string\n",
+    "    \"\"\"\n",
+    "    output_parts = []\n",
     "    \n",
-    "    for chord_str in chord_list:\n",
-    "        root, quality = parse_chord_root_quality(chord_str)\n",
-    "        if not root:\n",
-    "            roman_output.append(\"?\")\n",
-    "            continue\n",
-    "            \n",
-    "        # Calculate Interval\n",
-    "        # (Root note of chord - tonic tonic) % 12\n",
-    "        root_idx = NOTES.index(root)\n",
-    "        interval = (root_idx - key_root_idx) % 12\n",
-    "        \n",
-    "        base_roman = ROMAN_MAP.get(interval, \"?\")\n",
+    "    for record in analyzed_records:\n",
+    "        roman_seq = \" \".join(record['roman'])\n",
     "        \n",
-    "        # Minor triad (min) or subtract triad (dim) -> Lowercase\n",
-    "        if quality == 'min' or quality == 'dim':\n",
-    "            final_roman = base_roman.lower()\n",
+    "        if include_tags:\n",
+    "            # \"<label> roman_seq\"\n",
+    "            segment_str = f\"<{record['section']}> {roman_seq}\"\n",
+    "            output_parts.append(segment_str)\n",
     "        else:\n",
-    "            final_roman = base_roman\n",
-    "            \n",
-    "        # Suffix\n",
-    "        if quality == '7':\n",
-    "            final_roman += '7'\n",
-    "        elif quality == 'dim':\n",
-    "            final_roman += '°'\n",
+    "            # Roman numerals\n",
+    "            output_parts.append(roman_seq)\n",
     "            \n",
-    "        roman_output.append(final_roman)\n",
-    "        \n",
-    "    return roman_output"
+    "    return \" \".join(output_parts)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "9866344c-1635-4210-ab52-273b47da3af1",
+   "id": "b4eac0b6-0a72-4a77-81bc-8375ad786819",
    "metadata": {},
    "outputs": [],
    "source": []
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
-   "id": "258f2c66-854d-4eef-9289-8025ac837ad4",
+   "execution_count": 16,
+   "id": "fffce437-c1ca-48a4-853b-b570aadf3de5",
    "metadata": {},
    "outputs": [
     {
@@ -359,7 +665,7 @@
        "'<intro_1> C <verse_1> F C E7 Amin C F C G7 C F C E7 Amin C F G7 C <verse_2> F C E7 Amin C F C G7 C F C E7 Amin C F G7 C <chorus_1> F C F C G C F C E7 Amin C F G7 C <solo_1> D <chorus_2> G D G D A D G D Fs7 Bmin D G A7 D G A7 D'"
       ]
      },
-     "execution_count": 15,
+     "execution_count": 16,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -370,8 +676,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
-   "id": "be20964f-09f4-4d5c-8376-d2c86b3eb90b",
+   "execution_count": 17,
+   "id": "9e18b5cd-41d7-4c77-a467-8ebcbbc4de6c",
    "metadata": {},
    "outputs": [
     {
@@ -450,12 +756,13 @@
        "   'D']}]"
       ]
      },
-     "execution_count": 16,
+     "execution_count": 17,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
+    "# Segmentation\n",
     "segments = segment_music_string(df['chords'][0])\n",
     "segments"
    ]
@@ -463,31 +770,14 @@
   {
    "cell_type": "code",
    "execution_count": 18,
-   "id": "df725e56-0714-40d6-93e1-8f61b983e85d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "analyzed_records = []\n",
-    "for seg in segments:\n",
-    "    inferred_key = infer_key_from_list(seg['chords'])\n",
-    "    analyzed_records.append({\n",
-    "        'label': seg['label'],\n",
-    "        'chords': seg['chords'],\n",
-    "        'key': inferred_key\n",
-    "    })"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 19,
-   "id": "578b5a47-b5b3-4b6c-a66e-9dc754b0c686",
+   "id": "96fa6a65-c63a-4a40-9d4c-b27d1e4f96ed",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "[{'label': 'intro_1', 'chords': ['C'], 'key': 'C Major'},\n",
-       " {'label': 'verse_1',\n",
+       "[{'section': 'intro_1', 'chords': ['C'], 'key': 'C Major'},\n",
+       " {'section': 'verse_1',\n",
        "  'chords': ['F',\n",
        "   'C',\n",
        "   'E7',\n",
@@ -506,7 +796,7 @@
        "   'G7',\n",
        "   'C'],\n",
        "  'key': 'C Major'},\n",
-       " {'label': 'verse_2',\n",
+       " {'section': 'verse_2',\n",
        "  'chords': ['F',\n",
        "   'C',\n",
        "   'E7',\n",
@@ -525,7 +815,7 @@
        "   'G7',\n",
        "   'C'],\n",
        "  'key': 'C Major'},\n",
-       " {'label': 'chorus_1',\n",
+       " {'section': 'chorus_1',\n",
        "  'chords': ['F',\n",
        "   'C',\n",
        "   'F',\n",
@@ -541,8 +831,8 @@
        "   'G7',\n",
        "   'C'],\n",
        "  'key': 'C Major'},\n",
-       " {'label': 'solo_1', 'chords': ['D'], 'key': 'D Major'},\n",
-       " {'label': 'chorus_2',\n",
+       " {'section': 'solo_1', 'chords': ['D'], 'key': 'D Major'},\n",
+       " {'section': 'chorus_2',\n",
        "  'chords': ['G',\n",
        "   'D',\n",
        "   'G',\n",
@@ -563,19 +853,44 @@
        "  'key': 'D Major'}]"
       ]
      },
-     "execution_count": 19,
+     "execution_count": 18,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
+    "# Infer the key\n",
+    "analyzed_records = []\n",
+    "for seg in segments:\n",
+    "    inferred_key = infer_key_from_list(seg['chords'])\n",
+    "    analyzed_records.append({\n",
+    "        'section': seg['label'],\n",
+    "        'chords': seg['chords'],\n",
+    "        'key': inferred_key\n",
+    "    })\n",
+    "\n",
     "analyzed_records"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
-   "id": "64760bf8-2949-4eee-b23d-d87b1b52ec4d",
+   "execution_count": 25,
+   "id": "ab0473b1-cbc5-4160-88f5-ac02d2de0ed5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Patch the keys of short segments\n",
+    "analyzed_records = patch_short_segments(analyzed_records, min_chords=2)\n",
+    "\n",
+    "# Transpose to Roman numerals\n",
+    "for record in analyzed_records:\n",
+    "    record['roman'] = convert_to_roman(record['chords'], record['key'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "e008b3e3-5599-405c-b13b-1e7618bd6464",
    "metadata": {
     "scrolled": true
    },
@@ -583,260 +898,185 @@
     {
      "data": {
       "text/plain": [
-       "[{'label': 'intro_1',\n",
-       "  'chords': ['C'],\n",
-       "  'key': 'C Major',\n",
-       "  'key_source': 'borrowed_next'},\n",
-       " {'label': 'verse_1',\n",
-       "  'chords': ['F',\n",
-       "   'C',\n",
-       "   'E7',\n",
-       "   'Amin',\n",
-       "   'C',\n",
-       "   'F',\n",
-       "   'C',\n",
-       "   'G7',\n",
-       "   'C',\n",
-       "   'F',\n",
-       "   'C',\n",
-       "   'E7',\n",
-       "   'Amin',\n",
-       "   'C',\n",
-       "   'F',\n",
-       "   'G7',\n",
-       "   'C'],\n",
-       "  'key': 'C Major'},\n",
-       " {'label': 'verse_2',\n",
-       "  'chords': ['F',\n",
-       "   'C',\n",
-       "   'E7',\n",
-       "   'Amin',\n",
-       "   'C',\n",
-       "   'F',\n",
-       "   'C',\n",
-       "   'G7',\n",
-       "   'C',\n",
-       "   'F',\n",
-       "   'C',\n",
-       "   'E7',\n",
-       "   'Amin',\n",
-       "   'C',\n",
-       "   'F',\n",
-       "   'G7',\n",
-       "   'C'],\n",
-       "  'key': 'C Major'},\n",
-       " {'label': 'chorus_1',\n",
-       "  'chords': ['F',\n",
-       "   'C',\n",
-       "   'F',\n",
-       "   'C',\n",
-       "   'G',\n",
-       "   'C',\n",
-       "   'F',\n",
-       "   'C',\n",
-       "   'E7',\n",
-       "   'Amin',\n",
-       "   'C',\n",
-       "   'F',\n",
-       "   'G7',\n",
-       "   'C'],\n",
-       "  'key': 'C Major'},\n",
-       " {'label': 'solo_1',\n",
-       "  'chords': ['D'],\n",
-       "  'key': 'D Major',\n",
-       "  'key_source': 'borrowed_next'},\n",
-       " {'label': 'chorus_2',\n",
-       "  'chords': ['G',\n",
-       "   'D',\n",
-       "   'G',\n",
-       "   'D',\n",
-       "   'A',\n",
-       "   'D',\n",
-       "   'G',\n",
-       "   'D',\n",
-       "   'Fs7',\n",
-       "   'Bmin',\n",
-       "   'D',\n",
-       "   'G',\n",
-       "   'A7',\n",
-       "   'D',\n",
-       "   'G',\n",
-       "   'A7',\n",
-       "   'D'],\n",
-       "  'key': 'D Major'}]"
+       "{'section': 'chorus_2',\n",
+       " 'chords': ['G',\n",
+       "  'D',\n",
+       "  'G',\n",
+       "  'D',\n",
+       "  'A',\n",
+       "  'D',\n",
+       "  'G',\n",
+       "  'D',\n",
+       "  'Fs7',\n",
+       "  'Bmin',\n",
+       "  'D',\n",
+       "  'G',\n",
+       "  'A7',\n",
+       "  'D',\n",
+       "  'G',\n",
+       "  'A7',\n",
+       "  'D'],\n",
+       " 'key': 'D Major',\n",
+       " 'roman': ['IV',\n",
+       "  'I',\n",
+       "  'IV',\n",
+       "  'I',\n",
+       "  'V',\n",
+       "  'I',\n",
+       "  'IV',\n",
+       "  'I',\n",
+       "  'III7',\n",
+       "  'vi',\n",
+       "  'I',\n",
+       "  'IV',\n",
+       "  'V7',\n",
+       "  'I',\n",
+       "  'IV',\n",
+       "  'V7',\n",
+       "  'I']}"
       ]
      },
-     "execution_count": 20,
+     "execution_count": 26,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "analyzed_records = patch_short_segments(analyzed_records, min_chords=2)\n",
-    "analyzed_records"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 39,
-   "id": "12151628-8b0f-4206-bf37-630e40f87d57",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "chords_output = []\n",
-    "for record in analyzed_records:\n",
-    "    roman = convert_to_roman(record['chords'], record['key'])\n",
-    "\n",
-    "    chords_output.append({\n",
-    "        'section': record['label'],\n",
-    "        'key': record['key'],\n",
-    "        'roman': roman\n",
-    "    })"
+    "record"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 40,
-   "id": "c5413c18-a6a0-4daa-a607-7aa607c43010",
+   "execution_count": 27,
+   "id": "b6deceeb-6964-46b0-ab27-45b6ca58192f",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "[{'section': 'intro_1', 'key': 'C Major', 'roman': ['I']},\n",
-       " {'section': 'verse_1',\n",
-       "  'key': 'C Major',\n",
-       "  'roman': ['IV',\n",
-       "   'I',\n",
-       "   'III7',\n",
-       "   'vi',\n",
-       "   'I',\n",
-       "   'IV',\n",
-       "   'I',\n",
-       "   'V7',\n",
-       "   'I',\n",
-       "   'IV',\n",
-       "   'I',\n",
-       "   'III7',\n",
-       "   'vi',\n",
-       "   'I',\n",
-       "   'IV',\n",
-       "   'V7',\n",
-       "   'I']},\n",
-       " {'section': 'verse_2',\n",
-       "  'key': 'C Major',\n",
-       "  'roman': ['IV',\n",
-       "   'I',\n",
-       "   'III7',\n",
-       "   'vi',\n",
-       "   'I',\n",
-       "   'IV',\n",
-       "   'I',\n",
-       "   'V7',\n",
-       "   'I',\n",
-       "   'IV',\n",
-       "   'I',\n",
-       "   'III7',\n",
-       "   'vi',\n",
-       "   'I',\n",
-       "   'IV',\n",
-       "   'V7',\n",
-       "   'I']},\n",
-       " {'section': 'chorus_1',\n",
-       "  'key': 'C Major',\n",
-       "  'roman': ['IV',\n",
-       "   'I',\n",
-       "   'IV',\n",
-       "   'I',\n",
-       "   'V',\n",
-       "   'I',\n",
-       "   'IV',\n",
-       "   'I',\n",
-       "   'III7',\n",
-       "   'vi',\n",
-       "   'I',\n",
-       "   'IV',\n",
-       "   'V7',\n",
-       "   'I']},\n",
-       " {'section': 'solo_1', 'key': 'D Major', 'roman': ['I']},\n",
-       " {'section': 'chorus_2',\n",
-       "  'key': 'D Major',\n",
-       "  'roman': ['IV',\n",
-       "   'I',\n",
-       "   'IV',\n",
-       "   'I',\n",
-       "   'V',\n",
-       "   'I',\n",
-       "   'IV',\n",
-       "   'I',\n",
-       "   'III7',\n",
-       "   'vi',\n",
-       "   'I',\n",
-       "   'IV',\n",
-       "   'V7',\n",
-       "   'I',\n",
-       "   'IV',\n",
-       "   'V7',\n",
-       "   'I']}]"
+       "'I IV I III7 vi I IV I V7 I IV I III7 vi I IV V7 I IV I III7 vi I IV I V7 I IV I III7 vi I IV V7 I IV I IV I V I IV I III7 vi I IV V7 I I IV I IV I V I IV I III7 vi I IV V7 I IV V7 I'"
       ]
      },
-     "execution_count": 40,
+     "execution_count": 27,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "chords_output"
+    "roman_numerals_record = extract_and_format_roman(analyzed_records, include_tags=False)\n",
+    "roman_numerals_record"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 41,
-   "id": "96efd397-4835-41db-9692-73a86d371f2d",
+   "execution_count": null,
+   "id": "1f51fc7b-718d-4ce6-b35b-c93c8148dab4",
    "metadata": {},
    "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7feb8982-308d-467d-b737-b891517ab37c",
+   "metadata": {},
    "source": [
-    "def extract_and_format_roman(analyzed_records, include_tags=True):\n",
-    "    \"\"\"\n",
-    "    Parameters:\n",
-    "    analyzed_records: chords_output in the former step\n",
-    "    include_tags: \n",
-    "                  - True: Keep \"<intro> I IV <verse> I V\" (Structure + Sequence)\n",
-    "                  - False: Only \"I IV I V\" (Sequence)\n",
-    "    \n",
-    "    Return:\n",
-    "    str: Roman numerals string\n",
-    "    \"\"\"\n",
-    "    output_parts = []\n",
-    "    \n",
-    "    for record in analyzed_records:\n",
-    "        roman_seq = \" \".join(record['roman'])\n",
+    "Coarse graining for Roman Numerals"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "795661aa-e36a-40b3-8d4e-6bdd5e216566",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class RomanNormalizer:\n",
+    "    def __init__(self, strategy='core'):\n",
+    "        \"\"\"\n",
+    "        strategy:\n",
+    "          - 'core': Keep only I, i, V7, dim\n",
+    "          - 'standard': Add sus, maj7, m7\n",
+    "          - 'dynamic': Keep all details\n",
+    "        \"\"\"\n",
+    "        self.strategy = strategy\n",
+    "\n",
+    "    def normalize(self, raw_roman):\n",
+    "        if not isinstance(raw_roman, str) or raw_roman in ['?', 'Unknown', '<ERROR>']:\n",
+    "            return None\n",
+    "\n",
+    "        # Disassemble (prefix core suffix)\n",
+    "        match = re.match(r'^([b#]*)([ivIV]+)(.*)$', raw_roman)\n",
+    "        if not match: return None\n",
     "        \n",
-    "        if include_tags:\n",
-    "            # \"<label> roman_seq\"\n",
-    "            segment_str = f\"<{record['section']}> {roman_seq}\"\n",
-    "            output_parts.append(segment_str)\n",
-    "        else:\n",
-    "            # Roman numerals\n",
-    "            output_parts.append(roman_seq)\n",
+    "        prefix, core, suffix = match.groups()\n",
+    "        is_lower = core.islower()\n",
+    "\n",
+    "        # Reorganize strings based on strategy\n",
+    "        if self.strategy == 'dynamic':\n",
+    "            return raw_roman\n",
     "            \n",
-    "    return \" \".join(output_parts)"
+    "        elif self.strategy == 'standard':\n",
+    "            new_suffix = \"\"\n",
+    "            if 'sus4' in suffix: new_suffix = 'sus4'\n",
+    "            elif 'sus2' in suffix: new_suffix = 'sus2'\n",
+    "            elif 'maj7' in suffix or 'M7' in suffix: new_suffix = 'maj7'\n",
+    "            elif 'ø' in suffix: new_suffix = 'm7b5'\n",
+    "            elif 'dim7' in suffix: new_suffix = 'dim7'\n",
+    "            elif 'dim' in suffix or '°' in suffix: new_suffix = 'dim'\n",
+    "            elif '7' in suffix: new_suffix = '7' # keep V7 or m7\n",
+    "            return f\"{prefix}{core}{new_suffix}\"\n",
+    "\n",
+    "        elif self.strategy == 'core':\n",
+    "            new_suffix = \"\"\n",
+    "            # Two situations keeping suffix:\n",
+    "            # 1. dominant 7th\n",
+    "            if not is_lower and '7' in suffix and 'maj' not in suffix:\n",
+    "                new_suffix = '7'\n",
+    "            # 2. diminished\n",
+    "            elif 'dim' in suffix or '°' in suffix:\n",
+    "                new_suffix = 'dim'\n",
+    "            # others\n",
+    "            return f\"{prefix}{core}{new_suffix}\"\n",
+    "            \n",
+    "        return raw_roman"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 35,
-   "id": "d52c8fe2-9c51-4379-9705-dc3352e6128d",
+   "execution_count": 29,
+   "id": "99ba921d-087f-4d85-9d94-5ee7e99f8d72",
    "metadata": {},
    "outputs": [],
    "source": [
-    "final_string_tagged = extract_and_format_roman(chords_output, include_tags=False)"
+    "def normalize_sequence(roman_seq_str, normalizer):\n",
+    "    \"\"\"\n",
+    "    input: \"<intro> I bVIsus4 V7\"\n",
+    "    output: \"<intro> I bVI V7\" (core strategy)\n",
+    "    \"\"\"\n",
+    "    if not isinstance(roman_seq_str, str):\n",
+    "        return \"\"\n",
+    "    \n",
+    "    tokens = roman_seq_str.split()\n",
+    "    \n",
+    "    normalized_tokens = []\n",
+    "    for token in tokens:\n",
+    "        # if label (<intro>), keep it\n",
+    "        if token.startswith('<') and token.endswith('>'):\n",
+    "            normalized_tokens.append(token)\n",
+    "        else:\n",
+    "            norm = normalizer.normalize(token)\n",
+    "            if norm:\n",
+    "                normalized_tokens.append(norm)\n",
+    "    \n",
+    "    # join the tokens back into a single space-separated string\n",
+    "    return \" \".join(normalized_tokens)\n",
+    "\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 42,
-   "id": "cb3754f8-a2e0-4472-9fa3-4025e6c33127",
+   "execution_count": 30,
+   "id": "04d0d98e-d7ec-4687-922d-1ea0c253b68d",
    "metadata": {},
    "outputs": [
     {
@@ -848,27 +1088,96 @@
     }
    ],
    "source": [
-    "print(extract_and_format_roman(chords_output, include_tags=False))"
+    "normalizer = RomanNormalizer(strategy='core')\n",
+    "\n",
+    "normalized_roman = normalize_sequence(roman_numerals_record, normalizer)\n",
+    "print(normalized_roman) "
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 43,
-   "id": "abb1c572-56d5-4454-a218-140767a3b79f",
+   "execution_count": null,
+   "id": "7ee0d24f-691c-4183-890d-64fe33fb0a7d",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e057a68f-db0d-4753-a800-96ac255d95ca",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "<intro_1> I <verse_1> IV I III7 vi I IV I V7 I IV I III7 vi I IV V7 I <verse_2> IV I III7 vi I IV I V7 I IV I III7 vi I IV V7 I <chorus_1> IV I IV I V I IV I III7 vi I IV V7 I <solo_1> I <chorus_2> IV I IV I V I IV I III7 vi I IV V7 I IV V7 I\n"
-     ]
-    }
-   ],
    "source": [
-    "print(extract_and_format_roman(chords_output, include_tags=True))"
+    "Tool: Roman Numerals to scales (Used as a dictionary)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "9866344c-1635-4210-ab52-273b47da3af1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def map_roman_to_pitches(roman_str):\n",
+    "    \"\"\"\n",
+    "    Roman numerals to music scales [0-11].\n",
+    "    \"\"\"\n",
+    "    if not roman_str: return []\n",
+    "\n",
+    "    ROMAN_OFFSETS = {'i': 0, 'ii': 2, 'iii': 4, 'iv': 5, 'v': 7, 'vi': 9, 'vii': 11}\n",
+    "    \n",
+    "    INTERVALS = {\n",
+    "        'maj': [0, 4, 7], 'min': [0, 3, 7], 'dim': [0, 3, 6],\n",
+    "        'dom7': [0, 4, 7, 10], 'maj7': [0, 4, 7, 11], 'min7': [0, 3, 7, 10],\n",
+    "        'dim7': [0, 3, 6, 9], 'm7b5': [0, 3, 6, 10],\n",
+    "        'sus4': [0, 5, 7], 'sus2': [0, 2, 7]\n",
+    "    }\n",
+    "    \n",
+    "    EXTENSIONS = {'9': 2, '11': 5, '13': 9, 'b9': 1, '#9': 3}\n",
+    "\n",
+    "    # Analyze\n",
+    "    match = re.match(r'^([b#]*)([ivIV]+)(.*)$', roman_str)\n",
+    "    if not match: return []\n",
+    "    prefix, core, suffix = match.groups()\n",
+    "\n",
+    "    # Root\n",
+    "    base = ROMAN_OFFSETS.get(core.lower())\n",
+    "    if base is None: return []\n",
+    "    acc = -1 if prefix=='b' else (1 if prefix=='#' else 0)\n",
+    "    root = (base + acc) % 12\n",
+    "\n",
+    "    # Basic interval\n",
+    "    is_lower = core.islower()\n",
+    "\n",
+    "    intervals = []\n",
+    "    if 'sus4' in suffix: intervals = INTERVALS['sus4']\n",
+    "    elif 'sus2' in suffix: intervals = INTERVALS['sus2']\n",
+    "    elif 'maj7' in suffix: intervals = INTERVALS['maj7']\n",
+    "    elif 'dim7' in suffix: intervals = INTERVALS['dim7']\n",
+    "    elif 'dim' in suffix: intervals = INTERVALS['dim']\n",
+    "    elif '7' in suffix: intervals = INTERVALS['min7'] if is_lower else INTERVALS['dom7']\n",
+    "    else: intervals = INTERVALS['min'] if is_lower else INTERVALS['maj']\n",
+    "    \n",
+    "    current_intervals = list(intervals)\n",
+    "\n",
+    "    # Deal with extensions\n",
+    "    for ext, val in EXTENSIONS.items():\n",
+    "        if ext in suffix and val not in current_intervals:\n",
+    "            current_intervals.append(val)\n",
+    "\n",
+    "    # (Root + Interval) % 12\n",
+    "    pitches = sorted(list(set([(root + i) % 12 for i in current_intervals])))\n",
+    "    \n",
+    "    return pitches"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9e2f171e-8233-4ae5-b30a-7e92c83e6237",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
   {
    "cell_type": "code",
    "execution_count": null,

From 03e956be48f84966918b499dbf3608860a68aa7f Mon Sep 17 00:00:00 2001
From: pierrick <pierrick.lry@gmail.com>
Date: Wed, 19 Nov 2025 13:36:40 +0100
Subject: [PATCH 03/14] dataset integration first part

---
 configs/dataset/hypergraph/chordonomicon.yaml |  34 ++++++
 topobench/data/datasets/chordonomicon.py      | 115 ++++++++++++++++++
 .../hypergraph/chordonomicon_loader.py        |  26 ++++
 3 files changed, 175 insertions(+)
 create mode 100644 configs/dataset/hypergraph/chordonomicon.yaml
 create mode 100644 topobench/data/datasets/chordonomicon.py
 create mode 100644 topobench/data/loaders/hypergraph/chordonomicon_loader.py

diff --git a/configs/dataset/hypergraph/chordonomicon.yaml b/configs/dataset/hypergraph/chordonomicon.yaml
new file mode 100644
index 00000000..4dc3fa46
--- /dev/null
+++ b/configs/dataset/hypergraph/chordonomicon.yaml
@@ -0,0 +1,34 @@
+# Dataset loader config
+loader:
+  _target_: topobench.data.loaders.ChordonomiconDatasetLoader
+  parameters: 
+    data_domain: hypergraph
+    data_type: chords
+    data_name: chordonomicon
+    data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type}
+
+# Dataset parameters
+parameters:
+  num_features: 1
+  num_classes: 1
+  # num_nodes: 3224
+  task: regression
+  loss_type: mse
+  monitor_metric: mae
+  task_level: edge
+
+#splits
+split_params:
+  learning_setting: transductive
+  data_seed: 0
+  split_type: random #'k-fold' # either "k-fold" or "random" strategies
+  k: 10 # for "k-fold" Cross-Validation
+  train_prop: 0.5 # for "random" strategy splitting
+  standardize: True
+  data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name}
+
+# Dataloader parameters
+dataloader_params:
+  batch_size: 1 # Fixed since transductive
+  num_workers: 0
+  pin_memory: False
diff --git a/topobench/data/datasets/chordonomicon.py b/topobench/data/datasets/chordonomicon.py
new file mode 100644
index 00000000..8d007b74
--- /dev/null
+++ b/topobench/data/datasets/chordonomicon.py
@@ -0,0 +1,115 @@
+"""Dataset class for Chordonomicon dataset."""
+
+import ast
+import os
+import os.path as osp
+
+import numpy as np
+import pandas as pd
+import requests
+import torch
+from torch_geometric.data import Data, InMemoryDataset, extract_zip
+from torch_geometric.io import fs
+
+
+class ChordonomiconDataset(InMemoryDataset):
+    """Dataset class for Chordonomicon dataset.
+
+    Parameters
+    ----------
+    data_dir : str
+        Directory where the dataset will be stored, raw
+        and processed will be subdirectories.
+    data_name : str
+        Name of the dataset (e.g., 'Chordonomicon').
+    """
+
+    URL = "https://huggingface.co/datasets/PierrickLeKing/topobench-music-synergy/resolve/main/dataframe.zip"  # pylint: disable=line-too-long
+    RAW_FILE_NAMES = ["dataframe.csv"]
+
+    def __init__(self, data_dir, data_name):
+        self.name = data_name
+        self.data_dir = data_dir
+        self.folder_chordonomicon = osp.join(self.data_dir, self.name)
+        self.root = osp.join(data_dir, data_name)
+        super().__init__(self.root)
+
+    def download(self) -> None:
+        """Download the Chordonomicon dataset.
+
+        Raises:
+            requests.exceptions.HTTPError: If the download fails.
+        """
+        print("Downloading...")
+        r = requests.get(self.URL, timeout=30)
+        r.raise_for_status()
+        with open(
+            osp.join(self.folder_chordonomicon, "dataframe.zip"), "wb"
+        ) as f:
+            f.write(r.content)
+        extract_zip(
+            osp.join(self.folder_chordonomicon, "dataframe.zip"),
+            osp.join(self.folder_chordonomicon, "raw"),
+        )
+        os.unlink(osp.join(self.folder_chordonomicon, "dataframe.zip"))
+
+    def process(self) -> None:
+        """Handle the Chordonomicon dataset.
+
+        Convert the raw data into a PyTorch Geometric Data object and save it.
+        """
+        df = pd.read_csv(
+            osp.join(self.folder_chordonomicon, "raw", self.RAW_FILE_NAMES[0])
+        )
+        df["chords"] = (
+            df["chords"].apply(ast.literal_eval).apply(list).apply(np.array)
+        )
+        t1 = torch.from_numpy(np.concatenate(df["chords"].values))
+        t2 = torch.tensor(df["chords"].apply(len).values)
+        indices = torch.stack(
+            (t1, torch.repeat_interleave(torch.arange(len(t2)), t2))
+        )
+        incidence_hyperedges = torch.sparse_coo_tensor(
+            indices, torch.ones(indices.shape[1])
+        ).coalesce()
+        x_hyperedges = torch.tensor(df["frequency"].values).unsqueeze(1)
+        y_hyperedges = torch.tensor(df["local_o_info"].values)
+        data = Data(
+            incidence_hyperedges=incidence_hyperedges,
+            num_hyperedges=incidence_hyperedges.size(1),
+            x_hyperedges=x_hyperedges,
+            y_hyperedges=y_hyperedges,
+        )
+        data_list = [data]
+        self.data, self.slices = self.collate(data_list)
+        fs.torch_save(
+            (
+                self._data.to_dict(),
+                self.slices,
+                {},
+                self._data.__class__,
+            ),
+            self.processed_paths[0],
+        )
+
+    @property
+    def raw_file_names(self) -> list[str]:
+        """Return the raw file names for the dataset.
+
+        Returns
+        -------
+        list[str]
+            List of raw file names.
+        """
+        return self.RAW_FILE_NAMES
+
+    @property
+    def processed_file_names(self) -> str:
+        """Return the processed file name for the dataset.
+
+        Returns
+        -------
+        str
+            Processed file name.
+        """
+        return "data.pt"
diff --git a/topobench/data/loaders/hypergraph/chordonomicon_loader.py b/topobench/data/loaders/hypergraph/chordonomicon_loader.py
new file mode 100644
index 00000000..ea7f5691
--- /dev/null
+++ b/topobench/data/loaders/hypergraph/chordonomicon_loader.py
@@ -0,0 +1,26 @@
+"""Loader for Chordonomicon dataset."""
+
+from topobench.data.datasets import ChordonomiconDataset
+from topobench.data.loaders.base import AbstractLoader
+
+
+class ChordonomiconDatasetLoader(AbstractLoader):
+    """Loader class for Chordonomicon dataset.
+
+    Args:
+        - parameters (DictConfig): Loader parameters.
+            - data_dir (str): Root directory where the dataset folder is stored.
+            - data_name (str): Name of the dataset.
+    """
+
+    def load_dataset(self) -> ChordonomiconDataset:
+        """Load the Chordonomicon dataset.
+
+        Returns
+        -------
+        ChordonomiconDataset
+            The loaded Chordonomicon dataset.
+        """
+        return ChordonomiconDataset(
+            data_dir=self.root_data_dir, data_name=self.parameters.data_name
+        )

From 0921b3180e16d078090e3bf6cade163d12209d29 Mon Sep 17 00:00:00 2001
From: pierrick <pierrick.lry@gmail.com>
Date: Wed, 19 Nov 2025 17:22:15 +0100
Subject: [PATCH 04/14] correct none data problem

---
 topobench/data/datasets/chordonomicon.py | 63 +++++++++++++++++-------
 1 file changed, 44 insertions(+), 19 deletions(-)

diff --git a/topobench/data/datasets/chordonomicon.py b/topobench/data/datasets/chordonomicon.py
index 8d007b74..7d321c41 100644
--- a/topobench/data/datasets/chordonomicon.py
+++ b/topobench/data/datasets/chordonomicon.py
@@ -17,30 +17,32 @@ class ChordonomiconDataset(InMemoryDataset):
 
     Parameters
     ----------
-    data_dir : str
+    root : str
         Directory where the dataset will be stored, raw
-        and processed will be subdirectories.
-    data_name : str
+        and processed will be subdirectories of it.
+    name : str
         Name of the dataset (e.g., 'Chordonomicon').
     """
 
     URL = "https://huggingface.co/datasets/PierrickLeKing/topobench-music-synergy/resolve/main/dataframe.zip"  # pylint: disable=line-too-long
-    RAW_FILE_NAMES = ["dataframe.csv"]
 
-    def __init__(self, data_dir, data_name):
-        self.name = data_name
-        self.data_dir = data_dir
-        self.folder_chordonomicon = osp.join(self.data_dir, self.name)
-        self.root = osp.join(data_dir, data_name)
-        super().__init__(self.root)
+    def __init__(self, root, name):
+        self.name = name
+        self.root = root
+        self.folder_chordonomicon = osp.join(self.root, self.name)
+        super().__init__(
+            root,
+        )
+        out = fs.torch_load(self.processed_paths[0])
+        data, self.slices, self.sizes, data_cls = out
+        self.data = data_cls.from_dict(data)
 
-    def download(self) -> None:
+    def download(self):
         """Download the Chordonomicon dataset.
 
         Raises:
             requests.exceptions.HTTPError: If the download fails.
         """
-        print("Downloading...")
         r = requests.get(self.URL, timeout=30)
         r.raise_for_status()
         with open(
@@ -53,13 +55,13 @@ def download(self) -> None:
         )
         os.unlink(osp.join(self.folder_chordonomicon, "dataframe.zip"))
 
-    def process(self) -> None:
+    def process(self):
         """Handle the Chordonomicon dataset.
 
         Convert the raw data into a PyTorch Geometric Data object and save it.
         """
         df = pd.read_csv(
-            osp.join(self.folder_chordonomicon, "raw", self.RAW_FILE_NAMES[0])
+            osp.join(self.folder_chordonomicon, "raw", self.raw_file_names[0])
         )
         df["chords"] = (
             df["chords"].apply(ast.literal_eval).apply(list).apply(np.array)
@@ -80,14 +82,15 @@ def process(self) -> None:
             x_hyperedges=x_hyperedges,
             y_hyperedges=y_hyperedges,
         )
+        print("Balise 3: data created", data)
         data_list = [data]
-        self.data, self.slices = self.collate(data_list)
+        data, slices = self.collate(data_list)
         fs.torch_save(
             (
-                self._data.to_dict(),
-                self.slices,
+                data.to_dict(),
+                slices,
                 {},
-                self._data.__class__,
+                data.__class__,
             ),
             self.processed_paths[0],
         )
@@ -101,7 +104,7 @@ def raw_file_names(self) -> list[str]:
         list[str]
             List of raw file names.
         """
-        return self.RAW_FILE_NAMES
+        return ["dataframe.csv"]
 
     @property
     def processed_file_names(self) -> str:
@@ -113,3 +116,25 @@ def processed_file_names(self) -> str:
             Processed file name.
         """
         return "data.pt"
+
+    @property
+    def raw_dir(self) -> str:
+        """Return the path to the raw directory of the dataset.
+
+        Returns
+        -------
+        str
+            Path to the raw directory.
+        """
+        return osp.join(self.root, self.name, "raw")
+
+    @property
+    def processed_dir(self) -> str:
+        """Return the path to the processed directory of the dataset.
+
+        Returns
+        -------
+        str
+            Path to the processed directory.
+        """
+        return osp.join(self.root, self.name, "processed")

From 16bb74e3dbfc5e29fbddb68f123d2290def7983f Mon Sep 17 00:00:00 2001
From: pierrick <pierrick.lry@gmail.com>
Date: Fri, 21 Nov 2025 12:21:42 +0100
Subject: [PATCH 05/14] slight dataset modifications

---
 configs/dataset/hypergraph/chordonomicon.yaml             | 2 +-
 topobench/data/datasets/chordonomicon.py                  | 3 ++-
 topobench/data/loaders/hypergraph/chordonomicon_loader.py | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/configs/dataset/hypergraph/chordonomicon.yaml b/configs/dataset/hypergraph/chordonomicon.yaml
index 4dc3fa46..7c203e0c 100644
--- a/configs/dataset/hypergraph/chordonomicon.yaml
+++ b/configs/dataset/hypergraph/chordonomicon.yaml
@@ -24,7 +24,7 @@ split_params:
   split_type: random #'k-fold' # either "k-fold" or "random" strategies
   k: 10 # for "k-fold" Cross-Validation
   train_prop: 0.5 # for "random" strategy splitting
-  standardize: True
+  standardize: False
   data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name}
 
 # Dataloader parameters
diff --git a/topobench/data/datasets/chordonomicon.py b/topobench/data/datasets/chordonomicon.py
index 7d321c41..42e41e1e 100644
--- a/topobench/data/datasets/chordonomicon.py
+++ b/topobench/data/datasets/chordonomicon.py
@@ -81,8 +81,9 @@ def process(self):
             num_hyperedges=incidence_hyperedges.size(1),
             x_hyperedges=x_hyperedges,
             y_hyperedges=y_hyperedges,
+            y=y_hyperedges,
+            x=torch.eye(incidence_hyperedges.size(0)),
         )
-        print("Balise 3: data created", data)
         data_list = [data]
         data, slices = self.collate(data_list)
         fs.torch_save(
diff --git a/topobench/data/loaders/hypergraph/chordonomicon_loader.py b/topobench/data/loaders/hypergraph/chordonomicon_loader.py
index ea7f5691..2e04809f 100644
--- a/topobench/data/loaders/hypergraph/chordonomicon_loader.py
+++ b/topobench/data/loaders/hypergraph/chordonomicon_loader.py
@@ -22,5 +22,5 @@ def load_dataset(self) -> ChordonomiconDataset:
             The loaded Chordonomicon dataset.
         """
         return ChordonomiconDataset(
-            data_dir=self.root_data_dir, data_name=self.parameters.data_name
+            root=self.root_data_dir, name=self.parameters.data_name
         )

From aa50f0c6efd850bab5d9dec234e9a2692c746430 Mon Sep 17 00:00:00 2001
From: pierrick <pierrick.lry@gmail.com>
Date: Fri, 21 Nov 2025 12:24:02 +0100
Subject: [PATCH 06/14] bypass restrictive task levels if using the dummy
 readout NoReadOut

---
 topobench/nn/readouts/base.py      | 4 +++-
 topobench/nn/readouts/identical.py | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/topobench/nn/readouts/base.py b/topobench/nn/readouts/base.py
index 6fdd8412..d6704b12 100755
--- a/topobench/nn/readouts/base.py
+++ b/topobench/nn/readouts/base.py
@@ -42,7 +42,9 @@ def __init__(
             if hidden_dim != out_channels or logits_linear_layer
             else torch.nn.Identity()
         )
-        assert task_level in ["graph", "node"], "Invalid task_level"
+        assert task_level in ["graph", "node"] or self.name == "NoReadOut", (
+            "Invalid task_level"
+        )
         self.task_level = task_level
         self.logits_linear_layer = logits_linear_layer
 
diff --git a/topobench/nn/readouts/identical.py b/topobench/nn/readouts/identical.py
index 723eb9ca..10877388 100644
--- a/topobench/nn/readouts/identical.py
+++ b/topobench/nn/readouts/identical.py
@@ -17,6 +17,7 @@ class NoReadOut(AbstractZeroCellReadOut):
     """
 
     def __init__(self, **kwargs):
+        self.name = "NoReadOut"
         super().__init__(**kwargs)
 
     def forward(

From 5739eb3bccea552196a28e8216e788c5e5a0a820 Mon Sep 17 00:00:00 2001
From: xuanchen-liu-97 <xuanchen.liu.97@gmail.com>
Date: Sat, 22 Nov 2025 12:18:55 +0000
Subject: [PATCH 07/14] Add files via upload

---
 TDL-chords-data-cleaning.ipynb | 1088 ++++++++++++++++++++++++++++++--
 1 file changed, 1051 insertions(+), 37 deletions(-)

diff --git a/TDL-chords-data-cleaning.ipynb b/TDL-chords-data-cleaning.ipynb
index 78a60266..5a0060ed 100644
--- a/TDL-chords-data-cleaning.ipynb
+++ b/TDL-chords-data-cleaning.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 36,
    "id": "cc404196-bd66-43b2-8f3b-9602f6ccf58f",
    "metadata": {},
    "outputs": [],
@@ -10,12 +10,14 @@
     "import numpy as np\n",
     "import pandas as pd\n",
     "import re\n",
+    "import pickle\n",
+    "from tqdm import tqdm\n",
     "from collections import defaultdict"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "id": "aeaa2ed4-6d9e-4cd7-90f3-402cd3e92440",
    "metadata": {},
    "outputs": [],
@@ -26,7 +28,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
    "id": "94d6aced-7399-4917-bd60-a3997d5831af",
    "metadata": {},
    "outputs": [
@@ -34,7 +36,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "C:\\Users\\liu.xuanc\\AppData\\Local\\Temp\\ipykernel_11624\\2761146993.py:1: DtypeWarning: Columns (2,3,5,6,7,8,9) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+      "C:\\Users\\liu.xuanc\\AppData\\Local\\Temp\\ipykernel_28784\\2761146993.py:1: DtypeWarning: Columns (2,3,5,6,7,8,9) have mixed types. Specify dtype option on import or set low_memory=False.\n",
       "  df = pd.read_csv(\"hf://datasets/ailsntua/Chordonomicon/chordonomicon_v2.csv\")\n"
      ]
     }
@@ -61,7 +63,148 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 5,
+   "id": "8594f632-38dd-48b8-905d-89aa12613d9c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>id</th>\n",
+       "      <th>chords</th>\n",
+       "      <th>release_date</th>\n",
+       "      <th>genres</th>\n",
+       "      <th>decade</th>\n",
+       "      <th>rock_genre</th>\n",
+       "      <th>artist_id</th>\n",
+       "      <th>main_genre</th>\n",
+       "      <th>spotify_song_id</th>\n",
+       "      <th>spotify_artist_id</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>&lt;intro_1&gt; C &lt;verse_1&gt; F C E7 Amin C F C G7 C F...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>'classic country pop'</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>artist_1</td>\n",
+       "      <td>pop</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>4AIEGdwDzPELXYgM5JaEY5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>2</td>\n",
+       "      <td>&lt;intro_1&gt; E D A/Cs E D A/Cs &lt;verse_1&gt; E D A/Cs...</td>\n",
+       "      <td>2003-01-01</td>\n",
+       "      <td>'alternative metal' 'alternative rock' 'nu met...</td>\n",
+       "      <td>2000.0</td>\n",
+       "      <td>pop rock</td>\n",
+       "      <td>artist_2</td>\n",
+       "      <td>metal</td>\n",
+       "      <td>2ffJZ2r8HxI5DHcmf3BO6c</td>\n",
+       "      <td>694QW15WkebjcrWgQHzRYF</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>3</td>\n",
+       "      <td>&lt;intro_1&gt; Csmin &lt;verse_1&gt; A Csmin A Csmin A Cs...</td>\n",
+       "      <td>2003-01-01</td>\n",
+       "      <td>'alternative metal' 'canadian rock' 'funk meta...</td>\n",
+       "      <td>2000.0</td>\n",
+       "      <td>canadian rock</td>\n",
+       "      <td>artist_3</td>\n",
+       "      <td>metal</td>\n",
+       "      <td>5KiY8SZEnvCPyIEkFGRR3y</td>\n",
+       "      <td>0niJkG4tKkne3zwr7I8n9n</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>4</td>\n",
+       "      <td>&lt;intro_1&gt; D Dmaj7 D Dmaj7 &lt;verse_1&gt; Emin A D G...</td>\n",
+       "      <td>2022-09-23</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>2020.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>artist_4</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>01TtAcUqyLCRBZq4ZZiQWS</td>\n",
+       "      <td>17BfKBemmMGO5ZAK25wraW</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>5</td>\n",
+       "      <td>&lt;intro_1&gt; C &lt;verse_1&gt; G C G C &lt;chorus_1&gt; F Dmi...</td>\n",
+       "      <td>2023-02-10</td>\n",
+       "      <td>'modern country pop'</td>\n",
+       "      <td>2020.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>artist_5</td>\n",
+       "      <td>pop</td>\n",
+       "      <td>3zUecdrWC3IqrNSjhnoF3G</td>\n",
+       "      <td>4GGfAshSkqoxpZdoaHm7ky</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   id                                             chords release_date  \\\n",
+       "0   1  <intro_1> C <verse_1> F C E7 Amin C F C G7 C F...          NaN   \n",
+       "1   2  <intro_1> E D A/Cs E D A/Cs <verse_1> E D A/Cs...   2003-01-01   \n",
+       "2   3  <intro_1> Csmin <verse_1> A Csmin A Csmin A Cs...   2003-01-01   \n",
+       "3   4  <intro_1> D Dmaj7 D Dmaj7 <verse_1> Emin A D G...   2022-09-23   \n",
+       "4   5  <intro_1> C <verse_1> G C G C <chorus_1> F Dmi...   2023-02-10   \n",
+       "\n",
+       "                                              genres  decade     rock_genre  \\\n",
+       "0                              'classic country pop'     NaN            NaN   \n",
+       "1  'alternative metal' 'alternative rock' 'nu met...  2000.0       pop rock   \n",
+       "2  'alternative metal' 'canadian rock' 'funk meta...  2000.0  canadian rock   \n",
+       "3                                                NaN  2020.0            NaN   \n",
+       "4                               'modern country pop'  2020.0            NaN   \n",
+       "\n",
+       "  artist_id main_genre         spotify_song_id       spotify_artist_id  \n",
+       "0  artist_1        pop                     NaN  4AIEGdwDzPELXYgM5JaEY5  \n",
+       "1  artist_2      metal  2ffJZ2r8HxI5DHcmf3BO6c  694QW15WkebjcrWgQHzRYF  \n",
+       "2  artist_3      metal  5KiY8SZEnvCPyIEkFGRR3y  0niJkG4tKkne3zwr7I8n9n  \n",
+       "3  artist_4        NaN  01TtAcUqyLCRBZq4ZZiQWS  17BfKBemmMGO5ZAK25wraW  \n",
+       "4  artist_5        pop  3zUecdrWC3IqrNSjhnoF3G  4GGfAshSkqoxpZdoaHm7ky  "
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
    "id": "48ff08d8-f8d2-472e-94c2-4f2b0fe4b2c9",
    "metadata": {},
    "outputs": [
@@ -82,7 +225,7 @@
        "Name: chords, Length: 679807, dtype: object"
       ]
      },
-     "execution_count": 3,
+     "execution_count": 4,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -93,7 +236,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 6,
    "id": "13de4c2d-7941-4f47-bb9a-4c5e2774eeb2",
    "metadata": {},
    "outputs": [],
@@ -109,7 +252,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 7,
    "id": "4dbf80f4-1144-4c47-bfdc-351b05aebdda",
    "metadata": {},
    "outputs": [],
@@ -141,7 +284,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 8,
    "id": "632ac7d5-4936-483b-be8b-d327dcc133ad",
    "metadata": {},
    "outputs": [
@@ -149,7 +292,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Dealing with slash chords...\n"
+      "Dealing with slash chords...\n",
+      "Finish!\n"
      ]
     }
    ],
@@ -502,7 +646,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 14,
    "id": "d16ad9ab-ebe1-4cfc-86f5-0c38ece01323",
    "metadata": {},
    "outputs": [],
@@ -565,7 +709,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 15,
    "id": "681f297a-1520-4f5f-bf33-7a9af7b5ecf8",
    "metadata": {},
    "outputs": [],
@@ -609,7 +753,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 16,
    "id": "96efd397-4835-41db-9692-73a86d371f2d",
    "metadata": {},
    "outputs": [],
@@ -655,7 +799,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 17,
    "id": "fffce437-c1ca-48a4-853b-b570aadf3de5",
    "metadata": {},
    "outputs": [
@@ -665,7 +809,7 @@
        "'<intro_1> C <verse_1> F C E7 Amin C F C G7 C F C E7 Amin C F G7 C <verse_2> F C E7 Amin C F C G7 C F C E7 Amin C F G7 C <chorus_1> F C F C G C F C E7 Amin C F G7 C <solo_1> D <chorus_2> G D G D A D G D Fs7 Bmin D G A7 D G A7 D'"
       ]
      },
-     "execution_count": 16,
+     "execution_count": 17,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -676,7 +820,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 18,
    "id": "9e18b5cd-41d7-4c77-a467-8ebcbbc4de6c",
    "metadata": {},
    "outputs": [
@@ -756,7 +900,7 @@
        "   'D']}]"
       ]
      },
-     "execution_count": 17,
+     "execution_count": 18,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -769,7 +913,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 19,
    "id": "96fa6a65-c63a-4a40-9d4c-b27d1e4f96ed",
    "metadata": {},
    "outputs": [
@@ -853,7 +997,7 @@
        "  'key': 'D Major'}]"
       ]
      },
-     "execution_count": 18,
+     "execution_count": 19,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -874,7 +1018,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 20,
    "id": "ab0473b1-cbc5-4160-88f5-ac02d2de0ed5",
    "metadata": {},
    "outputs": [],
@@ -889,7 +1033,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 21,
    "id": "e008b3e3-5599-405c-b13b-1e7618bd6464",
    "metadata": {
     "scrolled": true
@@ -936,7 +1080,7 @@
        "  'I']}"
       ]
      },
-     "execution_count": 26,
+     "execution_count": 21,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -947,7 +1091,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 22,
    "id": "b6deceeb-6964-46b0-ab27-45b6ca58192f",
    "metadata": {},
    "outputs": [
@@ -957,7 +1101,7 @@
        "'I IV I III7 vi I IV I V7 I IV I III7 vi I IV V7 I IV I III7 vi I IV I V7 I IV I III7 vi I IV V7 I IV I IV I V I IV I III7 vi I IV V7 I I IV I IV I V I IV I III7 vi I IV V7 I IV V7 I'"
       ]
      },
-     "execution_count": 27,
+     "execution_count": 22,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -985,7 +1129,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 24,
    "id": "795661aa-e36a-40b3-8d4e-6bdd5e216566",
    "metadata": {},
    "outputs": [],
@@ -1043,7 +1187,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 25,
    "id": "99ba921d-087f-4d85-9d94-5ee7e99f8d72",
    "metadata": {},
    "outputs": [],
@@ -1069,8 +1213,7 @@
     "                normalized_tokens.append(norm)\n",
     "    \n",
     "    # append to a string list\n",
-    "    return \" \".join(normalized_tokens)\n",
-    "\n"
+    "    return \" \".join(normalized_tokens)"
    ]
   },
   {
@@ -1112,16 +1255,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 41,
    "id": "9866344c-1635-4210-ab52-273b47da3af1",
    "metadata": {},
    "outputs": [],
    "source": [
     "def map_roman_to_pitches(roman_str):\n",
-    "    \"\"\"\n",
-    "    Roman numerals to music scales [0-11].\n",
-    "    \"\"\"\n",
-    "    if not roman_str: return []\n",
+    "    \n",
+    "    # [Fix 1] If the input is empty, return ([], None) instead of []\n",
+    "    if not roman_str: \n",
+    "        return [], None\n",
     "\n",
     "    ROMAN_OFFSETS = {'i': 0, 'ii': 2, 'iii': 4, 'iv': 5, 'v': 7, 'vi': 9, 'vii': 11}\n",
     "    \n",
@@ -1136,12 +1279,20 @@
     "\n",
     "    # Analyze\n",
     "    match = re.match(r'^([b#]*)([ivIV]+)(.*)$', roman_str)\n",
-    "    if not match: return []\n",
+    "    \n",
+    "    # [Fix 2] If the regex does not match, return ([], None) instead of []\n",
+    "    if not match: \n",
+    "        return [], None\n",
+    "        \n",
     "    prefix, core, suffix = match.groups()\n",
     "\n",
     "    # Root\n",
     "    base = ROMAN_OFFSETS.get(core.lower())\n",
-    "    if base is None: return []\n",
+    "    \n",
+    "    # [Fix 3] If the root cannot be resolved, return ([], None) instead of []\n",
+    "    if base is None: \n",
+    "        return [], None\n",
+    "        \n",
     "    acc = -1 if prefix=='b' else (1 if prefix=='#' else 0)\n",
     "    root = (base + acc) % 12\n",
     "\n",
@@ -1167,23 +1318,886 @@
     "    # (Root + Interval) % 12\n",
     "    pitches = sorted(list(set([(root + i) % 12 for i in current_intervals])))\n",
     "    \n",
-    "    return pitches"
+    "    vector = np.zeros(12, dtype=int)\n",
+    "    vector[pitches] = 1\n",
+    "    \n",
+    "    return pitches, vector"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "9e2f171e-8233-4ae5-b30a-7e92c83e6237",
+   "id": "6c61ddf4-01d6-4ce3-9410-8682be06cb8b",
    "metadata": {},
    "outputs": [],
    "source": []
   },
+  {
+   "cell_type": "markdown",
+   "id": "25339757-c287-442c-8761-ebd4b734d490",
+   "metadata": {},
+   "source": [
+    "Main"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "id": "2df6fdf1-743e-4d4b-8018-2daeb64926de",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tqdm.pandas(desc=\"Processing\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "id": "1e3f91d3-f0e7-48ff-aad7-1f31feef22b2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ==========================================\n",
+    "# Roman Numeral Column\n",
+    "# ==========================================\n",
+    "def generate_roman_seq(raw_chord_str):\n",
+    "    \n",
+    "    if not isinstance(raw_chord_str, str) or not raw_chord_str:\n",
+    "        return \"\"\n",
+    "\n",
+    "    try:\n",
+    "        segments = segment_music_string(raw_chord_str)\n",
+    "        if not segments:\n",
+    "            return \"\"\n",
+    "\n",
+    "        analyzed_records = []\n",
+    "        for seg in segments:\n",
+    "            inferred_key = infer_key_from_list(seg['chords'])\n",
+    "            analyzed_records.append({\n",
+    "                'section': seg['label'],\n",
+    "                'chords': seg['chords'],\n",
+    "                'key': inferred_key\n",
+    "            })\n",
+    "\n",
+    "        analyzed_records = patch_short_segments(analyzed_records, min_chords=2)\n",
+    "\n",
+    "        for record in analyzed_records:\n",
+    "            record['roman'] = convert_to_roman(record['chords'], record['key'])\n",
+    "\n",
+    "        return extract_and_format_roman(analyzed_records, include_tags=True)\n",
+    "\n",
+    "    except Exception as e:\n",
+    "        return \"\"\n",
+    "\n",
+    "# ==========================================\n",
+    "# One-Hot Matrix Column\n",
+    "# ==========================================\n",
+    "def generate_chroma_matrix(roman_seq_str):\n",
+    "\n",
+    "    if not isinstance(roman_seq_str, str) or not roman_seq_str:\n",
+    "        return np.array([])\n",
+    "\n",
+    "    # explicit string\n",
+    "    tokens = roman_seq_str.split()\n",
+    "    \n",
+    "    matrix_rows = []\n",
+    "    \n",
+    "    for token in tokens:\n",
+    "        # label filter\n",
+    "        if token.startswith('<') and token.endswith('>'):\n",
+    "            continue\n",
+    "            \n",
+    "        _, vector = map_roman_to_pitches(token)\n",
+    "        \n",
+    "        # check vector\n",
+    "        if vector is not None:\n",
+    "            matrix_rows.append(vector)\n",
+    "            \n",
+    "    if not matrix_rows:\n",
+    "        return np.array([])\n",
+    "\n",
+    "    return np.vstack(matrix_rows)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "f080f918-35de-4543-9c66-287c94ce3019",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"Step 1: Generating Roman Numeral sequences...\")\n",
+    "df['roman_seq'] = df['chords_simplified'].progress_apply(generate_roman_seq)\n",
+    "\n",
+    "print(\"Finish Step 1!\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "id": "0ce79ed3-9858-48fa-ab7b-97fa478f1d00",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>id</th>\n",
+       "      <th>chords</th>\n",
+       "      <th>release_date</th>\n",
+       "      <th>genres</th>\n",
+       "      <th>decade</th>\n",
+       "      <th>rock_genre</th>\n",
+       "      <th>artist_id</th>\n",
+       "      <th>main_genre</th>\n",
+       "      <th>spotify_song_id</th>\n",
+       "      <th>spotify_artist_id</th>\n",
+       "      <th>chords_simplified</th>\n",
+       "      <th>roman_seq</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>&lt;intro_1&gt; C &lt;verse_1&gt; F C E7 Amin C F C G7 C F...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>'classic country pop'</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>artist_1</td>\n",
+       "      <td>pop</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>4AIEGdwDzPELXYgM5JaEY5</td>\n",
+       "      <td>&lt;intro_1&gt; C &lt;verse_1&gt; F C E7 Amin C F C G7 C F...</td>\n",
+       "      <td>&lt;intro_1&gt; I &lt;verse_1&gt; IV I III7 vi I IV I V7 I...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>2</td>\n",
+       "      <td>&lt;intro_1&gt; E D A/Cs E D A/Cs &lt;verse_1&gt; E D A/Cs...</td>\n",
+       "      <td>2003-01-01</td>\n",
+       "      <td>'alternative metal' 'alternative rock' 'nu met...</td>\n",
+       "      <td>2000.0</td>\n",
+       "      <td>pop rock</td>\n",
+       "      <td>artist_2</td>\n",
+       "      <td>metal</td>\n",
+       "      <td>2ffJZ2r8HxI5DHcmf3BO6c</td>\n",
+       "      <td>694QW15WkebjcrWgQHzRYF</td>\n",
+       "      <td>&lt;intro_1&gt; E D A E D A &lt;verse_1&gt; E D A E D A E ...</td>\n",
+       "      <td>&lt;intro_1&gt; V IV I V IV I &lt;verse_1&gt; V IV I V IV ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>3</td>\n",
+       "      <td>&lt;intro_1&gt; Csmin &lt;verse_1&gt; A Csmin A Csmin A Cs...</td>\n",
+       "      <td>2003-01-01</td>\n",
+       "      <td>'alternative metal' 'canadian rock' 'funk meta...</td>\n",
+       "      <td>2000.0</td>\n",
+       "      <td>canadian rock</td>\n",
+       "      <td>artist_3</td>\n",
+       "      <td>metal</td>\n",
+       "      <td>5KiY8SZEnvCPyIEkFGRR3y</td>\n",
+       "      <td>0niJkG4tKkne3zwr7I8n9n</td>\n",
+       "      <td>&lt;intro_1&gt; Csmin &lt;verse_1&gt; A Csmin A Csmin A Cs...</td>\n",
+       "      <td>&lt;intro_1&gt; i &lt;verse_1&gt; bVI i bVI i bVI i bVI bV...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>4</td>\n",
+       "      <td>&lt;intro_1&gt; D Dmaj7 D Dmaj7 &lt;verse_1&gt; Emin A D G...</td>\n",
+       "      <td>2022-09-23</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>2020.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>artist_4</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>01TtAcUqyLCRBZq4ZZiQWS</td>\n",
+       "      <td>17BfKBemmMGO5ZAK25wraW</td>\n",
+       "      <td>&lt;intro_1&gt; D Dmaj7 D Dmaj7 &lt;verse_1&gt; Emin A D G...</td>\n",
+       "      <td>&lt;intro_1&gt; I I I I &lt;verse_1&gt; ii V I IV ii V I I...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>5</td>\n",
+       "      <td>&lt;intro_1&gt; C &lt;verse_1&gt; G C G C &lt;chorus_1&gt; F Dmi...</td>\n",
+       "      <td>2023-02-10</td>\n",
+       "      <td>'modern country pop'</td>\n",
+       "      <td>2020.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>artist_5</td>\n",
+       "      <td>pop</td>\n",
+       "      <td>3zUecdrWC3IqrNSjhnoF3G</td>\n",
+       "      <td>4GGfAshSkqoxpZdoaHm7ky</td>\n",
+       "      <td>&lt;intro_1&gt; C &lt;verse_1&gt; G C G C &lt;chorus_1&gt; F Dmi...</td>\n",
+       "      <td>&lt;intro_1&gt; I &lt;verse_1&gt; V I V I &lt;chorus_1&gt; IV ii...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   id                                             chords release_date  \\\n",
+       "0   1  <intro_1> C <verse_1> F C E7 Amin C F C G7 C F...          NaN   \n",
+       "1   2  <intro_1> E D A/Cs E D A/Cs <verse_1> E D A/Cs...   2003-01-01   \n",
+       "2   3  <intro_1> Csmin <verse_1> A Csmin A Csmin A Cs...   2003-01-01   \n",
+       "3   4  <intro_1> D Dmaj7 D Dmaj7 <verse_1> Emin A D G...   2022-09-23   \n",
+       "4   5  <intro_1> C <verse_1> G C G C <chorus_1> F Dmi...   2023-02-10   \n",
+       "\n",
+       "                                              genres  decade     rock_genre  \\\n",
+       "0                              'classic country pop'     NaN            NaN   \n",
+       "1  'alternative metal' 'alternative rock' 'nu met...  2000.0       pop rock   \n",
+       "2  'alternative metal' 'canadian rock' 'funk meta...  2000.0  canadian rock   \n",
+       "3                                                NaN  2020.0            NaN   \n",
+       "4                               'modern country pop'  2020.0            NaN   \n",
+       "\n",
+       "  artist_id main_genre         spotify_song_id       spotify_artist_id  \\\n",
+       "0  artist_1        pop                     NaN  4AIEGdwDzPELXYgM5JaEY5   \n",
+       "1  artist_2      metal  2ffJZ2r8HxI5DHcmf3BO6c  694QW15WkebjcrWgQHzRYF   \n",
+       "2  artist_3      metal  5KiY8SZEnvCPyIEkFGRR3y  0niJkG4tKkne3zwr7I8n9n   \n",
+       "3  artist_4        NaN  01TtAcUqyLCRBZq4ZZiQWS  17BfKBemmMGO5ZAK25wraW   \n",
+       "4  artist_5        pop  3zUecdrWC3IqrNSjhnoF3G  4GGfAshSkqoxpZdoaHm7ky   \n",
+       "\n",
+       "                                   chords_simplified  \\\n",
+       "0  <intro_1> C <verse_1> F C E7 Amin C F C G7 C F...   \n",
+       "1  <intro_1> E D A E D A <verse_1> E D A E D A E ...   \n",
+       "2  <intro_1> Csmin <verse_1> A Csmin A Csmin A Cs...   \n",
+       "3  <intro_1> D Dmaj7 D Dmaj7 <verse_1> Emin A D G...   \n",
+       "4  <intro_1> C <verse_1> G C G C <chorus_1> F Dmi...   \n",
+       "\n",
+       "                                           roman_seq  \n",
+       "0  <intro_1> I <verse_1> IV I III7 vi I IV I V7 I...  \n",
+       "1  <intro_1> V IV I V IV I <verse_1> V IV I V IV ...  \n",
+       "2  <intro_1> i <verse_1> bVI i bVI i bVI i bVI bV...  \n",
+       "3  <intro_1> I I I I <verse_1> ii V I IV ii V I I...  \n",
+       "4  <intro_1> I <verse_1> V I V I <chorus_1> IV ii...  "
+      ]
+     },
+     "execution_count": 40,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "id": "df151696-e585-475d-9964-a952125f2bd3",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Step 2: Generating One-Hot Chroma Matrices...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Processing: 100%|████████████████████████████████████████████████████████████| 679807/679807 [07:58<00:00, 1420.14it/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"Step 2: Generating One-Hot Chroma Matrices...\")\n",
+    "df['chroma_matrix'] = df['roman_seq'].progress_apply(generate_chroma_matrix)\n",
+    "\n",
+    "print(\"Finish Step 2!\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "id": "f62af199-6dc3-46ec-a9c5-e3649bca484a",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>id</th>\n",
+       "      <th>chords</th>\n",
+       "      <th>release_date</th>\n",
+       "      <th>genres</th>\n",
+       "      <th>decade</th>\n",
+       "      <th>rock_genre</th>\n",
+       "      <th>artist_id</th>\n",
+       "      <th>main_genre</th>\n",
+       "      <th>spotify_song_id</th>\n",
+       "      <th>spotify_artist_id</th>\n",
+       "      <th>chords_simplified</th>\n",
+       "      <th>roman_seq</th>\n",
+       "      <th>chroma_matrix</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>&lt;intro_1&gt; C &lt;verse_1&gt; F C E7 Amin C F C G7 C F...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>'classic country pop'</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>artist_1</td>\n",
+       "      <td>pop</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>4AIEGdwDzPELXYgM5JaEY5</td>\n",
+       "      <td>&lt;intro_1&gt; C &lt;verse_1&gt; F C E7 Amin C F C G7 C F...</td>\n",
+       "      <td>&lt;intro_1&gt; I &lt;verse_1&gt; IV I III7 vi I IV I V7 I...</td>\n",
+       "      <td>[[1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0], [1, 0, ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>2</td>\n",
+       "      <td>&lt;intro_1&gt; E D A/Cs E D A/Cs &lt;verse_1&gt; E D A/Cs...</td>\n",
+       "      <td>2003-01-01</td>\n",
+       "      <td>'alternative metal' 'alternative rock' 'nu met...</td>\n",
+       "      <td>2000.0</td>\n",
+       "      <td>pop rock</td>\n",
+       "      <td>artist_2</td>\n",
+       "      <td>metal</td>\n",
+       "      <td>2ffJZ2r8HxI5DHcmf3BO6c</td>\n",
+       "      <td>694QW15WkebjcrWgQHzRYF</td>\n",
+       "      <td>&lt;intro_1&gt; E D A E D A &lt;verse_1&gt; E D A E D A E ...</td>\n",
+       "      <td>&lt;intro_1&gt; V IV I V IV I &lt;verse_1&gt; V IV I V IV ...</td>\n",
+       "      <td>[[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1], [1, 0, ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>3</td>\n",
+       "      <td>&lt;intro_1&gt; Csmin &lt;verse_1&gt; A Csmin A Csmin A Cs...</td>\n",
+       "      <td>2003-01-01</td>\n",
+       "      <td>'alternative metal' 'canadian rock' 'funk meta...</td>\n",
+       "      <td>2000.0</td>\n",
+       "      <td>canadian rock</td>\n",
+       "      <td>artist_3</td>\n",
+       "      <td>metal</td>\n",
+       "      <td>5KiY8SZEnvCPyIEkFGRR3y</td>\n",
+       "      <td>0niJkG4tKkne3zwr7I8n9n</td>\n",
+       "      <td>&lt;intro_1&gt; Csmin &lt;verse_1&gt; A Csmin A Csmin A Cs...</td>\n",
+       "      <td>&lt;intro_1&gt; i &lt;verse_1&gt; bVI i bVI i bVI i bVI bV...</td>\n",
+       "      <td>[[1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0], [1, 0, ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>4</td>\n",
+       "      <td>&lt;intro_1&gt; D Dmaj7 D Dmaj7 &lt;verse_1&gt; Emin A D G...</td>\n",
+       "      <td>2022-09-23</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>2020.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>artist_4</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>01TtAcUqyLCRBZq4ZZiQWS</td>\n",
+       "      <td>17BfKBemmMGO5ZAK25wraW</td>\n",
+       "      <td>&lt;intro_1&gt; D Dmaj7 D Dmaj7 &lt;verse_1&gt; Emin A D G...</td>\n",
+       "      <td>&lt;intro_1&gt; I I I I &lt;verse_1&gt; ii V I IV ii V I I...</td>\n",
+       "      <td>[[1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0], [1, 0, ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>5</td>\n",
+       "      <td>&lt;intro_1&gt; C &lt;verse_1&gt; G C G C &lt;chorus_1&gt; F Dmi...</td>\n",
+       "      <td>2023-02-10</td>\n",
+       "      <td>'modern country pop'</td>\n",
+       "      <td>2020.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>artist_5</td>\n",
+       "      <td>pop</td>\n",
+       "      <td>3zUecdrWC3IqrNSjhnoF3G</td>\n",
+       "      <td>4GGfAshSkqoxpZdoaHm7ky</td>\n",
+       "      <td>&lt;intro_1&gt; C &lt;verse_1&gt; G C G C &lt;chorus_1&gt; F Dmi...</td>\n",
+       "      <td>&lt;intro_1&gt; I &lt;verse_1&gt; V I V I &lt;chorus_1&gt; IV ii...</td>\n",
+       "      <td>[[1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0], [0, 0, ...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   id                                             chords release_date  \\\n",
+       "0   1  <intro_1> C <verse_1> F C E7 Amin C F C G7 C F...          NaN   \n",
+       "1   2  <intro_1> E D A/Cs E D A/Cs <verse_1> E D A/Cs...   2003-01-01   \n",
+       "2   3  <intro_1> Csmin <verse_1> A Csmin A Csmin A Cs...   2003-01-01   \n",
+       "3   4  <intro_1> D Dmaj7 D Dmaj7 <verse_1> Emin A D G...   2022-09-23   \n",
+       "4   5  <intro_1> C <verse_1> G C G C <chorus_1> F Dmi...   2023-02-10   \n",
+       "\n",
+       "                                              genres  decade     rock_genre  \\\n",
+       "0                              'classic country pop'     NaN            NaN   \n",
+       "1  'alternative metal' 'alternative rock' 'nu met...  2000.0       pop rock   \n",
+       "2  'alternative metal' 'canadian rock' 'funk meta...  2000.0  canadian rock   \n",
+       "3                                                NaN  2020.0            NaN   \n",
+       "4                               'modern country pop'  2020.0            NaN   \n",
+       "\n",
+       "  artist_id main_genre         spotify_song_id       spotify_artist_id  \\\n",
+       "0  artist_1        pop                     NaN  4AIEGdwDzPELXYgM5JaEY5   \n",
+       "1  artist_2      metal  2ffJZ2r8HxI5DHcmf3BO6c  694QW15WkebjcrWgQHzRYF   \n",
+       "2  artist_3      metal  5KiY8SZEnvCPyIEkFGRR3y  0niJkG4tKkne3zwr7I8n9n   \n",
+       "3  artist_4        NaN  01TtAcUqyLCRBZq4ZZiQWS  17BfKBemmMGO5ZAK25wraW   \n",
+       "4  artist_5        pop  3zUecdrWC3IqrNSjhnoF3G  4GGfAshSkqoxpZdoaHm7ky   \n",
+       "\n",
+       "                                   chords_simplified  \\\n",
+       "0  <intro_1> C <verse_1> F C E7 Amin C F C G7 C F...   \n",
+       "1  <intro_1> E D A E D A <verse_1> E D A E D A E ...   \n",
+       "2  <intro_1> Csmin <verse_1> A Csmin A Csmin A Cs...   \n",
+       "3  <intro_1> D Dmaj7 D Dmaj7 <verse_1> Emin A D G...   \n",
+       "4  <intro_1> C <verse_1> G C G C <chorus_1> F Dmi...   \n",
+       "\n",
+       "                                           roman_seq  \\\n",
+       "0  <intro_1> I <verse_1> IV I III7 vi I IV I V7 I...   \n",
+       "1  <intro_1> V IV I V IV I <verse_1> V IV I V IV ...   \n",
+       "2  <intro_1> i <verse_1> bVI i bVI i bVI i bVI bV...   \n",
+       "3  <intro_1> I I I I <verse_1> ii V I IV ii V I I...   \n",
+       "4  <intro_1> I <verse_1> V I V I <chorus_1> IV ii...   \n",
+       "\n",
+       "                                       chroma_matrix  \n",
+       "0  [[1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0], [1, 0, ...  \n",
+       "1  [[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1], [1, 0, ...  \n",
+       "2  [[1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0], [1, 0, ...  \n",
+       "3  [[1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0], [1, 0, ...  \n",
+       "4  [[1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0], [0, 0, ...  "
+      ]
+     },
+     "execution_count": 44,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "id": "a2047927-ebcc-48d6-942b-e02ce6ab6665",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([[1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
+       "       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],\n",
+       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
+       "       [0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1],\n",
+       "       [1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0],\n",
+       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
+       "       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],\n",
+       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
+       "       [0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1],\n",
+       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
+       "       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],\n",
+       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
+       "       [0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1],\n",
+       "       [1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0],\n",
+       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
+       "       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],\n",
+       "       [0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1],\n",
+       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
+       "       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],\n",
+       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
+       "       [0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1],\n",
+       "       [1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0],\n",
+       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
+       "       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],\n",
+       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
+       "       [0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1],\n",
+       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
+       "       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],\n",
+       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
+       "       [0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1],\n",
+       "       [1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0],\n",
+       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
+       "       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],\n",
+       "       [0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1],\n",
+       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
+       "       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],\n",
+       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
+       "       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],\n",
+       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
+       "       [0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1],\n",
+       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
+       "       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],\n",
+       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
+       "       [0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1],\n",
+       "       [1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0],\n",
+       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
+       "       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],\n",
+       "       [0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1],\n",
+       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
+       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
+       "       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],\n",
+       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
+       "       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],\n",
+       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
+       "       [0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1],\n",
+       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
+       "       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],\n",
+       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
+       "       [0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1],\n",
+       "       [1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0],\n",
+       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
+       "       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],\n",
+       "       [0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1],\n",
+       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
+       "       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],\n",
+       "       [0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1],\n",
+       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0]])"
+      ]
+     },
+     "execution_count": 47,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df['chroma_matrix'][0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
    "id": "6d084835-4b26-4921-b12d-6c16a978b7c1",
    "metadata": {},
    "outputs": [],
+   "source": [
+    "# with open(\"C:/Users/liu.xuanc/Desktop/Code/TLDChallenge/test_chords_matrix.pkl\", 'wb') as f:\n",
+    "#     pickle.dump(df['chroma_matrix'][0], f)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "id": "76006541-0468-4dfd-a64a-0a3083280b1d",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "19a2269b-be8e-4e2b-a9e3-a7d3a5c0c397",
+   "metadata": {},
+   "source": [
+    "Calculation tasks:\n",
+    "1. Frequency of each hyperedge\n",
+    "2. O-information (global and local)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "fd7751a2-a48e-4e05-bc46-55b6f9c6394d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def analyze_roman_statistics(roman_series, top_n=50):\n",
+    "    \"\"\"\n",
+    "    Statistics for the Roman Numeral Chord Sequence.\n",
+    "    1. Remove structure tags (e.g. <intro>)\n",
+    "    2. Count the occurrences and the relative frequency (%) of each chord\n",
+    "    \"\"\"\n",
+    "    print(\"Processing data and counting frequencies...\")\n",
+    "    \n",
+    "    # 1. Split each string into a list of words\n",
+    "    # 2. explode() expands the list into a long column\n",
+    "    all_tokens = roman_series.str.split().explode()\n",
+    "    \n",
+    "    # 3. Filter: Keep only content that does NOT start with '<' (remove tags)\n",
+    "    # ~ means logical NOT\n",
+    "    chord_tokens = all_tokens[~all_tokens.str.startswith('<', na=False)]\n",
+    "    \n",
+    "    # 4. Count occurrences of each chord token\n",
+    "    counts = chord_tokens.value_counts()\n",
+    "    \n",
+    "    # 5. Calculate frequency (percentage)\n",
+    "    total_count = counts.sum()\n",
+    "    frequencies = (counts / total_count) * 100\n",
+    "    \n",
+    "    # 6. Build result DataFrame\n",
+    "    stats_df = pd.DataFrame({\n",
+    "        'Count': counts,\n",
+    "        'Frequency (%)': frequencies.round(4) # Keep 4 decimal places\n",
+    "    })\n",
+    "    \n",
+    "    print(f\"=== Statistics Report ===\")\n",
+    "    print(f\"Total Chord Tokens: {total_count}\")\n",
+    "    print(f\"Unique Chord Types: {len(counts)}\")\n",
+    "    print(\"-\" * 30)\n",
+    "    print(f\"Top {top_n} Roman Numerals:\")\n",
+    "    print(stats_df.head(top_n))\n",
+    "    \n",
+    "    return stats_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8f76e0bd-1e21-485b-8b62-7e58713a76db",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ==========================================\n",
+    "# Run the analysis\n",
+    "# ==========================================\n",
+    "\n",
+    "# Make sure you have executed the step to generate 'roman_seq' before this\n",
+    "# df['roman_seq'] is the column we generated in the previous step\n",
+    "roman_stats = analyze_roman_statistics(df['roman_seq'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8b609dde-9c58-48a1-8a42-13f66bcc65df",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "roman_stats['Pitches'] = roman_stats.index.map(lambda x: map_roman_to_pitches(x)[0])\n",
+    "other_cols = [c for c in roman_stats.columns if c != 'Pitches']\n",
+    "new_order = ['Pitches'] + other_cols\n",
+    "roman_stats = roman_stats[new_order]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ab6923ee-3abf-433d-8bd6-ae4a3ed5b707",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def calculate_inclusive_statistics(stats_df):\n",
+    "    \"\"\"\n",
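+    "    Compute the inclusive count (Inclusive Frequency) of each chord.\n",
+    "    Logic: if the pitch set of chord A is a subset of that of chord B (A ⊆ B),\n",
+    "           then B's occurrence count is added to A's.\n",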
+    "    \"\"\"\n",
+    "    # 1. Work on a copy so the original data is not modified\n",
+    "    df_incl = stats_df.copy()\n",
+    "    \n",
+    "    # Initialize the new column: the inclusive count starts equal to the raw count\n",
+    "    df_incl['Inclusive_Count'] = df_incl['Count']\n",
+    "    \n",
+    "    # 2. Prepare data: convert Pitches to Python sets, which greatly speeds up the comparisons\n",
+    "    # Dict layout: {'I': {0, 4, 7}, 'V7': {2, 5, 7, 11}, ...}\n",
+    "    chord_definitions = {\n",
+    "        label: set(pitches) \n",
+    "        for label, pitches in df_incl['Pitches'].items()\n",
+    "    }\n",
+    "    \n",
+    "    # Get the raw counts as a dict for fast lookup\n",
+    "    raw_counts = df_incl['Count'].to_dict()\n",
+    "    \n",
+    "    print(f\"正在计算 {len(chord_definitions)} 个和弦类型之间的包含关系...\")\n",
+    "    \n",
+    "    # 3. Nested loop over all chord pairs (N * N)\n",
+    "    # This is a simple matrix traversal and is very fast for N=750\n",
+    "    \n",
+    "    # Iterate over every potential subset chord (Child)\n",
+    "    for child_label, child_notes in tqdm(chord_definitions.items()):\n",
+    "        added_count = 0\n",
+    "        \n",
+    "        # Iterate over every potential superset chord (Parent)\n",
+    "        for parent_label, parent_notes in chord_definitions.items():\n",
+    "            \n",
+    "            # Skip itself (its own count is already included in Inclusive_Count)\n",
+    "            if child_label == parent_label:\n",
+    "                continue\n",
+    "            \n",
+    "            # Core check: is Child fully contained in Parent?\n",
+    "            # issubset() is the mathematical A ⊆ B\n",
+    "            if child_notes.issubset(parent_notes):\n",
+    "                # If so, credit Parent's raw count to Child\n",
+    "                added_count += raw_counts[parent_label]\n",
+    "        \n",
+    "        # Update the DataFrame\n",
+    "        df_incl.at[child_label, 'Inclusive_Count'] += added_count\n",
+    "\n",
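+    "    # 4. Recompute the frequency\n",
+    "    # Note: the denominator is still the total token count, so the inclusive frequencies sum to more than 100%;\n",
+    "    # each value represents the overall probability that this harmonic structure appears.\n",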
+    "    total_tokens = stats_df['Count'].sum()\n",
+    "    df_incl['Inclusive_Freq (%)'] = (df_incl['Inclusive_Count'] / total_tokens) * 100\n",
+    "    \n",
+    "    # 5. Reorder the columns, putting the Inclusive ones first for readability\n",
+    "    cols = ['Inclusive_Count', 'Inclusive_Freq (%)', 'Count', 'Frequency (%)', 'Pitches']\n",
+    "    # Keep only the columns that exist\n",
+    "    cols = [c for c in cols if c in df_incl.columns]\n",
+    "    \n",
+    "    # Sort by inclusive count in descending order\n",
+    "    return df_incl[cols].sort_values(by='Inclusive_Count', ascending=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4b424d62-d430-4426-b41a-9b1db60b5d20",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 1. 创建副本并初始化新列\n",
+    "# 我们直接操作 roman_stats，先创建一个 Inclusive_Count 列，初始值等于原始 Count\n",
+    "roman_stats['Inclusive_Count'] = roman_stats['Count']\n",
+    "\n",
+    "# 2. 准备数据：将 Pitches 转换为 Python 集合 (Set) 以加速比对\n",
+    "# 生成一个字典: {'I': {0, 4, 7}, 'V7': {2, 5, 7, 11}, ...}\n",
+    "chord_definitions = {\n",
+    "    label: set(pitches) \n",
+    "    for label, pitches in roman_stats['Pitches'].items()\n",
+    "}\n",
+    "\n",
+    "# 获取原始计数的字典，方便快速查找，避免在循环中反复访问 DataFrame\n",
+    "raw_counts = roman_stats['Count'].to_dict()\n",
+    "\n",
+    "print(f\"正在计算 {len(chord_definitions)} 个和弦类型之间的包含关系...\")\n",
+    "\n",
+    "# 3. 双重循环计算包含关系\n",
+    "# 外层循环：遍历每一个“潜在的子集和弦” (Child)\n",
+    "for child_label, child_notes in tqdm(chord_definitions.items()):\n",
+    "    added_count = 0\n",
+    "    \n",
+    "    # 内层循环：遍历每一个“潜在的父集和弦” (Parent)\n",
+    "    for parent_label, parent_notes in chord_definitions.items():\n",
+    "        \n",
+    "        # 跳过自己\n",
+    "        if child_label == parent_label:\n",
+    "            continue\n",
+    "        \n",
+    "        # 核心判断：Child 的音阶是否完全包含于 Parent 中？\n",
+    "        # issubset() 是集合运算 A ⊆ B\n",
+    "        if child_notes.issubset(parent_notes):\n",
+    "            # 如果包含，累加 Parent 的原始次数\n",
+    "            added_count += raw_counts[parent_label]\n",
+    "    \n",
+    "    # 将累加的额外次数加到 Inclusive_Count 中\n",
+    "    roman_stats.at[child_label, 'Inclusive_Count'] += added_count\n",
+    "\n",
+    "# 4. 计算包含性频率 (Inclusive Frequency)\n",
+    "# 分母使用原始的总和弦数 (Frequency 可能会超过 100%)\n",
+    "total_original_tokens = roman_stats['Count'].sum()\n",
+    "roman_stats['Inclusive_Freq (%)'] = (roman_stats['Inclusive_Count'] / total_original_tokens) * 100\n",
+    "\n",
+    "# 5. 整理列顺序并排序\n",
+    "# 把 Inclusive 的数据放到前面方便观察\n",
+    "cols = ['Pitches', 'Inclusive_Count', 'Inclusive_Freq (%)', 'Count', 'Frequency (%)']\n",
+    "roman_stats = roman_stats[cols].sort_values(by='Inclusive_Count', ascending=False)\n",
+    "\n",
+    "# 6. 查看结果\n",
+    "print(\"\\n=== 更新后的统计表 (Top 10) ===\")\n",
+    "print(roman_stats.head(10))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5d50de13-005d-4066-ab9a-7d24a64ffc75",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "08f82abf-66d1-4227-a714-b0564c54c06f",
+   "metadata": {},
+   "source": [
+    "Testing task:\n",
+    "1. The big matrix containing all songs\n",
+    "2. Divided big matrixes containing songs in their genre"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d7d6aef8-4ac9-48f5-8110-cdc3f7646f4b",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3ba99f27-1991-41e7-a183-85df750886f3",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "622bb96f-c985-4e4e-a800-e26edca92e85",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2b9ee7ba-0c9a-4894-b060-c3c98178a2da",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c34f672f-e010-43b4-8d5b-650591993a78",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e31a10d4-f5af-4fdc-9ee5-e7242ab8acab",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d6959d67-a0a0-4242-84b5-53f318d46a24",
+   "metadata": {},
+   "outputs": [],
    "source": []
   }
  ],

From 5a5fb103a6048fef31213d4df7c3c45bdb509d94 Mon Sep 17 00:00:00 2001
From: pierrick <pierrick.lry@gmail.com>
Date: Sat, 22 Nov 2025 16:21:31 +0100
Subject: [PATCH 08/14] allow edge task level with noreadout

---
 topobench/nn/readouts/base.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/topobench/nn/readouts/base.py b/topobench/nn/readouts/base.py
index d6704b12..a2f20cdc 100755
--- a/topobench/nn/readouts/base.py
+++ b/topobench/nn/readouts/base.py
@@ -42,9 +42,9 @@ def __init__(
             if hidden_dim != out_channels or logits_linear_layer
             else torch.nn.Identity()
         )
-        assert task_level in ["graph", "node"] or self.name == "NoReadOut", (
-            "Invalid task_level"
-        )
+        assert task_level in ["graph", "node"] or (
+            self.name == "NoReadOut" and task_level == "edge"
+        ), "Invalid task_level"
         self.task_level = task_level
         self.logits_linear_layer = logits_linear_layer
 

From c30a047e4640b0caf306c630418f6342040ca9dc Mon Sep 17 00:00:00 2001
From: pierrick <pierrick.lry@gmail.com>
Date: Mon, 24 Nov 2025 00:48:01 +0100
Subject: [PATCH 09/14] uploaded real data, pipeline passes

---
 configs/dataset/hypergraph/chordonomicon.yaml |   5 +-
 test/pipeline/test_pipeline.py                | 149 +++++++++++++++---
 topobench/data/datasets/chordonomicon.py      |   8 +-
 3 files changed, 139 insertions(+), 23 deletions(-)

diff --git a/configs/dataset/hypergraph/chordonomicon.yaml b/configs/dataset/hypergraph/chordonomicon.yaml
index 7c203e0c..007a6435 100644
--- a/configs/dataset/hypergraph/chordonomicon.yaml
+++ b/configs/dataset/hypergraph/chordonomicon.yaml
@@ -11,7 +11,8 @@ loader:
 parameters:
   num_features: 1
   num_classes: 1
-  # num_nodes: 3224
+  num_edge_features: 1
+  num_node_features: 12
   task: regression
   loss_type: mse
   monitor_metric: mae
@@ -23,7 +24,7 @@ split_params:
   data_seed: 0
   split_type: random #'k-fold' # either "k-fold" or "random" strategies
   k: 10 # for "k-fold" Cross-Validation
-  train_prop: 0.5 # for "random" strategy splitting
+  train_prop: 0.8 # for "random" strategy splitting
   standardize: False
   data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name}
 
diff --git a/test/pipeline/test_pipeline.py b/test/pipeline/test_pipeline.py
index 78598715..00c964d0 100644
--- a/test/pipeline/test_pipeline.py
+++ b/test/pipeline/test_pipeline.py
@@ -2,10 +2,17 @@
 
 import hydra
 from test._utils.simplified_pipeline import run
-
-
-DATASET = "graph/MUTAG"                                                 # ADD YOUR DATASET HERE
-MODELS   = ["graph/gcn", "cell/topotune", "simplicial/topotune"]        # ADD ONE OR SEVERAL MODELS OF YOUR CHOICE HERE
+import lightning as pl
+import torch
+from omegaconf import OmegaConf
+from hydra.utils import instantiate
+from topobench.data.preprocessor import PreProcessor
+from topobench.dataloader import TBDataloader
+from topobench.loss.loss import TBLoss
+from topobench.model.model import TBModel
+from topobench.evaluator.evaluator import TBEvaluator
+from topobench.nn.readouts import identical
+from topobench.optimizer import TBOptimizer
 
 
 class TestPipeline:
@@ -17,19 +24,123 @@ def setup_method(self):
     
     def test_pipeline(self):
         """Test pipeline."""
-        with hydra.initialize(config_path="../../configs", job_name="job"):
-            for MODEL in MODELS:
-                cfg = hydra.compose(
-                    config_name="run.yaml",
-                    overrides=[
-                        f"model={MODEL}",
-                        f"dataset={DATASET}", # IF YOU IMPLEMENT A LARGE DATASET WITH AN OPTION TO USE A SLICE OF IT, ADD BELOW THE CORRESPONDING OPTION
-                        "trainer.max_epochs=2",
-                        "trainer.min_epochs=1",
-                        "trainer.check_val_every_n_epoch=1",
-                        "paths=test",
-                        "callbacks=model_checkpoint",
-                    ],
-                    return_hydra_config=True
+        config_dataset = OmegaConf.load("configs/dataset/hypergraph/chordonomicon.yaml")
+        config_dataset.split_params.data_split_dir = "datasets/data_splits/chordonomicon"
+        config_dataset.loader.parameters.data_dir = "datasets/hypergraph/chords"
+        config_loader = {"_target_":"topobench.data.loaders.ChordonomiconDatasetLoader",
+                        "parameters":
+                        {"data_domain": "hypergraph",
+                            "data_type": "chords",
+                            "data_name": "chordonomicon",
+                            "data_dir": "datasets/hypergraph/chords"
+                            }
+                        }
+
+        config_evaluator = {"task": "regression",
+                            "num_classes": config_dataset.parameters.num_classes,
+                            "metrics": ["rmse", "mse", "mae"]}
+
+        config_loss = {
+            "dataset_loss": 
+                {
+                    "task": "regression", 
+                    "loss_type": "mse"
+                }
+        }
+
+        config_readout = {
+            "hidden_dim": config_dataset.parameters.num_classes,
+            "out_channels": config_dataset.parameters.num_classes,
+            "task_level": config_dataset.parameters.task_level,
+            "logits_linear_layer": False,
+        }
+
+        config_optimizer = {"optimizer_id": "Adam",
+                            "parameters":
+                                {"lr": 0.01,"weight_decay": 0.0005}
+                            }
+
+        # backbone class definition
+        class ModelPipeLine(pl.LightningModule):
+            """Custom model pipeline for testing.
+
+            Parameters
+            ----------
+            dim_in_node : int
+                Dimension of input node features.
+            dim_hidden : int
+                Dimension of hidden layers.
+            dim_out : int
+                Dimension of output features.
+            """
+            def __init__(self,
+                        dim_in_node,  # batch.x.shape[1] + batch.x_hyperedges.shape[1] (forward concatenates along dim=1)
+                        dim_hidden,
+                        dim_out,
+                        ):
+                super().__init__()
+                self.dim_hidden = dim_hidden
+                self.linear_node_0 = torch.nn.Linear(dim_in_node, dim_hidden)
+                self.linear_hyperedge_0 = torch.nn.Linear(dim_hidden, dim_out)
+
+            def forward(self, batch):
+                """Forward pass.
+
+                Parameters
+                ----------
+                batch : torch_geometric.data.Data
+                    Input batch of data.
+
+                Returns
+                -------
+                dict
+                    Output dictionary containing node representation and hyperedge logits.
+                """
+                x_node = torch.concat((batch.x, torch.sparse.mm(batch.incidence_hyperedges, batch.x_hyperedges)), dim=1)
+                h_node = self.linear_node_0(x_node)
+                h_node = torch.relu(h_node)
+                h_hyperedge = torch.mm(batch.incidence_hyperedges.T, h_node)
+                h_hyperedge = self.linear_hyperedge_0(h_hyperedge)
+                model_out =  {'h_node': h_node,
+                            'h_hyperedge': h_hyperedge,
+                            "labels": batch.y_hyperedges}
+                model_out["logits"] = model_out["h_hyperedge"]
+                return model_out
+
+        # dataset
+        dataset_loader = instantiate(config_loader)
+        dataset, dataset_dir = dataset_loader.load()
+        preprocessor = PreProcessor(dataset, dataset_dir)
+        dataset_train, dataset_val, dataset_test = preprocessor.load_dataset_splits(config_dataset.split_params)
+        datamodule = TBDataloader(
+                    dataset_train=dataset_train,
+                    dataset_val=dataset_val,
+                    dataset_test=dataset_test,
+                    **config_dataset.get("dataloader_params", {}),
                 )
-                run(cfg)
\ No newline at end of file
+        dataloader_train = datamodule.train_dataloader()
+
+        # model
+        backbone = ModelPipeLine(dim_in_node=config_dataset.parameters.num_node_features+config_dataset.parameters.num_edge_features,
+                                dim_hidden=10,
+                                dim_out=config_dataset.parameters.num_classes)
+        loss = TBLoss(config_loss["dataset_loss"])
+        optimizer = TBOptimizer(**config_optimizer)
+        readout = identical.NoReadOut(**config_readout)
+        evaluator = TBEvaluator(**config_evaluator)
+        optimizer = TBOptimizer(**config_optimizer)
+        model = TBModel(backbone=backbone,
+                        readout=readout,
+                        loss=loss,
+                        optimizer=optimizer,
+                        evaluator=evaluator,
+                        compile=False)
+
+        # train
+        trainer = pl.Trainer(max_epochs=500, accelerator="cpu", enable_progress_bar=False, log_every_n_steps=1)
+        trainer.fit(model, datamodule)
+        trainer.test(model, datamodule)
+        test_metrics = trainer.callback_metrics
+        print('      Testing metrics\n', '-'*25)
+        for key in test_metrics:
+            print('{:<20s} {:>5.4f}'.format(key+':', test_metrics[key].item()))
diff --git a/topobench/data/datasets/chordonomicon.py b/topobench/data/datasets/chordonomicon.py
index 42e41e1e..4e019a56 100644
--- a/topobench/data/datasets/chordonomicon.py
+++ b/topobench/data/datasets/chordonomicon.py
@@ -74,8 +74,12 @@ def process(self):
         incidence_hyperedges = torch.sparse_coo_tensor(
             indices, torch.ones(indices.shape[1])
         ).coalesce()
-        x_hyperedges = torch.tensor(df["frequency"].values).unsqueeze(1)
-        y_hyperedges = torch.tensor(df["local_o_info"].values)
+        x_hyperedges = torch.tensor(
+            df["frequency"].values, dtype=torch.float32
+        ).unsqueeze(1)
+        y_hyperedges = torch.tensor(
+            df["local_o_info"].values, dtype=torch.float32
+        )
         data = Data(
             incidence_hyperedges=incidence_hyperedges,
             num_hyperedges=incidence_hyperedges.size(1),

From ba85db346404a38b5e04cbe54edb7c6c27bcb5fc Mon Sep 17 00:00:00 2001
From: pierrick <pierrick.lry@gmail.com>
Date: Mon, 24 Nov 2025 01:01:21 +0100
Subject: [PATCH 10/14] reduce number of epochs to speed up pipeline test

---
 test/pipeline/test_pipeline.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/pipeline/test_pipeline.py b/test/pipeline/test_pipeline.py
index 00c964d0..bd0a9a8a 100644
--- a/test/pipeline/test_pipeline.py
+++ b/test/pipeline/test_pipeline.py
@@ -137,7 +137,7 @@ def forward(self, batch):
                         compile=False)
 
         # train
-        trainer = pl.Trainer(max_epochs=500, accelerator="cpu", enable_progress_bar=False, log_every_n_steps=1)
+        trainer = pl.Trainer(max_epochs=3, accelerator="cpu", enable_progress_bar=False, log_every_n_steps=1)
         trainer.fit(model, datamodule)
         trainer.test(model, datamodule)
         test_metrics = trainer.callback_metrics

From 062109676c41a392dd13f1684260734a70af0d63 Mon Sep 17 00:00:00 2001
From: pierrick <pierrick.lry@gmail.com>
Date: Mon, 24 Nov 2025 16:01:16 +0100
Subject: [PATCH 11/14] delete processing notebook

---
 TDL-chords-data-cleaning.ipynb | 2225 --------------------------------
 1 file changed, 2225 deletions(-)
 delete mode 100644 TDL-chords-data-cleaning.ipynb

diff --git a/TDL-chords-data-cleaning.ipynb b/TDL-chords-data-cleaning.ipynb
deleted file mode 100644
index 5a0060ed..00000000
--- a/TDL-chords-data-cleaning.ipynb
+++ /dev/null
@@ -1,2225 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 36,
-   "id": "cc404196-bd66-43b2-8f3b-9602f6ccf58f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import numpy as np\n",
-    "import pandas as pd\n",
-    "import re\n",
-    "import pickle\n",
-    "from tqdm import tqdm\n",
-    "from collections import defaultdict"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "aeaa2ed4-6d9e-4cd7-90f3-402cd3e92440",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "## Uun this code when using it for the first time for loading the dataset\n",
-    "# pip install huggingface_hub"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "id": "94d6aced-7399-4917-bd60-a3997d5831af",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "C:\\Users\\liu.xuanc\\AppData\\Local\\Temp\\ipykernel_28784\\2761146993.py:1: DtypeWarning: Columns (2,3,5,6,7,8,9) have mixed types. Specify dtype option on import or set low_memory=False.\n",
-      "  df = pd.read_csv(\"hf://datasets/ailsntua/Chordonomicon/chordonomicon_v2.csv\")\n"
-     ]
-    }
-   ],
-   "source": [
-    "df = pd.read_csv(\"hf://datasets/ailsntua/Chordonomicon/chordonomicon_v2.csv\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "2ea0931b-cc27-4715-9773-ec765838c3e6",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "markdown",
-   "id": "72ba2952-93f9-4961-a67f-159a27cdd9e2",
-   "metadata": {},
-   "source": [
-    "First step analysis of chords"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "id": "8594f632-38dd-48b8-905d-89aa12613d9c",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>id</th>\n",
-       "      <th>chords</th>\n",
-       "      <th>release_date</th>\n",
-       "      <th>genres</th>\n",
-       "      <th>decade</th>\n",
-       "      <th>rock_genre</th>\n",
-       "      <th>artist_id</th>\n",
-       "      <th>main_genre</th>\n",
-       "      <th>spotify_song_id</th>\n",
-       "      <th>spotify_artist_id</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>1</td>\n",
-       "      <td>&lt;intro_1&gt; C &lt;verse_1&gt; F C E7 Amin C F C G7 C F...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>'classic country pop'</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>artist_1</td>\n",
-       "      <td>pop</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>4AIEGdwDzPELXYgM5JaEY5</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>2</td>\n",
-       "      <td>&lt;intro_1&gt; E D A/Cs E D A/Cs &lt;verse_1&gt; E D A/Cs...</td>\n",
-       "      <td>2003-01-01</td>\n",
-       "      <td>'alternative metal' 'alternative rock' 'nu met...</td>\n",
-       "      <td>2000.0</td>\n",
-       "      <td>pop rock</td>\n",
-       "      <td>artist_2</td>\n",
-       "      <td>metal</td>\n",
-       "      <td>2ffJZ2r8HxI5DHcmf3BO6c</td>\n",
-       "      <td>694QW15WkebjcrWgQHzRYF</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>3</td>\n",
-       "      <td>&lt;intro_1&gt; Csmin &lt;verse_1&gt; A Csmin A Csmin A Cs...</td>\n",
-       "      <td>2003-01-01</td>\n",
-       "      <td>'alternative metal' 'canadian rock' 'funk meta...</td>\n",
-       "      <td>2000.0</td>\n",
-       "      <td>canadian rock</td>\n",
-       "      <td>artist_3</td>\n",
-       "      <td>metal</td>\n",
-       "      <td>5KiY8SZEnvCPyIEkFGRR3y</td>\n",
-       "      <td>0niJkG4tKkne3zwr7I8n9n</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>4</td>\n",
-       "      <td>&lt;intro_1&gt; D Dmaj7 D Dmaj7 &lt;verse_1&gt; Emin A D G...</td>\n",
-       "      <td>2022-09-23</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>2020.0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>artist_4</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>01TtAcUqyLCRBZq4ZZiQWS</td>\n",
-       "      <td>17BfKBemmMGO5ZAK25wraW</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>5</td>\n",
-       "      <td>&lt;intro_1&gt; C &lt;verse_1&gt; G C G C &lt;chorus_1&gt; F Dmi...</td>\n",
-       "      <td>2023-02-10</td>\n",
-       "      <td>'modern country pop'</td>\n",
-       "      <td>2020.0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>artist_5</td>\n",
-       "      <td>pop</td>\n",
-       "      <td>3zUecdrWC3IqrNSjhnoF3G</td>\n",
-       "      <td>4GGfAshSkqoxpZdoaHm7ky</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "   id                                             chords release_date  \\\n",
-       "0   1  <intro_1> C <verse_1> F C E7 Amin C F C G7 C F...          NaN   \n",
-       "1   2  <intro_1> E D A/Cs E D A/Cs <verse_1> E D A/Cs...   2003-01-01   \n",
-       "2   3  <intro_1> Csmin <verse_1> A Csmin A Csmin A Cs...   2003-01-01   \n",
-       "3   4  <intro_1> D Dmaj7 D Dmaj7 <verse_1> Emin A D G...   2022-09-23   \n",
-       "4   5  <intro_1> C <verse_1> G C G C <chorus_1> F Dmi...   2023-02-10   \n",
-       "\n",
-       "                                              genres  decade     rock_genre  \\\n",
-       "0                              'classic country pop'     NaN            NaN   \n",
-       "1  'alternative metal' 'alternative rock' 'nu met...  2000.0       pop rock   \n",
-       "2  'alternative metal' 'canadian rock' 'funk meta...  2000.0  canadian rock   \n",
-       "3                                                NaN  2020.0            NaN   \n",
-       "4                               'modern country pop'  2020.0            NaN   \n",
-       "\n",
-       "  artist_id main_genre         spotify_song_id       spotify_artist_id  \n",
-       "0  artist_1        pop                     NaN  4AIEGdwDzPELXYgM5JaEY5  \n",
-       "1  artist_2      metal  2ffJZ2r8HxI5DHcmf3BO6c  694QW15WkebjcrWgQHzRYF  \n",
-       "2  artist_3      metal  5KiY8SZEnvCPyIEkFGRR3y  0niJkG4tKkne3zwr7I8n9n  \n",
-       "3  artist_4        NaN  01TtAcUqyLCRBZq4ZZiQWS  17BfKBemmMGO5ZAK25wraW  \n",
-       "4  artist_5        pop  3zUecdrWC3IqrNSjhnoF3G  4GGfAshSkqoxpZdoaHm7ky  "
-      ]
-     },
-     "execution_count": 5,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df.head()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "id": "48ff08d8-f8d2-472e-94c2-4f2b0fe4b2c9",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "0         <intro_1> C <verse_1> F C E7 Amin C F C G7 C F...\n",
-       "1         <intro_1> E D A/Cs E D A/Cs <verse_1> E D A/Cs...\n",
-       "2         <intro_1> Csmin <verse_1> A Csmin A Csmin A Cs...\n",
-       "3         <intro_1> D Dmaj7 D Dmaj7 <verse_1> Emin A D G...\n",
-       "4         <intro_1> C <verse_1> G C G C <chorus_1> F Dmi...\n",
-       "                                ...                        \n",
-       "679802    D G D A G D A D G D A G D A D G D A G D A D G ...\n",
-       "679803    G Gadd13 G7 Gadd13 G Emin A7 Emin A7 Emin A7 E...\n",
-       "679804    E Fs E Fs E Fs E Fs E Fs E Fs B Cs Fs B Cs Fs ...\n",
-       "679805    E Csmin Fsmin B E Csmin Fsmin B E Csmin Fsmin ...\n",
-       "679806    A B7 E7 A Fs7 A E7 A D A D B7 A B7 E7 A Fs7 A ...\n",
-       "Name: chords, Length: 679807, dtype: object"
-      ]
-     },
-     "execution_count": 4,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df['chords']"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "id": "13de4c2d-7941-4f47-bb9a-4c5e2774eeb2",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "NOTES = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']\n",
-    "\n",
-    "# A simple homophone conversion dictionary (for standardized input)\n",
-    "FLAT_TO_SHARP = {\n",
-    "    'Db': 'C#', 'Eb': 'D#', 'Gb': 'F#', 'Ab': 'G#', 'Bb': 'A#',\n",
-    "    'db': 'C#', 'eb': 'D#', 'gb': 'F#', 'ab': 'G#', 'bb': 'A#'\n",
-    "}"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "id": "4dbf80f4-1144-4c47-bfdc-351b05aebdda",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def analyze_pandas_series(series_data, top_n=50):\n",
-    "    print(\"Removing labels...\")\n",
-    "    clean_series = series_data.str.replace(r'<[^>]+>', ' ', regex=True)\n",
-    "\n",
-    "    print(\"Expliciting and counting...\")\n",
-    "    # 1. .str.split(): to list ['C', 'F', 'G']\n",
-    "    # 2. .explode(): turns lists to one list\n",
-    "    # 3. .value_counts(): count\n",
-    "    chord_counts = clean_series.str.split().explode().value_counts()\n",
-    "    \n",
-    "    print(f\"=== Finish ===\")\n",
-    "    print(f\"Chords amount: {chord_counts.sum()}\")\n",
-    "    print(f\"Chords type amount: {len(chord_counts)}\")\n",
-    "    print(\"-\" * 30)\n",
-    "    \n",
-    "    print(f\"Top {top_n} commonly used chords:\")\n",
-    "    print(chord_counts.head(top_n))\n",
-    "\n",
-    "    print(\"-\" * 30)\n",
-    "    print(\"Examples of long tail data (with fewer occurrences):\")\n",
-    "    print(chord_counts.tail(20).index.tolist())\n",
-    "    \n",
-    "    return chord_counts"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "id": "632ac7d5-4936-483b-be8b-d327dcc133ad",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Dealing with slash chords...\n",
-      "Finish!\n"
-     ]
-    }
-   ],
-   "source": [
-    "def simplify_slash_chord(text):\n",
-    "    if not isinstance(text, str): return \"\"\n",
-    "    \n",
-    "    chords = text.split()\n",
-    "    simplified = [c.split('/')[0] for c in chords]\n",
-    "    return \" \".join(simplified)\n",
-    "\n",
-    "print(\"Dealing with slash chords...\")\n",
-    "df['chords_simplified'] = df['chords'].apply(simplify_slash_chord)\n",
-    "print(\"Finish!\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "id": "4c3336c4-e86e-4765-8c46-cb7dc3b733b0",
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Removing labels...\n",
-      "Expliciting and counting...\n",
-      "=== Finish ===\n",
-      "Chords amount: 51994634\n",
-      "Chords type amount: 749\n",
-      "------------------------------\n",
-      "Top 50 commonly used chords:\n",
-      "chords_simplified\n",
-      "G        7161058\n",
-      "C        5945092\n",
-      "D        5413274\n",
-      "A        3952888\n",
-      "F        3389952\n",
-      "Amin     2958979\n",
-      "E        2719857\n",
-      "Emin     2662799\n",
-      "Bmin     1446212\n",
-      "B        1384402\n",
-      "Dmin     1265370\n",
-      "Bb       1033272\n",
-      "Fsmin     887691\n",
-      "Fs        692455\n",
-      "Csmin     613688\n",
-      "Gmin      556347\n",
-      "Eb        508770\n",
-      "Cmin      409476\n",
-      "Ab        365022\n",
-      "Cs        328877\n",
-      "Gsmin     324517\n",
-      "A7        302780\n",
-      "Fmin      294103\n",
-      "D7        288826\n",
-      "Amin7     287833\n",
-      "Emin7     284727\n",
-      "E7        271749\n",
-      "Gs        267858\n",
-      "B7        253029\n",
-      "G7        248206\n",
-      "Cadd9     235057\n",
-      "Db        194678\n",
-      "As        166455\n",
-      "Bmin7     164123\n",
-      "Fmaj7     162679\n",
-      "Dmin7     158330\n",
-      "Cmaj7     155171\n",
-      "Ds        148442\n",
-      "C7        147180\n",
-      "Bbmin     141925\n",
-      "Dsmin     115335\n",
-      "Ano3d     105842\n",
-      "Dsus4     103025\n",
-      "Dno3d     102338\n",
-      "Gb         99587\n",
-      "Gno3d      98220\n",
-      "Eno3d      90866\n",
-      "Dsus2      89184\n",
-      "Dmaj7      88411\n",
-      "Asus4      88285\n",
-      "Name: count, dtype: int64\n",
-      "------------------------------\n",
-      "Examples of long tail data (with fewer occurrences):\n",
-      "['Cs13b9', 'Edimb7', 'E11s', 'Dbmaj11', 'Bminmaj9', 'Gs13b9', 'sC', 'Fsminmaj13', 'Fsaugmaj11', 'Bbdim13b9', 'Eminmaj13', 'A11s', 'Abdim9', 'Eb11b9', 'Fsdim11', 'Bbdimadd13', 'Gdim11b9', 'Dsdim9', 'Bdim11b9', 'Fminmaj13']\n"
-     ]
-    }
-   ],
-   "source": [
-    "# Analyze simplified chords\n",
-    "counts = analyze_pandas_series(df['chords_simplified'])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "id": "168d4085-71a8-43b0-8492-0d12ac4b32e9",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Top 50 coverage ratie: 0.9458\n",
-      "Number of long tail chords type: 699\n",
-      "\n",
-      " Long tail data example (Rank 51-70):\n",
-      "chords_simplified\n",
-      "Fsmin7    83258\n",
-      "Asus2     80833\n",
-      "Gmin7     79634\n",
-      "Gmaj7     79039\n",
-      "Ebmin     78919\n",
-      "Csmin7    72221\n",
-      "Cno3d     72148\n",
-      "Bno3d     71021\n",
-      "F7        66986\n",
-      "Asmin     64547\n",
-      "Amaj7     60432\n",
-      "Gadd13    59584\n",
-      "Fs7       59092\n",
-      "Cmin7     58856\n",
-      "Gsus4     46604\n",
-      "Esus4     46589\n",
-      "Fno3d     44630\n",
-      "Abmin     39087\n",
-      "Fmin7     38177\n",
-      "Fsno3d    36772\n",
-      "Name: count, dtype: int64\n",
-      "\n",
-      " Rare chords example:\n",
-      "chords_simplified\n",
-      "Cs13b9        1\n",
-      "Edimb7        1\n",
-      "E11s          1\n",
-      "Dbmaj11       1\n",
-      "Bminmaj9      1\n",
-      "Gs13b9        1\n",
-      "sC            1\n",
-      "Fsminmaj13    1\n",
-      "Fsaugmaj11    1\n",
-      "Bbdim13b9     1\n",
-      "Eminmaj13     1\n",
-      "A11s          1\n",
-      "Abdim9        1\n",
-      "Eb11b9        1\n",
-      "Fsdim11       1\n",
-      "Bbdimadd13    1\n",
-      "Gdim11b9      1\n",
-      "Dsdim9        1\n",
-      "Bdim11b9      1\n",
-      "Fminmaj13     1\n",
-      "Name: count, dtype: int64\n"
-     ]
-    }
-   ],
-   "source": [
-    "## Check the long tail data\n",
-    "total_count = counts.sum()\n",
-    "top_50_count = counts.head(50).sum()\n",
-    "\n",
-    "top_50_chords = set(counts.head(50).index)\n",
-    "\n",
-    "# 5% long tail data\n",
-    "tail_chords = [c for c in counts.index if c not in top_50_chords]\n",
-    "\n",
-    "print(f\"Top 50 coverage ratie: {top_50_count / total_count:.4f}\")\n",
-    "print(f\"Number of long tail chords type: {len(tail_chords)}\")\n",
-    "\n",
-    "print(\"\\n Long tail data example (Rank 51-70):\")\n",
-    "print(counts.iloc[50:70])\n",
-    "\n",
-    "print(\"\\n Rare chords example:\")\n",
-    "print(counts.tail(20))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "862eff6d-8e41-4574-b6ff-7e49590403e1",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "markdown",
-   "id": "a6c82e59-af50-47c0-8e69-1b84fae2728b",
-   "metadata": {},
-   "source": [
-    "Clean chords data:\n",
-    "1. Separate one song into several segments;\n",
-    "2. Infer the key of each segment;\n",
-    "3. Transpose chords into Roman numerals."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "id": "a674a0eb-9cc9-46fd-aa74-6d9ce1575e84",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# ==========================================\n",
-    "# Global configuration and auxiliary functions\n",
-    "# ==========================================\n",
-    "\n",
-    "def parse_chord_root_quality(chord_str):\n",
-    "    \"\"\"\n",
-    "    Auxiliary function: parses a single chord string and returns (Root, Quality)\n",
-    "    For example: \"Am\" ->(\"A\", \"min\"), \"G7\" ->(\"G\", \"7\")\n",
-    "    \"\"\"\n",
-    "    chord_str = chord_str.strip()\n",
-    "    if not chord_str: return None, None\n",
-    "    \n",
-    "    for flat, sharp in FLAT_TO_SHARP.items():\n",
-    "        if chord_str.startswith(flat):\n",
-    "            chord_str = sharp + chord_str[len(flat):]\n",
-    "            break\n",
-    "    if len(chord_str) > 1 and chord_str[1] == 's': # Fs7\n",
-    "         chord_str = chord_str[0] + '#' + chord_str[2:]\n",
-    "         \n",
-    "    match = re.match(r'([A-G]#?)(.*)', chord_str)\n",
-    "    if not match: return None, None\n",
-    "    \n",
-    "    root = match.group(1)\n",
-    "    rest = match.group(2).lower()\n",
-    "    \n",
-    "    # Chord quality\n",
-    "    if 'dim' in rest: quality = 'dim'\n",
-    "    elif 'min' in rest or 'm' == rest: quality = 'min'\n",
-    "    elif '7' in rest and 'maj' not in rest and 'min' not in rest: quality = '7'\n",
-    "    else: quality = 'maj' # Default to major triad/major seventh chord\n",
-    "    \n",
-    "    return root, quality"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "id": "4cc94eec-4f9d-43dd-ac98-5a852ad494c7",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# ==========================================\n",
-    "# Segmentation\n",
-    "# ==========================================\n",
-    "\n",
-    "def segment_music_string(raw_string):\n",
-    "    \"\"\"\n",
-    "    Input: '<intro_1> C D <verse_1> F G'\n",
-    "    Output: [{'label': 'intro_1', 'chords': ['C', 'D']}, {'label': 'verse_1', 'chords': ['F', 'G']}]\n",
-    "    \"\"\"\n",
-    "    parts = re.split(r'(<[^>]+>)', raw_string)\n",
-    "    \n",
-    "    segments = []\n",
-    "    current_label = \"Unknown\"\n",
-    "    \n",
-    "    for part in parts:\n",
-    "        part = part.strip()\n",
-    "        if not part: continue\n",
-    "        \n",
-    "        # Label (<xxx>)\n",
-    "        if part.startswith('<') and part.endswith('>'):\n",
-    "            current_label = part.strip('<>')\n",
-    "        else:\n",
-    "            # Content (chords)\n",
-    "            chord_list = part.split()\n",
-    "            if chord_list:\n",
-    "                segments.append({\n",
-    "                    'label': current_label,\n",
-    "                    'chords': chord_list\n",
-    "                })\n",
-    "                \n",
-    "    return segments"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "id": "f94b2dca-8469-4ec3-bcf3-b4a7fd0886ce",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# ==========================================\n",
-    "# Key Inference\n",
-    "# ==========================================\n",
-    "\n",
-    "KEY_TEMPLATES = {}\n",
-    "def _build_templates():\n",
-    "    for i in range(12):\n",
-    "        root = NOTES[i]\n",
-    "        # Major key template\n",
-    "        s_maj = [NOTES[(i + n) % 12] for n in [0, 2, 4, 5, 7, 9, 11]]\n",
-    "        maj_chords = {s_maj[0]:['maj'], s_maj[1]:['min'], s_maj[2]:['min'], s_maj[3]:['maj'], s_maj[4]:['maj','7'], s_maj[5]:['min'], s_maj[6]:['dim']}\n",
-    "        KEY_TEMPLATES[f\"{root} Major\"] = (maj_chords, s_maj[0], s_maj[4]) # (template, tonic, dominant)\n",
-    "        \n",
-    "        # Minor key template (natural + harmonic minor)\n",
-    "        s_min = [NOTES[(i + n) % 12] for n in [0, 2, 3, 5, 7, 8, 10]]\n",
-    "        dom_root = NOTES[(i + 7) % 12]\n",
-    "        min_chords = {s_min[0]:['min'], s_min[1]:['dim'], s_min[2]:['maj'], s_min[3]:['min'], s_min[4]:['min'], dom_root:['maj','7'], s_min[5]:['maj'], s_min[6]:['maj','7']}\n",
-    "        KEY_TEMPLATES[f\"{root} Minor\"] = (min_chords, s_min[0], dom_root)\n",
-    "_build_templates()\n",
-    "\n",
-    "def infer_key_from_list(chord_list):\n",
-    "    \"\"\"\n",
-    "    input: ['C', 'F', 'G7', 'C']\n",
-    "    output: 'C Major'\n",
-    "    \"\"\"\n",
-    "    if not chord_list: return \"Unknown\"\n",
-    "    \n",
-    "    parsed_data = []\n",
-    "    for idx, c_str in enumerate(chord_list):\n",
-    "        r, q = parse_chord_root_quality(c_str)\n",
-    "        if r:\n",
-    "            next_r = None\n",
-    "            if idx + 1 < len(chord_list):\n",
-    "                next_r, _ = parse_chord_root_quality(chord_list[idx+1])\n",
-    "            parsed_data.append((r, q, next_r))\n",
-    "            \n",
-    "    scores = defaultdict(int)\n",
-    "    \n",
-    "    # Calculate the score for each candidate key\n",
-    "    for key_name, (template, tonic, dom) in KEY_TEMPLATES.items():\n",
-    "        score = 0\n",
-    "        for root, quality, next_root in parsed_data:\n",
-    "            # Basic match\n",
-    "            if root in template:\n",
-    "                score += 1\n",
-    "                if quality in template[root]:\n",
-    "                    score += 2\n",
-    "            else:\n",
-    "                score -= 1\n",
-    "            \n",
-    "            if root == dom and quality == '7' and next_root == tonic:\n",
-    "                score += 5\n",
-    "                \n",
-    "            if root == tonic:\n",
-    "                score += 1\n",
-    "                \n",
-    "        scores[key_name] = score\n",
-    "        \n",
-    "    if not scores: return \"Unknown\"\n",
-    "    return max(scores, key=scores.get)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "id": "d16ad9ab-ebe1-4cfc-86f5-0c38ece01323",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# ==========================================\n",
-    "# Transpose the chords to Roman numerals\n",
-    "# ==========================================\n",
-    "\n",
-    "ROMAN_MAP = {\n",
-    "    0: 'I', 1: 'bII', 2: 'II', 3: 'bIII', 4: 'III', 5: 'IV', \n",
-    "    6: 'bV', 7: 'V', 8: 'bVI', 9: 'VI', 10: 'bVII', 11: 'VII'\n",
-    "}\n",
-    "\n",
-    "def convert_to_roman(chord_list, key_str):\n",
-    "    \"\"\"\n",
-    "    input: chord_list=['C', 'F', 'G'], key_str='C Major'\n",
-    "    output: ['I', 'IV', 'V']\n",
-    "    \"\"\"\n",
-    "    if key_str == \"Unknown\":\n",
-    "        return chord_list\n",
-    "    \n",
-    "    # Tonic\n",
-    "    key_root_str = key_str.split()[0] # \"C Major\" -> \"C\"\n",
-    "    is_minor_key = \"Minor\" in key_str\n",
-    "    \n",
-    "    if key_root_str not in NOTES: return chord_list\n",
-    "    key_root_idx = NOTES.index(key_root_str)\n",
-    "    \n",
-    "    roman_output = []\n",
-    "    \n",
-    "    for chord_str in chord_list:\n",
-    "        root, quality = parse_chord_root_quality(chord_str)\n",
-    "        if not root:\n",
-    "            roman_output.append(\"?\")\n",
-    "            continue\n",
-    "            \n",
-    "        # Calculate interval  (Root note of chord - tonic) % 12\n",
-    "        root_idx = NOTES.index(root)\n",
-    "        interval = (root_idx - key_root_idx) % 12\n",
-    "        \n",
-    "        base_roman = ROMAN_MAP.get(interval, \"?\")\n",
-    "        \n",
-    "        # min or dim -> lowercase\n",
-    "        if quality == 'min' or quality == 'dim':\n",
-    "            final_roman = base_roman.lower()\n",
-    "        else:\n",
-    "            final_roman = base_roman\n",
-    "            \n",
-    "            \n",
-    "        # suffix\n",
-    "        if quality == '7':\n",
-    "            final_roman += '7'\n",
-    "        elif quality == 'dim':\n",
-    "            final_roman += '°'\n",
-    "            \n",
-    "        roman_output.append(final_roman)\n",
-    "        \n",
-    "    return roman_output"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 15,
-   "id": "681f297a-1520-4f5f-bf33-7a9af7b5ecf8",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# ==========================================\n",
-    "# Fix short chord progressions\n",
-    "# ==========================================\n",
-    "\n",
-    "def patch_short_segments(analyzed_segments, min_chords=2):\n",
-    "    \"\"\"\n",
-    "    Parameters:\n",
-    "    analyzed_segments: list, {'label':..., 'chords':..., 'key':...}\n",
-    "    min_chords: threshold; segments with fewer chords than this borrow a neighbouring segment's key (default is 2)\n",
-    "    \"\"\"\n",
-    "    \n",
-    "    # Scan twice\n",
-    "    \n",
-    "    # Forward Pass\n",
-    "    for i in range(1, len(analyzed_segments)):\n",
-    "        current_seg = analyzed_segments[i]\n",
-    "        prev_seg = analyzed_segments[i-1]\n",
-    "        \n",
-    "        if len(current_seg['chords']) < min_chords:\n",
-    "            if prev_seg['key'] != \"Unknown\":\n",
-    "                current_seg['key'] = prev_seg['key']\n",
-    "                current_seg['key_source'] = 'borrowed_prev' \n",
-    "\n",
-    "    # Backward Pass\n",
-    "    for i in range(len(analyzed_segments) - 2, -1, -1):\n",
-    "        current_seg = analyzed_segments[i]\n",
-    "        next_seg = analyzed_segments[i+1]\n",
-    "        \n",
-    "        if len(current_seg['chords']) < min_chords:\n",
-    "            # (Usually the Intro should follow the Verse's key)\n",
-    "            if next_seg['key'] != \"Unknown\":\n",
-    "                 current_seg['key'] = next_seg['key']\n",
-    "                 current_seg['key_source'] = 'borrowed_next'\n",
-    "                 \n",
-    "    return analyzed_segments"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 16,
-   "id": "96efd397-4835-41db-9692-73a86d371f2d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# ==========================================\n",
-    "# Combine Roman numerals\n",
-    "# ==========================================\n",
-    "\n",
-    "def extract_and_format_roman(analyzed_records, include_tags=True):\n",
-    "    \"\"\"\n",
-    "    Parameters:\n",
-    "    analyzed_records: chords_output in the former step\n",
-    "    include_tags: \n",
-    "                  - True: Keep \"<intro> I IV <verse> I V\" (Structure + Sequence)\n",
-    "                  - False: Only \"I IV I V\" (Sequence)\n",
-    "    \n",
-    "    Return:\n",
-    "    str: Roman numerals string\n",
-    "    \"\"\"\n",
-    "    output_parts = []\n",
-    "    \n",
-    "    for record in analyzed_records:\n",
-    "        roman_seq = \" \".join(record['roman'])\n",
-    "        \n",
-    "        if include_tags:\n",
-    "            # \"<label> roman_seq\"\n",
-    "            segment_str = f\"<{record['section']}> {roman_seq}\"\n",
-    "            output_parts.append(segment_str)\n",
-    "        else:\n",
-    "            # Roman numerals\n",
-    "            output_parts.append(roman_seq)\n",
-    "            \n",
-    "    return \" \".join(output_parts)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b4eac0b6-0a72-4a77-81bc-8375ad786819",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 17,
-   "id": "fffce437-c1ca-48a4-853b-b570aadf3de5",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'<intro_1> C <verse_1> F C E7 Amin C F C G7 C F C E7 Amin C F G7 C <verse_2> F C E7 Amin C F C G7 C F C E7 Amin C F G7 C <chorus_1> F C F C G C F C E7 Amin C F G7 C <solo_1> D <chorus_2> G D G D A D G D Fs7 Bmin D G A7 D G A7 D'"
-      ]
-     },
-     "execution_count": 17,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df['chords'][0]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 18,
-   "id": "9e18b5cd-41d7-4c77-a467-8ebcbbc4de6c",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[{'label': 'intro_1', 'chords': ['C']},\n",
-       " {'label': 'verse_1',\n",
-       "  'chords': ['F',\n",
-       "   'C',\n",
-       "   'E7',\n",
-       "   'Amin',\n",
-       "   'C',\n",
-       "   'F',\n",
-       "   'C',\n",
-       "   'G7',\n",
-       "   'C',\n",
-       "   'F',\n",
-       "   'C',\n",
-       "   'E7',\n",
-       "   'Amin',\n",
-       "   'C',\n",
-       "   'F',\n",
-       "   'G7',\n",
-       "   'C']},\n",
-       " {'label': 'verse_2',\n",
-       "  'chords': ['F',\n",
-       "   'C',\n",
-       "   'E7',\n",
-       "   'Amin',\n",
-       "   'C',\n",
-       "   'F',\n",
-       "   'C',\n",
-       "   'G7',\n",
-       "   'C',\n",
-       "   'F',\n",
-       "   'C',\n",
-       "   'E7',\n",
-       "   'Amin',\n",
-       "   'C',\n",
-       "   'F',\n",
-       "   'G7',\n",
-       "   'C']},\n",
-       " {'label': 'chorus_1',\n",
-       "  'chords': ['F',\n",
-       "   'C',\n",
-       "   'F',\n",
-       "   'C',\n",
-       "   'G',\n",
-       "   'C',\n",
-       "   'F',\n",
-       "   'C',\n",
-       "   'E7',\n",
-       "   'Amin',\n",
-       "   'C',\n",
-       "   'F',\n",
-       "   'G7',\n",
-       "   'C']},\n",
-       " {'label': 'solo_1', 'chords': ['D']},\n",
-       " {'label': 'chorus_2',\n",
-       "  'chords': ['G',\n",
-       "   'D',\n",
-       "   'G',\n",
-       "   'D',\n",
-       "   'A',\n",
-       "   'D',\n",
-       "   'G',\n",
-       "   'D',\n",
-       "   'Fs7',\n",
-       "   'Bmin',\n",
-       "   'D',\n",
-       "   'G',\n",
-       "   'A7',\n",
-       "   'D',\n",
-       "   'G',\n",
-       "   'A7',\n",
-       "   'D']}]"
-      ]
-     },
-     "execution_count": 18,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Segmentation\n",
-    "segments = segment_music_string(df['chords'][0])\n",
-    "segments"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 19,
-   "id": "96fa6a65-c63a-4a40-9d4c-b27d1e4f96ed",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[{'section': 'intro_1', 'chords': ['C'], 'key': 'C Major'},\n",
-       " {'section': 'verse_1',\n",
-       "  'chords': ['F',\n",
-       "   'C',\n",
-       "   'E7',\n",
-       "   'Amin',\n",
-       "   'C',\n",
-       "   'F',\n",
-       "   'C',\n",
-       "   'G7',\n",
-       "   'C',\n",
-       "   'F',\n",
-       "   'C',\n",
-       "   'E7',\n",
-       "   'Amin',\n",
-       "   'C',\n",
-       "   'F',\n",
-       "   'G7',\n",
-       "   'C'],\n",
-       "  'key': 'C Major'},\n",
-       " {'section': 'verse_2',\n",
-       "  'chords': ['F',\n",
-       "   'C',\n",
-       "   'E7',\n",
-       "   'Amin',\n",
-       "   'C',\n",
-       "   'F',\n",
-       "   'C',\n",
-       "   'G7',\n",
-       "   'C',\n",
-       "   'F',\n",
-       "   'C',\n",
-       "   'E7',\n",
-       "   'Amin',\n",
-       "   'C',\n",
-       "   'F',\n",
-       "   'G7',\n",
-       "   'C'],\n",
-       "  'key': 'C Major'},\n",
-       " {'section': 'chorus_1',\n",
-       "  'chords': ['F',\n",
-       "   'C',\n",
-       "   'F',\n",
-       "   'C',\n",
-       "   'G',\n",
-       "   'C',\n",
-       "   'F',\n",
-       "   'C',\n",
-       "   'E7',\n",
-       "   'Amin',\n",
-       "   'C',\n",
-       "   'F',\n",
-       "   'G7',\n",
-       "   'C'],\n",
-       "  'key': 'C Major'},\n",
-       " {'section': 'solo_1', 'chords': ['D'], 'key': 'D Major'},\n",
-       " {'section': 'chorus_2',\n",
-       "  'chords': ['G',\n",
-       "   'D',\n",
-       "   'G',\n",
-       "   'D',\n",
-       "   'A',\n",
-       "   'D',\n",
-       "   'G',\n",
-       "   'D',\n",
-       "   'Fs7',\n",
-       "   'Bmin',\n",
-       "   'D',\n",
-       "   'G',\n",
-       "   'A7',\n",
-       "   'D',\n",
-       "   'G',\n",
-       "   'A7',\n",
-       "   'D'],\n",
-       "  'key': 'D Major'}]"
-      ]
-     },
-     "execution_count": 19,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Infer the key\n",
-    "analyzed_records = []\n",
-    "for seg in segments:\n",
-    "    inferred_key = infer_key_from_list(seg['chords'])\n",
-    "    analyzed_records.append({\n",
-    "        'section': seg['label'],\n",
-    "        'chords': seg['chords'],\n",
-    "        'key': inferred_key\n",
-    "    })\n",
-    "\n",
-    "analyzed_records"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 20,
-   "id": "ab0473b1-cbc5-4160-88f5-ac02d2de0ed5",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Patch short segments\n",
-    "analyzed_records = patch_short_segments(analyzed_records, min_chords=2)\n",
-    "\n",
-    "# Transpose to Roman numerals\n",
-    "for record in analyzed_records:\n",
-    "    record['roman'] = convert_to_roman(record['chords'], record['key'])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 21,
-   "id": "e008b3e3-5599-405c-b13b-1e7618bd6464",
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'section': 'chorus_2',\n",
-       " 'chords': ['G',\n",
-       "  'D',\n",
-       "  'G',\n",
-       "  'D',\n",
-       "  'A',\n",
-       "  'D',\n",
-       "  'G',\n",
-       "  'D',\n",
-       "  'Fs7',\n",
-       "  'Bmin',\n",
-       "  'D',\n",
-       "  'G',\n",
-       "  'A7',\n",
-       "  'D',\n",
-       "  'G',\n",
-       "  'A7',\n",
-       "  'D'],\n",
-       " 'key': 'D Major',\n",
-       " 'roman': ['IV',\n",
-       "  'I',\n",
-       "  'IV',\n",
-       "  'I',\n",
-       "  'V',\n",
-       "  'I',\n",
-       "  'IV',\n",
-       "  'I',\n",
-       "  'III7',\n",
-       "  'vi',\n",
-       "  'I',\n",
-       "  'IV',\n",
-       "  'V7',\n",
-       "  'I',\n",
-       "  'IV',\n",
-       "  'V7',\n",
-       "  'I']}"
-      ]
-     },
-     "execution_count": 21,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "record"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 22,
-   "id": "b6deceeb-6964-46b0-ab27-45b6ca58192f",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'I IV I III7 vi I IV I V7 I IV I III7 vi I IV V7 I IV I III7 vi I IV I V7 I IV I III7 vi I IV V7 I IV I IV I V I IV I III7 vi I IV V7 I I IV I IV I V I IV I III7 vi I IV V7 I IV V7 I'"
-      ]
-     },
-     "execution_count": 22,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "roman_numerals_record = extract_and_format_roman(analyzed_records, include_tags=False)\n",
-    "roman_numerals_record"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "1f51fc7b-718d-4ce6-b35b-c93c8148dab4",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "markdown",
-   "id": "7feb8982-308d-467d-b737-b891517ab37c",
-   "metadata": {},
-   "source": [
-    "Coarse graining for Roman Numerals"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 24,
-   "id": "795661aa-e36a-40b3-8d4e-6bdd5e216566",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "class RomanNormalizer:\n",
-    "    def __init__(self, strategy='core'):\n",
-    "        \"\"\"\n",
-    "        strategy:\n",
-    "          - 'core': Keep only I, i, V7, dim\n",
-    "          - 'standard': Add sus, maj7, m7\n",
-    "          - 'dynamic': Keep all details\n",
-    "        \"\"\"\n",
-    "        self.strategy = strategy\n",
-    "\n",
-    "    def normalize(self, raw_roman):\n",
-    "        if not isinstance(raw_roman, str) or raw_roman in ['?', 'Unknown', '<ERROR>']:\n",
-    "            return None\n",
-    "\n",
-    "        # Disassemble (prefix core suffix)\n",
-    "        match = re.match(r'^([b#]*)([ivIV]+)(.*)$', raw_roman)\n",
-    "        if not match: return None\n",
-    "        \n",
-    "        prefix, core, suffix = match.groups()\n",
-    "        is_lower = core.islower()\n",
-    "\n",
-    "        # Reorganize strings based on strategy\n",
-    "        if self.strategy == 'dynamic':\n",
-    "            return raw_roman\n",
-    "            \n",
-    "        elif self.strategy == 'standard':\n",
-    "            new_suffix = \"\"\n",
-    "            if 'sus4' in suffix: new_suffix = 'sus4'\n",
-    "            elif 'sus2' in suffix: new_suffix = 'sus2'\n",
-    "            elif 'maj7' in suffix or 'M7' in suffix: new_suffix = 'maj7'\n",
-    "            elif 'ø' in suffix: new_suffix = 'm7b5'\n",
-    "            elif 'dim7' in suffix: new_suffix = 'dim7'\n",
-    "            elif 'dim' in suffix or '°' in suffix: new_suffix = 'dim'\n",
-    "            elif '7' in suffix: new_suffix = '7' # keep V7 or m7\n",
-    "            return f\"{prefix}{core}{new_suffix}\"\n",
-    "\n",
-    "        elif self.strategy == 'core':\n",
-    "            new_suffix = \"\"\n",
-    "            # Two situations keeping suffix:\n",
-    "            # 1. dominant 7th\n",
-    "            if not is_lower and '7' in suffix and 'maj' not in suffix:\n",
-    "                new_suffix = '7'\n",
-    "            # 2. diminished\n",
-    "            elif 'dim' in suffix or '°' in suffix:\n",
-    "                new_suffix = 'dim'\n",
-    "            # others\n",
-    "            return f\"{prefix}{core}{new_suffix}\"\n",
-    "            \n",
-    "        return raw_roman"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 25,
-   "id": "99ba921d-087f-4d85-9d94-5ee7e99f8d72",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def normalize_sequence(roman_seq_str, normalizer):\n",
-    "    \"\"\"\n",
-    "    input: \"<intro> I bVIsus4 V7\"\n",
-    "    output: \"<intro> I bVI V7\" (core strategy)\n",
-    "    \"\"\"\n",
-    "    if not isinstance(roman_seq_str, str):\n",
-    "        return \"\"\n",
-    "    \n",
-    "    tokens = roman_seq_str.split()\n",
-    "    \n",
-    "    normalized_tokens = []\n",
-    "    for token in tokens:\n",
-    "        # if label (<intro>), keep it\n",
-    "        if token.startswith('<') and token.endswith('>'):\n",
-    "            normalized_tokens.append(token)\n",
-    "        else:\n",
-    "            norm = normalizer.normalize(token)\n",
-    "            if norm:\n",
-    "                normalized_tokens.append(norm)\n",
-    "    \n",
-    "    # append to a string list\n",
-    "    return \" \".join(normalized_tokens)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 30,
-   "id": "04d0d98e-d7ec-4687-922d-1ea0c253b68d",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "I IV I III7 vi I IV I V7 I IV I III7 vi I IV V7 I IV I III7 vi I IV I V7 I IV I III7 vi I IV V7 I IV I IV I V I IV I III7 vi I IV V7 I I IV I IV I V I IV I III7 vi I IV V7 I IV V7 I\n"
-     ]
-    }
-   ],
-   "source": [
-    "normalizer = RomanNormalizer(strategy='core')\n",
-    "\n",
-    "normalized_roman = normalize_sequence(roman_numerals_record, normalizer)\n",
-    "print(normalized_roman) "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "7ee0d24f-691c-4183-890d-64fe33fb0a7d",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "markdown",
-   "id": "e057a68f-db0d-4753-a800-96ac255d95ca",
-   "metadata": {},
-   "source": [
-    "Tool: map Roman numerals to pitch classes (used as a lookup dictionary)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 41,
-   "id": "9866344c-1635-4210-ab52-273b47da3af1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def map_roman_to_pitches(roman_str):\n",
-    "    \n",
-    "    # [Fix 1] If the input is empty, return ([], None) instead of []\n",
-    "    if not roman_str: \n",
-    "        return [], None\n",
-    "\n",
-    "    ROMAN_OFFSETS = {'i': 0, 'ii': 2, 'iii': 4, 'iv': 5, 'v': 7, 'vi': 9, 'vii': 11}\n",
-    "    \n",
-    "    INTERVALS = {\n",
-    "        'maj': [0, 4, 7], 'min': [0, 3, 7], 'dim': [0, 3, 6],\n",
-    "        'dom7': [0, 4, 7, 10], 'maj7': [0, 4, 7, 11], 'min7': [0, 3, 7, 10],\n",
-    "        'dim7': [0, 3, 6, 9], 'm7b5': [0, 3, 6, 10],\n",
-    "        'sus4': [0, 5, 7], 'sus2': [0, 2, 7]\n",
-    "    }\n",
-    "    \n",
-    "    EXTENSIONS = {'9': 2, '11': 5, '13': 9, 'b9': 1, '#9': 3}\n",
-    "\n",
-    "    # Analyze\n",
-    "    match = re.match(r'^([b#]*)([ivIV]+)(.*)$', roman_str)\n",
-    "    \n",
-    "    # [Fix 2] If the regex match fails, return ([], None) instead of []\n",
-    "    if not match: \n",
-    "        return [], None\n",
-    "        \n",
-    "    prefix, core, suffix = match.groups()\n",
-    "\n",
-    "    # Root\n",
-    "    base = ROMAN_OFFSETS.get(core.lower())\n",
-    "    \n",
-    "    # [Fix 3] If the root cannot be found, return ([], None) instead of []\n",
-    "    if base is None: \n",
-    "        return [], None\n",
-    "        \n",
-    "    acc = -1 if prefix=='b' else (1 if prefix=='#' else 0)\n",
-    "    root = (base + acc) % 12\n",
-    "\n",
-    "    # Basic interval\n",
-    "    is_lower = core.islower()\n",
-    "\n",
-    "    intervals = []\n",
-    "    if 'sus4' in suffix: intervals = INTERVALS['sus4']\n",
-    "    elif 'sus2' in suffix: intervals = INTERVALS['sus2']\n",
-    "    elif 'maj7' in suffix: intervals = INTERVALS['maj7']\n",
-    "    elif 'dim7' in suffix: intervals = INTERVALS['dim7']\n",
-    "    elif 'dim' in suffix: intervals = INTERVALS['dim']\n",
-    "    elif '7' in suffix: intervals = INTERVALS['min7'] if is_lower else INTERVALS['dom7']\n",
-    "    else: intervals = INTERVALS['min'] if is_lower else INTERVALS['maj']\n",
-    "    \n",
-    "    current_intervals = list(intervals)\n",
-    "\n",
-    "    # Handle extensions\n",
-    "    for ext, val in EXTENSIONS.items():\n",
-    "        if ext in suffix and val not in current_intervals:\n",
-    "            current_intervals.append(val)\n",
-    "\n",
-    "    # (Root + Interval) % 12\n",
-    "    pitches = sorted(list(set([(root + i) % 12 for i in current_intervals])))\n",
-    "    \n",
-    "    vector = np.zeros(12, dtype=int)\n",
-    "    vector[pitches] = 1\n",
-    "    \n",
-    "    return pitches, vector"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "6c61ddf4-01d6-4ce3-9410-8682be06cb8b",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "markdown",
-   "id": "25339757-c287-442c-8761-ebd4b734d490",
-   "metadata": {},
-   "source": [
-    "Main"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 42,
-   "id": "2df6fdf1-743e-4d4b-8018-2daeb64926de",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "tqdm.pandas(desc=\"Processing\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 37,
-   "id": "1e3f91d3-f0e7-48ff-aad7-1f31feef22b2",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# ==========================================\n",
-    "# Roman Numeral Column\n",
-    "# ==========================================\n",
-    "def generate_roman_seq(raw_chord_str):\n",
-    "    \n",
-    "    if not isinstance(raw_chord_str, str) or not raw_chord_str:\n",
-    "        return \"\"\n",
-    "\n",
-    "    try:\n",
-    "        segments = segment_music_string(raw_chord_str)\n",
-    "        if not segments:\n",
-    "            return \"\"\n",
-    "\n",
-    "        analyzed_records = []\n",
-    "        for seg in segments:\n",
-    "            inferred_key = infer_key_from_list(seg['chords'])\n",
-    "            analyzed_records.append({\n",
-    "                'section': seg['label'],\n",
-    "                'chords': seg['chords'],\n",
-    "                'key': inferred_key\n",
-    "            })\n",
-    "\n",
-    "        analyzed_records = patch_short_segments(analyzed_records, min_chords=2)\n",
-    "\n",
-    "        for record in analyzed_records:\n",
-    "            record['roman'] = convert_to_roman(record['chords'], record['key'])\n",
-    "\n",
-    "        return extract_and_format_roman(analyzed_records, include_tags=True)\n",
-    "\n",
-    "    except Exception as e:\n",
-    "        return \"\"\n",
-    "\n",
-    "# ==========================================\n",
-    "# One-Hot Matrix Column\n",
-    "# ==========================================\n",
-    "def generate_chroma_matrix(roman_seq_str):\n",
-    "\n",
-    "    if not isinstance(roman_seq_str, str) or not roman_seq_str:\n",
-    "        return np.array([])\n",
-    "\n",
-    "    # explicit string\n",
-    "    tokens = roman_seq_str.split()\n",
-    "    \n",
-    "    matrix_rows = []\n",
-    "    \n",
-    "    for token in tokens:\n",
-    "        # label filter\n",
-    "        if token.startswith('<') and token.endswith('>'):\n",
-    "            continue\n",
-    "            \n",
-    "        _, vector = map_roman_to_pitches(token)\n",
-    "        \n",
-    "        # check vector\n",
-    "        if vector is not None:\n",
-    "            matrix_rows.append(vector)\n",
-    "            \n",
-    "    if not matrix_rows:\n",
-    "        return np.array([])\n",
-    "\n",
-    "    return np.vstack(matrix_rows)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "f080f918-35de-4543-9c66-287c94ce3019",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(\"Step 1: Generating Roman Numeral sequences...\")\n",
-    "df['roman_seq'] = df['chords_simplified'].progress_apply(generate_roman_seq)\n",
-    "\n",
-    "print(\"Finish Step 1!\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 40,
-   "id": "0ce79ed3-9858-48fa-ab7b-97fa478f1d00",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>id</th>\n",
-       "      <th>chords</th>\n",
-       "      <th>release_date</th>\n",
-       "      <th>genres</th>\n",
-       "      <th>decade</th>\n",
-       "      <th>rock_genre</th>\n",
-       "      <th>artist_id</th>\n",
-       "      <th>main_genre</th>\n",
-       "      <th>spotify_song_id</th>\n",
-       "      <th>spotify_artist_id</th>\n",
-       "      <th>chords_simplified</th>\n",
-       "      <th>roman_seq</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>1</td>\n",
-       "      <td>&lt;intro_1&gt; C &lt;verse_1&gt; F C E7 Amin C F C G7 C F...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>'classic country pop'</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>artist_1</td>\n",
-       "      <td>pop</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>4AIEGdwDzPELXYgM5JaEY5</td>\n",
-       "      <td>&lt;intro_1&gt; C &lt;verse_1&gt; F C E7 Amin C F C G7 C F...</td>\n",
-       "      <td>&lt;intro_1&gt; I &lt;verse_1&gt; IV I III7 vi I IV I V7 I...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>2</td>\n",
-       "      <td>&lt;intro_1&gt; E D A/Cs E D A/Cs &lt;verse_1&gt; E D A/Cs...</td>\n",
-       "      <td>2003-01-01</td>\n",
-       "      <td>'alternative metal' 'alternative rock' 'nu met...</td>\n",
-       "      <td>2000.0</td>\n",
-       "      <td>pop rock</td>\n",
-       "      <td>artist_2</td>\n",
-       "      <td>metal</td>\n",
-       "      <td>2ffJZ2r8HxI5DHcmf3BO6c</td>\n",
-       "      <td>694QW15WkebjcrWgQHzRYF</td>\n",
-       "      <td>&lt;intro_1&gt; E D A E D A &lt;verse_1&gt; E D A E D A E ...</td>\n",
-       "      <td>&lt;intro_1&gt; V IV I V IV I &lt;verse_1&gt; V IV I V IV ...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>3</td>\n",
-       "      <td>&lt;intro_1&gt; Csmin &lt;verse_1&gt; A Csmin A Csmin A Cs...</td>\n",
-       "      <td>2003-01-01</td>\n",
-       "      <td>'alternative metal' 'canadian rock' 'funk meta...</td>\n",
-       "      <td>2000.0</td>\n",
-       "      <td>canadian rock</td>\n",
-       "      <td>artist_3</td>\n",
-       "      <td>metal</td>\n",
-       "      <td>5KiY8SZEnvCPyIEkFGRR3y</td>\n",
-       "      <td>0niJkG4tKkne3zwr7I8n9n</td>\n",
-       "      <td>&lt;intro_1&gt; Csmin &lt;verse_1&gt; A Csmin A Csmin A Cs...</td>\n",
-       "      <td>&lt;intro_1&gt; i &lt;verse_1&gt; bVI i bVI i bVI i bVI bV...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>4</td>\n",
-       "      <td>&lt;intro_1&gt; D Dmaj7 D Dmaj7 &lt;verse_1&gt; Emin A D G...</td>\n",
-       "      <td>2022-09-23</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>2020.0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>artist_4</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>01TtAcUqyLCRBZq4ZZiQWS</td>\n",
-       "      <td>17BfKBemmMGO5ZAK25wraW</td>\n",
-       "      <td>&lt;intro_1&gt; D Dmaj7 D Dmaj7 &lt;verse_1&gt; Emin A D G...</td>\n",
-       "      <td>&lt;intro_1&gt; I I I I &lt;verse_1&gt; ii V I IV ii V I I...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>5</td>\n",
-       "      <td>&lt;intro_1&gt; C &lt;verse_1&gt; G C G C &lt;chorus_1&gt; F Dmi...</td>\n",
-       "      <td>2023-02-10</td>\n",
-       "      <td>'modern country pop'</td>\n",
-       "      <td>2020.0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>artist_5</td>\n",
-       "      <td>pop</td>\n",
-       "      <td>3zUecdrWC3IqrNSjhnoF3G</td>\n",
-       "      <td>4GGfAshSkqoxpZdoaHm7ky</td>\n",
-       "      <td>&lt;intro_1&gt; C &lt;verse_1&gt; G C G C &lt;chorus_1&gt; F Dmi...</td>\n",
-       "      <td>&lt;intro_1&gt; I &lt;verse_1&gt; V I V I &lt;chorus_1&gt; IV ii...</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "   id                                             chords release_date  \\\n",
-       "0   1  <intro_1> C <verse_1> F C E7 Amin C F C G7 C F...          NaN   \n",
-       "1   2  <intro_1> E D A/Cs E D A/Cs <verse_1> E D A/Cs...   2003-01-01   \n",
-       "2   3  <intro_1> Csmin <verse_1> A Csmin A Csmin A Cs...   2003-01-01   \n",
-       "3   4  <intro_1> D Dmaj7 D Dmaj7 <verse_1> Emin A D G...   2022-09-23   \n",
-       "4   5  <intro_1> C <verse_1> G C G C <chorus_1> F Dmi...   2023-02-10   \n",
-       "\n",
-       "                                              genres  decade     rock_genre  \\\n",
-       "0                              'classic country pop'     NaN            NaN   \n",
-       "1  'alternative metal' 'alternative rock' 'nu met...  2000.0       pop rock   \n",
-       "2  'alternative metal' 'canadian rock' 'funk meta...  2000.0  canadian rock   \n",
-       "3                                                NaN  2020.0            NaN   \n",
-       "4                               'modern country pop'  2020.0            NaN   \n",
-       "\n",
-       "  artist_id main_genre         spotify_song_id       spotify_artist_id  \\\n",
-       "0  artist_1        pop                     NaN  4AIEGdwDzPELXYgM5JaEY5   \n",
-       "1  artist_2      metal  2ffJZ2r8HxI5DHcmf3BO6c  694QW15WkebjcrWgQHzRYF   \n",
-       "2  artist_3      metal  5KiY8SZEnvCPyIEkFGRR3y  0niJkG4tKkne3zwr7I8n9n   \n",
-       "3  artist_4        NaN  01TtAcUqyLCRBZq4ZZiQWS  17BfKBemmMGO5ZAK25wraW   \n",
-       "4  artist_5        pop  3zUecdrWC3IqrNSjhnoF3G  4GGfAshSkqoxpZdoaHm7ky   \n",
-       "\n",
-       "                                   chords_simplified  \\\n",
-       "0  <intro_1> C <verse_1> F C E7 Amin C F C G7 C F...   \n",
-       "1  <intro_1> E D A E D A <verse_1> E D A E D A E ...   \n",
-       "2  <intro_1> Csmin <verse_1> A Csmin A Csmin A Cs...   \n",
-       "3  <intro_1> D Dmaj7 D Dmaj7 <verse_1> Emin A D G...   \n",
-       "4  <intro_1> C <verse_1> G C G C <chorus_1> F Dmi...   \n",
-       "\n",
-       "                                           roman_seq  \n",
-       "0  <intro_1> I <verse_1> IV I III7 vi I IV I V7 I...  \n",
-       "1  <intro_1> V IV I V IV I <verse_1> V IV I V IV ...  \n",
-       "2  <intro_1> i <verse_1> bVI i bVI i bVI i bVI bV...  \n",
-       "3  <intro_1> I I I I <verse_1> ii V I IV ii V I I...  \n",
-       "4  <intro_1> I <verse_1> V I V I <chorus_1> IV ii...  "
-      ]
-     },
-     "execution_count": 40,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df.head()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 43,
-   "id": "df151696-e585-475d-9964-a952125f2bd3",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Step 2: Generating One-Hot Chroma Matrices...\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Processing: 100%|████████████████████████████████████████████████████████████| 679807/679807 [07:58<00:00, 1420.14it/s]\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(\"Step 2: Generating One-Hot Chroma Matrices...\")\n",
-    "df['chroma_matrix'] = df['roman_seq'].progress_apply(generate_chroma_matrix)\n",
-    "\n",
-    "print(\"Finish Step 2!\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 44,
-   "id": "f62af199-6dc3-46ec-a9c5-e3649bca484a",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>id</th>\n",
-       "      <th>chords</th>\n",
-       "      <th>release_date</th>\n",
-       "      <th>genres</th>\n",
-       "      <th>decade</th>\n",
-       "      <th>rock_genre</th>\n",
-       "      <th>artist_id</th>\n",
-       "      <th>main_genre</th>\n",
-       "      <th>spotify_song_id</th>\n",
-       "      <th>spotify_artist_id</th>\n",
-       "      <th>chords_simplified</th>\n",
-       "      <th>roman_seq</th>\n",
-       "      <th>chroma_matrix</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>1</td>\n",
-       "      <td>&lt;intro_1&gt; C &lt;verse_1&gt; F C E7 Amin C F C G7 C F...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>'classic country pop'</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>artist_1</td>\n",
-       "      <td>pop</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>4AIEGdwDzPELXYgM5JaEY5</td>\n",
-       "      <td>&lt;intro_1&gt; C &lt;verse_1&gt; F C E7 Amin C F C G7 C F...</td>\n",
-       "      <td>&lt;intro_1&gt; I &lt;verse_1&gt; IV I III7 vi I IV I V7 I...</td>\n",
-       "      <td>[[1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0], [1, 0, ...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>2</td>\n",
-       "      <td>&lt;intro_1&gt; E D A/Cs E D A/Cs &lt;verse_1&gt; E D A/Cs...</td>\n",
-       "      <td>2003-01-01</td>\n",
-       "      <td>'alternative metal' 'alternative rock' 'nu met...</td>\n",
-       "      <td>2000.0</td>\n",
-       "      <td>pop rock</td>\n",
-       "      <td>artist_2</td>\n",
-       "      <td>metal</td>\n",
-       "      <td>2ffJZ2r8HxI5DHcmf3BO6c</td>\n",
-       "      <td>694QW15WkebjcrWgQHzRYF</td>\n",
-       "      <td>&lt;intro_1&gt; E D A E D A &lt;verse_1&gt; E D A E D A E ...</td>\n",
-       "      <td>&lt;intro_1&gt; V IV I V IV I &lt;verse_1&gt; V IV I V IV ...</td>\n",
-       "      <td>[[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1], [1, 0, ...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>3</td>\n",
-       "      <td>&lt;intro_1&gt; Csmin &lt;verse_1&gt; A Csmin A Csmin A Cs...</td>\n",
-       "      <td>2003-01-01</td>\n",
-       "      <td>'alternative metal' 'canadian rock' 'funk meta...</td>\n",
-       "      <td>2000.0</td>\n",
-       "      <td>canadian rock</td>\n",
-       "      <td>artist_3</td>\n",
-       "      <td>metal</td>\n",
-       "      <td>5KiY8SZEnvCPyIEkFGRR3y</td>\n",
-       "      <td>0niJkG4tKkne3zwr7I8n9n</td>\n",
-       "      <td>&lt;intro_1&gt; Csmin &lt;verse_1&gt; A Csmin A Csmin A Cs...</td>\n",
-       "      <td>&lt;intro_1&gt; i &lt;verse_1&gt; bVI i bVI i bVI i bVI bV...</td>\n",
-       "      <td>[[1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0], [1, 0, ...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>4</td>\n",
-       "      <td>&lt;intro_1&gt; D Dmaj7 D Dmaj7 &lt;verse_1&gt; Emin A D G...</td>\n",
-       "      <td>2022-09-23</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>2020.0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>artist_4</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>01TtAcUqyLCRBZq4ZZiQWS</td>\n",
-       "      <td>17BfKBemmMGO5ZAK25wraW</td>\n",
-       "      <td>&lt;intro_1&gt; D Dmaj7 D Dmaj7 &lt;verse_1&gt; Emin A D G...</td>\n",
-       "      <td>&lt;intro_1&gt; I I I I &lt;verse_1&gt; ii V I IV ii V I I...</td>\n",
-       "      <td>[[1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0], [1, 0, ...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>5</td>\n",
-       "      <td>&lt;intro_1&gt; C &lt;verse_1&gt; G C G C &lt;chorus_1&gt; F Dmi...</td>\n",
-       "      <td>2023-02-10</td>\n",
-       "      <td>'modern country pop'</td>\n",
-       "      <td>2020.0</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>artist_5</td>\n",
-       "      <td>pop</td>\n",
-       "      <td>3zUecdrWC3IqrNSjhnoF3G</td>\n",
-       "      <td>4GGfAshSkqoxpZdoaHm7ky</td>\n",
-       "      <td>&lt;intro_1&gt; C &lt;verse_1&gt; G C G C &lt;chorus_1&gt; F Dmi...</td>\n",
-       "      <td>&lt;intro_1&gt; I &lt;verse_1&gt; V I V I &lt;chorus_1&gt; IV ii...</td>\n",
-       "      <td>[[1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0], [0, 0, ...</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "   id                                             chords release_date  \\\n",
-       "0   1  <intro_1> C <verse_1> F C E7 Amin C F C G7 C F...          NaN   \n",
-       "1   2  <intro_1> E D A/Cs E D A/Cs <verse_1> E D A/Cs...   2003-01-01   \n",
-       "2   3  <intro_1> Csmin <verse_1> A Csmin A Csmin A Cs...   2003-01-01   \n",
-       "3   4  <intro_1> D Dmaj7 D Dmaj7 <verse_1> Emin A D G...   2022-09-23   \n",
-       "4   5  <intro_1> C <verse_1> G C G C <chorus_1> F Dmi...   2023-02-10   \n",
-       "\n",
-       "                                              genres  decade     rock_genre  \\\n",
-       "0                              'classic country pop'     NaN            NaN   \n",
-       "1  'alternative metal' 'alternative rock' 'nu met...  2000.0       pop rock   \n",
-       "2  'alternative metal' 'canadian rock' 'funk meta...  2000.0  canadian rock   \n",
-       "3                                                NaN  2020.0            NaN   \n",
-       "4                               'modern country pop'  2020.0            NaN   \n",
-       "\n",
-       "  artist_id main_genre         spotify_song_id       spotify_artist_id  \\\n",
-       "0  artist_1        pop                     NaN  4AIEGdwDzPELXYgM5JaEY5   \n",
-       "1  artist_2      metal  2ffJZ2r8HxI5DHcmf3BO6c  694QW15WkebjcrWgQHzRYF   \n",
-       "2  artist_3      metal  5KiY8SZEnvCPyIEkFGRR3y  0niJkG4tKkne3zwr7I8n9n   \n",
-       "3  artist_4        NaN  01TtAcUqyLCRBZq4ZZiQWS  17BfKBemmMGO5ZAK25wraW   \n",
-       "4  artist_5        pop  3zUecdrWC3IqrNSjhnoF3G  4GGfAshSkqoxpZdoaHm7ky   \n",
-       "\n",
-       "                                   chords_simplified  \\\n",
-       "0  <intro_1> C <verse_1> F C E7 Amin C F C G7 C F...   \n",
-       "1  <intro_1> E D A E D A <verse_1> E D A E D A E ...   \n",
-       "2  <intro_1> Csmin <verse_1> A Csmin A Csmin A Cs...   \n",
-       "3  <intro_1> D Dmaj7 D Dmaj7 <verse_1> Emin A D G...   \n",
-       "4  <intro_1> C <verse_1> G C G C <chorus_1> F Dmi...   \n",
-       "\n",
-       "                                           roman_seq  \\\n",
-       "0  <intro_1> I <verse_1> IV I III7 vi I IV I V7 I...   \n",
-       "1  <intro_1> V IV I V IV I <verse_1> V IV I V IV ...   \n",
-       "2  <intro_1> i <verse_1> bVI i bVI i bVI i bVI bV...   \n",
-       "3  <intro_1> I I I I <verse_1> ii V I IV ii V I I...   \n",
-       "4  <intro_1> I <verse_1> V I V I <chorus_1> IV ii...   \n",
-       "\n",
-       "                                       chroma_matrix  \n",
-       "0  [[1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0], [1, 0, ...  \n",
-       "1  [[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1], [1, 0, ...  \n",
-       "2  [[1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0], [1, 0, ...  \n",
-       "3  [[1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0], [1, 0, ...  \n",
-       "4  [[1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0], [0, 0, ...  "
-      ]
-     },
-     "execution_count": 44,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df.head()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 47,
-   "id": "a2047927-ebcc-48d6-942b-e02ce6ab6665",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array([[1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
-       "       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],\n",
-       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
-       "       [0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1],\n",
-       "       [1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0],\n",
-       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
-       "       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],\n",
-       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
-       "       [0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1],\n",
-       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
-       "       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],\n",
-       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
-       "       [0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1],\n",
-       "       [1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0],\n",
-       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
-       "       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],\n",
-       "       [0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1],\n",
-       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
-       "       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],\n",
-       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
-       "       [0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1],\n",
-       "       [1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0],\n",
-       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
-       "       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],\n",
-       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
-       "       [0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1],\n",
-       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
-       "       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],\n",
-       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
-       "       [0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1],\n",
-       "       [1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0],\n",
-       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
-       "       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],\n",
-       "       [0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1],\n",
-       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
-       "       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],\n",
-       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
-       "       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],\n",
-       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
-       "       [0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1],\n",
-       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
-       "       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],\n",
-       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
-       "       [0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1],\n",
-       "       [1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0],\n",
-       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
-       "       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],\n",
-       "       [0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1],\n",
-       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
-       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
-       "       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],\n",
-       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
-       "       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],\n",
-       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
-       "       [0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1],\n",
-       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
-       "       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],\n",
-       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
-       "       [0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1],\n",
-       "       [1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0],\n",
-       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
-       "       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],\n",
-       "       [0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1],\n",
-       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
-       "       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],\n",
-       "       [0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1],\n",
-       "       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0]])"
-      ]
-     },
-     "execution_count": 47,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df['chroma_matrix'][0]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 50,
-   "id": "6d084835-4b26-4921-b12d-6c16a978b7c1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# with open(\"C:/Users/liu.xuanc/Desktop/Code/TLDChallenge/test_chords_matrix.pkl\", 'wb') as f:\n",
-    "#     pickle.dump(df['chroma_matrix'][0], f)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 49,
-   "id": "76006541-0468-4dfd-a64a-0a3083280b1d",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "markdown",
-   "id": "19a2269b-be8e-4e2b-a9e3-a7d3a5c0c397",
-   "metadata": {},
-   "source": [
-    "Calculating task:\n",
-    "1. Frequency of each hyperedge\n",
-    "2. O-information (global and local)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "fd7751a2-a48e-4e05-bc46-55b6f9c6394d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def analyze_roman_statistics(roman_series, top_n=50):\n",
-    "    \"\"\"\n",
-    "    Statistics for the Roman Numeral Chord Sequence.\n",
-    "    1. Remove structure tags (e.g. <intro>)\n",
-    "    2. Count the frequency and frequency of each chord\n",
-    "    \"\"\"\n",
-    "    print(\"Processing data and counting frequencies...\")\n",
-    "    \n",
-    "    # 1. Split each string into a list of words\n",
-    "    # 2. explode() expands the list into a long column\n",
-    "    all_tokens = roman_series.str.split().explode()\n",
-    "    \n",
-    "    # 3. Filter: Keep only content that does NOT start with '<' (remove tags)\n",
-    "    # ~ means logical NOT\n",
-    "    chord_tokens = all_tokens[~all_tokens.str.startswith('<', na=False)]\n",
-    "    \n",
-    "    # 4. Statistical Frequency\n",
-    "    counts = chord_tokens.value_counts()\n",
-    "    \n",
-    "    # 5. Calculate frequency (percentage)\n",
-    "    total_count = counts.sum()\n",
-    "    frequencies = (counts / total_count) * 100\n",
-    "    \n",
-    "    # 6. Build result DataFrame\n",
-    "    stats_df = pd.DataFrame({\n",
-    "        'Count': counts,\n",
-    "        'Frequency (%)': frequencies.round(4) # Keep 4 decimal places\n",
-    "    })\n",
-    "    \n",
-    "    print(f\"=== Statistics Report ===\")\n",
-    "    print(f\"Total Chord Tokens: {total_count}\")\n",
-    "    print(f\"Unique Chord Types: {len(counts)}\")\n",
-    "    print(\"-\" * 30)\n",
-    "    print(f\"Top {top_n} Roman Numerals:\")\n",
-    "    print(stats_df.head(top_n))\n",
-    "    \n",
-    "    return stats_df"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "8f76e0bd-1e21-485b-8b62-7e58713a76db",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# ==========================================\n",
-    "# Run the analysis\n",
-    "# ==========================================\n",
-    "\n",
-    "# Make sure you have executed the step to generate 'roman_seq' before this\n",
-    "# df['roman_seq'] is the column we generated in the previous step\n",
-    "roman_stats = analyze_roman_statistics(df['roman_seq'])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "8b609dde-9c58-48a1-8a42-13f66bcc65df",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "roman_stats['Pitches'] = roman_stats.index.map(lambda x: map_roman_to_pitches(x)[0])\n",
-    "other_cols = [c for c in roman_stats.columns if c != 'Pitches']\n",
-    "new_order = ['Pitches'] + other_cols\n",
-    "roman_stats = roman_stats[new_order]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ab6923ee-3abf-433d-8bd6-ae4a3ed5b707",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def calculate_inclusive_statistics(stats_df):\n",
-    "    \"\"\"\n",
-    "    计算包含性频次 (Inclusive Frequency)。\n",
-    "    逻辑：如果和弦 A 的组成音是和弦 B 的子集 (A ⊆ B)，\n",
-    "          那么将 B 的出现次数加到 A 上。\n",
-    "    \"\"\"\n",
-    "    # 1. 创建副本，避免修改原始数据\n",
-    "    df_incl = stats_df.copy()\n",
-    "    \n",
-    "    # 初始化新列：包含性计数一开始等于原始计数\n",
-    "    df_incl['Inclusive_Count'] = df_incl['Count']\n",
-    "    \n",
-    "    # 2. 准备数据：将 Pitches 转换为 Python Set (集合)，极大加速比对\n",
-    "    # 字典结构: {'I': {0, 4, 7}, 'V7': {2, 5, 7, 11}, ...}\n",
-    "    chord_definitions = {\n",
-    "        label: set(pitches) \n",
-    "        for label, pitches in df_incl['Pitches'].items()\n",
-    "    }\n",
-    "    \n",
-    "    # 获取原始计数的字典，方便快速查找\n",
-    "    raw_counts = df_incl['Count'].to_dict()\n",
-    "    \n",
-    "    print(f\"正在计算 {len(chord_definitions)} 个和弦类型之间的包含关系...\")\n",
-    "    \n",
-    "    # 3. 双重循环遍历 (N * N)\n",
-    "    # 这是一个简单的矩阵遍历，对于 N=750 来说非常快\n",
-    "    \n",
-    "    # 遍历每一个“潜在的子集和弦” (Child)\n",
-    "    for child_label, child_notes in tqdm(chord_definitions.items()):\n",
-    "        added_count = 0\n",
-    "        \n",
-    "        # 遍历每一个“潜在的父集和弦” (Parent)\n",
-    "        for parent_label, parent_notes in chord_definitions.items():\n",
-    "            \n",
-    "            # 跳过自己 (自己已经被初始化在 Inclusive_Count 里了)\n",
-    "            if child_label == parent_label:\n",
-    "                continue\n",
-    "            \n",
-    "            # 核心判断：Child 是否被 Parent 完全包含？\n",
-    "            # issubset() 是数学上的 A ⊆ B\n",
-    "            if child_notes.issubset(parent_notes):\n",
-    "                # 如果包含，把 Parent 的原始次数贡献给 Child\n",
-    "                added_count += raw_counts[parent_label]\n",
-    "        \n",
-    "        # 更新 DataFrame\n",
-    "        df_incl.at[child_label, 'Inclusive_Count'] += added_count\n",
-    "\n",
-    "    # 4. 重新计算频率\n",
-    "    # 注意：分母依然使用“总 Token 数”，这样包含性频率之和会超过 100%，\n",
-    "    # 这代表了“该和声结构出现的总概率”。\n",
-    "    total_tokens = stats_df['Count'].sum()\n",
-    "    df_incl['Inclusive_Freq (%)'] = (df_incl['Inclusive_Count'] / total_tokens) * 100\n",
-    "    \n",
-    "    # 5. 整理列顺序，把 Inclusive 放在前面方便看\n",
-    "    cols = ['Inclusive_Count', 'Inclusive_Freq (%)', 'Count', 'Frequency (%)', 'Pitches']\n",
-    "    # 仅保留存在的列\n",
-    "    cols = [c for c in cols if c in df_incl.columns]\n",
-    "    \n",
-    "    # 按包含性计数降序排列\n",
-    "    return df_incl[cols].sort_values(by='Inclusive_Count', ascending=False)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "4b424d62-d430-4426-b41a-9b1db60b5d20",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# 1. 创建副本并初始化新列\n",
-    "# 我们直接操作 roman_stats，先创建一个 Inclusive_Count 列，初始值等于原始 Count\n",
-    "roman_stats['Inclusive_Count'] = roman_stats['Count']\n",
-    "\n",
-    "# 2. 准备数据：将 Pitches 转换为 Python 集合 (Set) 以加速比对\n",
-    "# 生成一个字典: {'I': {0, 4, 7}, 'V7': {2, 5, 7, 11}, ...}\n",
-    "chord_definitions = {\n",
-    "    label: set(pitches) \n",
-    "    for label, pitches in roman_stats['Pitches'].items()\n",
-    "}\n",
-    "\n",
-    "# 获取原始计数的字典，方便快速查找，避免在循环中反复访问 DataFrame\n",
-    "raw_counts = roman_stats['Count'].to_dict()\n",
-    "\n",
-    "print(f\"正在计算 {len(chord_definitions)} 个和弦类型之间的包含关系...\")\n",
-    "\n",
-    "# 3. 双重循环计算包含关系\n",
-    "# 外层循环：遍历每一个“潜在的子集和弦” (Child)\n",
-    "for child_label, child_notes in tqdm(chord_definitions.items()):\n",
-    "    added_count = 0\n",
-    "    \n",
-    "    # 内层循环：遍历每一个“潜在的父集和弦” (Parent)\n",
-    "    for parent_label, parent_notes in chord_definitions.items():\n",
-    "        \n",
-    "        # 跳过自己\n",
-    "        if child_label == parent_label:\n",
-    "            continue\n",
-    "        \n",
-    "        # 核心判断：Child 的音阶是否完全包含于 Parent 中？\n",
-    "        # issubset() 是集合运算 A ⊆ B\n",
-    "        if child_notes.issubset(parent_notes):\n",
-    "            # 如果包含，累加 Parent 的原始次数\n",
-    "            added_count += raw_counts[parent_label]\n",
-    "    \n",
-    "    # 将累加的额外次数加到 Inclusive_Count 中\n",
-    "    roman_stats.at[child_label, 'Inclusive_Count'] += added_count\n",
-    "\n",
-    "# 4. 计算包含性频率 (Inclusive Frequency)\n",
-    "# 分母使用原始的总和弦数 (Frequency 可能会超过 100%)\n",
-    "total_original_tokens = roman_stats['Count'].sum()\n",
-    "roman_stats['Inclusive_Freq (%)'] = (roman_stats['Inclusive_Count'] / total_original_tokens) * 100\n",
-    "\n",
-    "# 5. 整理列顺序并排序\n",
-    "# 把 Inclusive 的数据放到前面方便观察\n",
-    "cols = ['Pitches', 'Inclusive_Count', 'Inclusive_Freq (%)', 'Count', 'Frequency (%)']\n",
-    "roman_stats = roman_stats[cols].sort_values(by='Inclusive_Count', ascending=False)\n",
-    "\n",
-    "# 6. 查看结果\n",
-    "print(\"\\n=== 更新后的统计表 (Top 10) ===\")\n",
-    "print(roman_stats.head(10))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "5d50de13-005d-4066-ab9a-7d24a64ffc75",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "markdown",
-   "id": "08f82abf-66d1-4227-a714-b0564c54c06f",
-   "metadata": {},
-   "source": [
-    "Testing task:\n",
-    "1. The big matrix containing all songs\n",
-    "2. Divided big matrixes containing songs in their genre"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "d7d6aef8-4ac9-48f5-8110-cdc3f7646f4b",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "3ba99f27-1991-41e7-a183-85df750886f3",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "622bb96f-c985-4e4e-a800-e26edca92e85",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "2b9ee7ba-0c9a-4894-b060-c3c98178a2da",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "c34f672f-e010-43b4-8d5b-650591993a78",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e31a10d4-f5af-4fdc-9ee5-e7242ab8acab",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "d6959d67-a0a0-4242-84b5-53f318d46a24",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.13.5"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}

From cbdc29e13ea9ed0a771fc2a1858acabd88971f31 Mon Sep 17 00:00:00 2001
From: pierrick <pierrick.lry@gmail.com>
Date: Mon, 24 Nov 2025 18:44:50 +0100
Subject: [PATCH 12/14] improve code to follow PEP 8 standards

---
 test/pipeline/test_pipeline.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/test/pipeline/test_pipeline.py b/test/pipeline/test_pipeline.py
index bd0a9a8a..a7ea524e 100644
--- a/test/pipeline/test_pipeline.py
+++ b/test/pipeline/test_pipeline.py
@@ -1,7 +1,6 @@
 """Test pipeline for a particular dataset and model."""
 
 import hydra
-from test._utils.simplified_pipeline import run
 import lightning as pl
 import torch
 from omegaconf import OmegaConf
@@ -21,7 +20,7 @@ class TestPipeline:
     def setup_method(self):
         """Setup method."""
         hydra.core.global_hydra.GlobalHydra.instance().clear()
-    
+
     def test_pipeline(self):
         """Test pipeline."""
         config_dataset = OmegaConf.load("configs/dataset/hypergraph/chordonomicon.yaml")
@@ -83,7 +82,7 @@ def __init__(self,
                 self.linear_node_0 = torch.nn.Linear(dim_in_node, dim_hidden)
                 self.linear_hyperedge_0 = torch.nn.Linear(dim_hidden, dim_out)
 
-            def forward(self, batch):
+            def forward(self, batch):  # pylint: disable=arguments-differ
                 """Forward pass.
 
                 Parameters
@@ -96,7 +95,9 @@ def forward(self, batch):
                 dict
                     Output dictionary containing node representation and hyperedge logits.
                 """
-                x_node = torch.concat((batch.x, torch.sparse.mm(batch.incidence_hyperedges, batch.x_hyperedges)), dim=1)
+                x_node = torch.concat((batch.x,
+                                torch.sparse.mm(batch.incidence_hyperedges, batch.x_hyperedges)),  # pylint: disable=not-callable
+                                      dim=1)
                 h_node = self.linear_node_0(x_node)
                 h_node = torch.relu(h_node)
                 h_hyperedge = torch.mm(batch.incidence_hyperedges.T, h_node)
@@ -111,17 +112,16 @@ def forward(self, batch):
         dataset_loader = instantiate(config_loader)
         dataset, dataset_dir = dataset_loader.load()
         preprocessor = PreProcessor(dataset, dataset_dir)
-        dataset_train, dataset_val, dataset_test = preprocessor.load_dataset_splits(config_dataset.split_params)
+        dataset_train, dataset_val, dataset_test = preprocessor.load_dataset_splits(config_dataset.split_params)  #pylint: disable=line-too-long
         datamodule = TBDataloader(
                     dataset_train=dataset_train,
                     dataset_val=dataset_val,
                     dataset_test=dataset_test,
                     **config_dataset.get("dataloader_params", {}),
                 )
-        dataloader_train = datamodule.train_dataloader()
 
         # model
-        backbone = ModelPipeLine(dim_in_node=config_dataset.parameters.num_node_features+config_dataset.parameters.num_edge_features,
+        backbone = ModelPipeLine(dim_in_node=config_dataset.parameters.num_node_features+config_dataset.parameters.num_edge_features,  #pylint: disable=line-too-long
                                 dim_hidden=10,
                                 dim_out=config_dataset.parameters.num_classes)
         loss = TBLoss(config_loss["dataset_loss"])
@@ -137,7 +137,10 @@ def forward(self, batch):
                         compile=False)
 
         # train
-        trainer = pl.Trainer(max_epochs=3, accelerator="cpu", enable_progress_bar=False, log_every_n_steps=1)
+        trainer = pl.Trainer(max_epochs=3,
+                             accelerator="cpu",
+                             enable_progress_bar=False,
+                             log_every_n_steps=1)
         trainer.fit(model, datamodule)
         trainer.test(model, datamodule)
         test_metrics = trainer.callback_metrics

From e1eca870e74c8ebc7012b814915dbc8a6a0432e5 Mon Sep 17 00:00:00 2001
From: pierrick <pierrick.lry@gmail.com>
Date: Mon, 24 Nov 2025 22:58:33 +0100
Subject: [PATCH 13/14] added the possibility to switch between single scale
 and all scales datasets

---
 configs/dataset/hypergraph/chordonomicon.yaml |  8 ++--
 test/pipeline/test_pipeline.py                | 38 ++++++++-----------
 topobench/data/datasets/chordonomicon.py      | 27 ++++++++++---
 .../hypergraph/chordonomicon_loader.py        | 15 ++++++--
 4 files changed, 54 insertions(+), 34 deletions(-)

diff --git a/configs/dataset/hypergraph/chordonomicon.yaml b/configs/dataset/hypergraph/chordonomicon.yaml
index 007a6435..b2e2fbc4 100644
--- a/configs/dataset/hypergraph/chordonomicon.yaml
+++ b/configs/dataset/hypergraph/chordonomicon.yaml
@@ -6,13 +6,15 @@ loader:
     data_type: chords
     data_name: chordonomicon
     data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type}
+    version: all_scales  # options: ['single_scale', 'all_scales']
 
 # Dataset parameters
 parameters:
   num_features: 1
   num_classes: 1
   num_edge_features: 1
-  num_node_features: 12
+  num_node_features_single_scale: 12
+  num_node_features_all_scales: 38
   task: regression
   loss_type: mse
   monitor_metric: mae
@@ -24,9 +26,9 @@ split_params:
   data_seed: 0
   split_type: random #'k-fold' # either "k-fold" or "random" strategies
   k: 10 # for "k-fold" Cross-Validation
-  train_prop: 0.8 # for "random" strategy splitting
+  train_prop: 0.9 # for "random" strategy splitting
   standardize: False
-  data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name}
+  data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name}/${dataset.loader.parameters.version}
 
 # Dataloader parameters
 dataloader_params:
diff --git a/test/pipeline/test_pipeline.py b/test/pipeline/test_pipeline.py
index a7ea524e..d4017706 100644
--- a/test/pipeline/test_pipeline.py
+++ b/test/pipeline/test_pipeline.py
@@ -23,37 +23,26 @@ def setup_method(self):
 
     def test_pipeline(self):
         """Test pipeline."""
+        
+        # configs
         config_dataset = OmegaConf.load("configs/dataset/hypergraph/chordonomicon.yaml")
-        config_dataset.split_params.data_split_dir = "datasets/data_splits/chordonomicon"
+        config_dataset.split_params.data_split_dir = f"datasets/data_splits/chordonomicon/{config_dataset.loader.parameters.version}"  # pylint: disable=line-too-long
         config_dataset.loader.parameters.data_dir = "datasets/hypergraph/chords"
-        config_loader = {"_target_":"topobench.data.loaders.ChordonomiconDatasetLoader",
-                        "parameters":
-                        {"data_domain": "hypergraph",
-                            "data_type": "chords",
-                            "data_name": "chordonomicon",
-                            "data_dir": "datasets/hypergraph/chords"
-                            }
-                        }
-
         config_evaluator = {"task": "regression",
                             "num_classes": config_dataset.parameters.num_classes,
                             "metrics": ["rmse", "mse", "mae"]}
-
-        config_loss = {
-            "dataset_loss": 
-                {
-                    "task": "regression", 
-                    "loss_type": "mse"
+        config_loss = {"dataset_loss":
+            {
+                "task": "regression", 
+                "loss_type": "mse"
                 }
-        }
-
+            }
         config_readout = {
             "hidden_dim": config_dataset.parameters.num_classes,
             "out_channels": config_dataset.parameters.num_classes,
             "task_level": config_dataset.parameters.task_level,
             "logits_linear_layer": False,
-        }
-
+            }
         config_optimizer = {"optimizer_id": "Adam",
                             "parameters":
                                 {"lr": 0.01,"weight_decay": 0.0005}
@@ -109,7 +98,7 @@ def forward(self, batch):  #pylint: disable=arguments-differ
                 return model_out
 
         # dataset
-        dataset_loader = instantiate(config_loader)
+        dataset_loader = instantiate(config_dataset.loader)
         dataset, dataset_dir = dataset_loader.load()
         preprocessor = PreProcessor(dataset, dataset_dir)
         dataset_train, dataset_val, dataset_test = preprocessor.load_dataset_splits(config_dataset.split_params)  #pylint: disable=line-too-long
@@ -121,7 +110,12 @@ def forward(self, batch):  #pylint: disable=arguments-differ
                 )
 
         # model
-        backbone = ModelPipeLine(dim_in_node=config_dataset.parameters.num_node_features+config_dataset.parameters.num_edge_features,  #pylint: disable=line-too-long
+        if config_dataset.loader.parameters.version == "single_scale":
+            input_dim = config_dataset.parameters.num_node_features_single_scale
+        elif config_dataset.loader.parameters.version == "all_scales":
+            input_dim = config_dataset.parameters.num_node_features_all_scales
+        input_dim += config_dataset.parameters.num_edge_features
+        backbone = ModelPipeLine(dim_in_node=input_dim,
                                 dim_hidden=10,
                                 dim_out=config_dataset.parameters.num_classes)
         loss = TBLoss(config_loss["dataset_loss"])
diff --git a/topobench/data/datasets/chordonomicon.py b/topobench/data/datasets/chordonomicon.py
index 4e019a56..38531ee9 100644
--- a/topobench/data/datasets/chordonomicon.py
+++ b/topobench/data/datasets/chordonomicon.py
@@ -22,14 +22,19 @@ class ChordonomiconDataset(InMemoryDataset):
         and processed will be subdirectories of it.
     name : str
         Name of the dataset (e.g., 'Chordonomicon').
+    version : str
+        Version of the dataset, options are 'single_scale' or 'all_scales'.
     """
 
-    URL = "https://huggingface.co/datasets/PierrickLeKing/topobench-music-synergy/resolve/main/dataframe.zip"  # pylint: disable=line-too-long
-
-    def __init__(self, root, name):
+    def __init__(self, root, name, version):
         self.name = name
         self.root = root
+        self.version = version
         self.folder_chordonomicon = osp.join(self.root, self.name)
+        if self.version == "single_scale":
+            self.url = "https://huggingface.co/datasets/PierrickLeKing/topobench-music-synergy/resolve/main/dataframe_226.zip"  # pylint: disable=line-too-long
+        elif self.version == "all_scales":
+            self.url = "https://huggingface.co/datasets/PierrickLeKing/topobench-music-synergy/resolve/main/dataframe_4313.zip"  # pylint: disable=line-too-long
         super().__init__(
             root,
         )
@@ -43,7 +48,7 @@ def download(self):
         Raises:
             requests.exceptions.HTTPError: If the download fails.
         """
-        r = requests.get(self.URL, timeout=30)
+        r = requests.get(self.url, timeout=30)
         r.raise_for_status()
         with open(
             osp.join(self.folder_chordonomicon, "dataframe.zip"), "wb"
@@ -109,7 +114,12 @@ def raw_file_names(self) -> list[str]:
         list[str]
             List of raw file names.
         """
-        return ["dataframe.csv"]
+        if self.version == "single_scale":
+            return ["dataframe_226.csv"]
+        elif self.version == "all_scales":
+            return ["dataframe_4313.csv"]
+        else:
+            raise ValueError(f"Unknown version: {self.version}")
 
     @property
     def processed_file_names(self) -> str:
@@ -120,7 +130,12 @@ def processed_file_names(self) -> str:
         str
             Processed file name.
         """
-        return "data.pt"
+        if self.version == "single_scale":
+            return "data_226.pt"
+        elif self.version == "all_scales":
+            return "data_4313.pt"
+        else:
+            raise ValueError(f"Unknown version: {self.version}")
 
     @property
     def raw_dir(self) -> str:
diff --git a/topobench/data/loaders/hypergraph/chordonomicon_loader.py b/topobench/data/loaders/hypergraph/chordonomicon_loader.py
index 2e04809f..3d8bd96d 100644
--- a/topobench/data/loaders/hypergraph/chordonomicon_loader.py
+++ b/topobench/data/loaders/hypergraph/chordonomicon_loader.py
@@ -7,12 +7,19 @@
 class ChordonomiconDatasetLoader(AbstractLoader):
     """Loader class for Chordonomicon dataset.
 
-    Args:
-        - parameters (DictConfig): Loader parameters.
+    Parameters
+    ----------
+    parameters : DictConfig
+        Configuration parameters containing:
             - data_dir (str): Root directory where the dataset folder is stored.
             - data_name (str): Name of the dataset.
+            - version (str): Version of the dataset, options are 'single_scale', 'all_scales'.
     """
 
+    def __init__(self, parameters):
+        super().__init__(parameters)
+        self.version = parameters.version
+
     def load_dataset(self) -> ChordonomiconDataset:
         """Load the Chordonomicon dataset.
 
@@ -22,5 +29,7 @@ def load_dataset(self) -> ChordonomiconDataset:
             The loaded Chordonomicon dataset.
         """
         return ChordonomiconDataset(
-            root=self.root_data_dir, name=self.parameters.data_name
+            root=self.root_data_dir,
+            name=self.parameters.data_name,
+            version=self.version,
         )

From 54849fcf6793e270c9ea3eaf2b60a86ca9499f95 Mon Sep 17 00:00:00 2001
From: pierrick <pierrick.lry@gmail.com>
Date: Mon, 24 Nov 2025 23:59:27 +0100
Subject: [PATCH 14/14] improved PEP8 compliance

---
 test/pipeline/test_pipeline.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/pipeline/test_pipeline.py b/test/pipeline/test_pipeline.py
index d4017706..fdcc8aa7 100644
--- a/test/pipeline/test_pipeline.py
+++ b/test/pipeline/test_pipeline.py
@@ -23,7 +23,7 @@ def setup_method(self):
 
     def test_pipeline(self):
         """Test pipeline."""
-        
+
         # configs
         config_dataset = OmegaConf.load("configs/dataset/hypergraph/chordonomicon.yaml")
         config_dataset.split_params.data_split_dir = f"datasets/data_splits/chordonomicon/{config_dataset.loader.parameters.version}"  # pylint: disable=line-too-long
@@ -110,11 +110,11 @@ def forward(self, batch):  #pylint: disable=arguments-differ
                 )
 
         # model
+        input_dim = config_dataset.parameters.num_edge_features
         if config_dataset.loader.parameters.version == "single_scale":
-            input_dim = config_dataset.parameters.num_node_features_single_scale
+            input_dim += config_dataset.parameters.num_node_features_single_scale
         elif config_dataset.loader.parameters.version == "all_scales":
-            input_dim = config_dataset.parameters.num_node_features_all_scales
-        input_dim += config_dataset.parameters.num_edge_features
+            input_dim += config_dataset.parameters.num_node_features_all_scales
         backbone = ModelPipeLine(dim_in_node=input_dim,
                                 dim_hidden=10,
                                 dim_out=config_dataset.parameters.num_classes)

	id	chords	release_date	genres	decade	rock_genre	artist_id	main_genre	spotify_song_id	spotify_artist_id
0	1	<intro_1> C <verse_1> F C E7 Amin C F C G7 C F...	NaN	'classic country pop'	NaN	NaN	artist_1	pop	NaN	4AIEGdwDzPELXYgM5JaEY5
1	2	<intro_1> E D A/Cs E D A/Cs <verse_1> E D A/Cs...	2003-01-01	'alternative metal' 'alternative rock' 'nu met...	2000.0	pop rock	artist_2	metal	2ffJZ2r8HxI5DHcmf3BO6c	694QW15WkebjcrWgQHzRYF
2	3	<intro_1> Csmin <verse_1> A Csmin A Csmin A Cs...	2003-01-01	'alternative metal' 'canadian rock' 'funk meta...	2000.0	canadian rock	artist_3	metal	5KiY8SZEnvCPyIEkFGRR3y	0niJkG4tKkne3zwr7I8n9n
3	4	<intro_1> D Dmaj7 D Dmaj7 <verse_1> Emin A D G...	2022-09-23	NaN	2020.0	NaN	artist_4	NaN	01TtAcUqyLCRBZq4ZZiQWS	17BfKBemmMGO5ZAK25wraW
4	5	<intro_1> C <verse_1> G C G C <chorus_1> F Dmi...	2023-02-10	'modern country pop'	2020.0	NaN	artist_5	pop	3zUecdrWC3IqrNSjhnoF3G	4GGfAshSkqoxpZdoaHm7ky