diff --git a/learn/search/audio/audio-search/audio-search.ipynb b/learn/search/audio/audio-search/audio-search.ipynb index fe44e65a..f42310b6 100644 --- a/learn/search/audio/audio-search/audio-search.ipynb +++ b/learn/search/audio/audio-search/audio-search.ipynb @@ -1,1191 +1,979 @@ { - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pinecone-io/examples/blob/master/learn/search/audio/audio-search/audio-search.ipynb) [![Open nbviewer](https://raw.githubusercontent.com/pinecone-io/examples/master/assets/nbviewer-shield.svg)](https://nbviewer.org/github/pinecone-io/examples/blob/master/learn/search/audio/audio-search/audio-search.ipynb)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "CLWK0hHDNLty" - }, - "source": [ - "# Audio Similarity Search" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Bz1ITFq_NPxP" - }, - "source": [ - "This notebook shows how to use Pinecone as the vector DB within an audio search application. Audio search can be used to find songs and metadata within a catalog, finding similar sounds in an audio library, or detecting who's speaking in an audio file.\n", - "\n", - "We will index a set of audio recordings as vector embeddings. These vector embeddings are rich, mathematical representations of the audio recordings, making it possible to determine how similar the recordings are to one another. We will then take some new (unseen) audio recording, search through the index to find the most similar matches, and play the returned audio in this notebook." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "IjQ-SGWuNP1-" - }, - "source": [ - "# Install Dependencies" - ] + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pinecone-io/examples/blob/master/learn/search/audio/audio-search/audio-search.ipynb) [![Open nbviewer](https://raw.githubusercontent.com/pinecone-io/examples/master/assets/nbviewer-shield.svg)](https://nbviewer.org/github/pinecone-io/examples/blob/master/learn/search/audio/audio-search/audio-search.ipynb)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CLWK0hHDNLty" + }, + "source": [ + "# Audio Similarity Search" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Bz1ITFq_NPxP" + }, + "source": [ + "This notebook shows how to use Pinecone as the vector DB within an audio search application. Audio search can be used to find songs and metadata within a catalog, finding similar sounds in an audio library, or detecting who's speaking in an audio file.\n", + "\n", + "We will index a set of audio recordings as vector embeddings. These vector embeddings are rich, mathematical representations of the audio recordings, making it possible to determine how similar the recordings are to one another. We will then take some new (unseen) audio recording, search through the index to find the most similar matches, and play the returned audio in this notebook." 
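+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": "To build some intuition for what \"similar\" means here, the search ultimately compares vectors with a similarity measure such as cosine similarity. Below is a tiny, self-contained sketch of that comparison (a toy illustration only; the real 2048-dimensional embeddings come later from the PANNs model):"
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": "import numpy as np\n\n# two toy \"embeddings\"; the real ones are 2048-dimensional model outputs\na = np.array([0.9, 0.1, 0.3])\nb = np.array([0.8, 0.2, 0.4])\n\n# cosine similarity: 1.0 = same direction, 0 = orthogonal (unrelated)\ncos_sim = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))\nprint(f\"cosine similarity: {cos_sim:.3f}\")"
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Pinecone performs this kind of comparison at scale, returning the closest stored vectors for any query vector."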
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "IjQ-SGWuNP1-"
+ },
+ "source": [
+ "# Install Dependencies"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "lNPlTs8HGWOh"
+ },
+ "outputs": [],
+ "source": "!pip install -qU pinecone==8.0.0 panns-inference==0.1.1 datasets==3.2.0 librosa==0.10.2 torch==2.6.0"
+ },
+ {
+ "cell_type": "code",
+ "source": "try:\n    import os\n    import time\n    import urllib.request\n    from getpass import getpass\n\n    import librosa\n    import numpy as np\n    import torch\n    from datasets import load_dataset\n    from IPython.display import Audio, display\n    from panns_inference import AudioTagging\n    from pinecone import Pinecone, ServerlessSpec\n    from tqdm.auto import tqdm\nexcept ImportError as e:\n    raise ImportError(f\"Missing required package. Run the dependency cell first: {e}\")",
+ "metadata": {},
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "TlUIo29INUhs"
+ },
+ "source": [
+ "# Load Dataset"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "520whID8rU8g"
+ },
+ "source": [
+ "In this demo, we will use audio from the *ESC-50 dataset* — a labeled collection of 2000 environmental audio recordings, each five seconds long. The dataset can be loaded from the Hugging Face Hub as follows:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "-XE4XUVnJnXE",
+ "outputId": "db23db43-e7c1-424b-f218-13438419943a"
+ },
+ "outputs": [],
+ "source": "# load the dataset from the hugging face hub\ndata = load_dataset(\"ashraq/esc50\", split=\"train\")\ndata"
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "QeXLy-dFvIL7"
+ },
+ "source": [
+ "The audio clips in the dataset are sampled at 44,100 Hz and loaded into NumPy arrays. Let's take a look."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "Hcm6KZDsvFZd",
+ "outputId": "5856454a-03ed-4f9b-cf6d-cabe40e1f39e"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[{'path': None,\n",
+ " 'array': array([0., 0., 0., ..., 0., 0., 0.]),\n",
+ " 'sampling_rate': 44100},\n",
+ " {'path': None,\n",
+ " 'array': array([-0.01184082, -0.10336304, -0.14141846, ..., 0.06985474,\n",
+ " 0.04049683, 0.00274658]),\n",
+ " 'sampling_rate': 44100},\n",
+ " {'path': None,\n",
+ " 'array': array([-0.00695801, -0.01251221, -0.01126099, ..., 0.215271 ,\n",
+ " -0.00875854, -0.28903198]),\n",
+ " 'sampling_rate': 44100}]"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# select the audio feature and display the first three\n",
+ "audios = data[\"audio\"]\n",
+ "audios[:3]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "QX94cPW8AtQl"
+ },
+ "source": [
+ "We only need the NumPy arrays, as these contain all of the audio data. We will later feed these arrays directly into our embedding model to generate audio embeddings."
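+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": "Since every ESC-50 clip is five seconds at 44,100 Hz, each array should hold 5 * 44100 = 220500 samples. Here is a quick optional check; this equal length is also what lets us stack the clips into a single 2-D array in the next cell:"
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": "# optional sanity check: all clips should share one length (220,500 samples)\nlengths = {len(a[\"array\"]) for a in data[\"audio\"]}\nprint(lengths)  # expect {220500}, i.e. 5 s * 44,100 samples/s"
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "With that confirmed, we can extract the raw arrays:"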
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "jDrQhpJ5Az0G"
+ },
+ "outputs": [],
+ "source": "# select only the audio data from the dataset and store it in a numpy array\naudios = np.array([a[\"array\"] for a in data[\"audio\"]])"
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "QAhNFeEWNb9_"
+ },
+ "source": [
+ "# Load Audio Embedding Model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "HLI4j7798AT9"
+ },
+ "source": [
+ "We will use an audio tagging model from the *PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition* paper to generate our audio embeddings. We use the *panns_inference* Python package, which provides an easy interface for loading and using the model."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "PMwze3wAI4Vg",
+ "outputId": "60e87993-20c1-45b3-f1c9-b2d78e9b6d5e"
+ },
+ "outputs": [],
+ "source": "# detect if a GPU is available, otherwise fall back to CPU\ndevice = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n\n# load the model onto the appropriate device\nmodel = AudioTagging(checkpoint_path=None, device=device)"
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Initializing the Index\n",
+ "\n",
+ "Now we need a place to store these embeddings and enable an efficient vector search through them all. To do that we use Pinecone. You can get a [free API key](https://app.pinecone.io/) and enter it below, where we will initialize our connection to Pinecone and create a new index."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": "# initialize connection to pinecone (get API key at app.pinecone.io)\napi_key = os.environ.get(\"PINECONE_API_KEY\") or getpass(\"Enter your Pinecone API key: \")\n\npc = Pinecone(api_key=api_key)"
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": "Now we set up our index specification, which allows us to define the cloud provider and region where we want to deploy our index. You can find a list of all [available providers and regions here](https://docs.pinecone.io/guides/projects/understanding-projects)."
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": "cloud = os.environ.get(\"PINECONE_CLOUD\") or \"aws\"\nregion = os.environ.get(\"PINECONE_REGION\") or \"us-east-1\"\n\nspec = ServerlessSpec(cloud=cloud, region=region)"
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Create the index:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "index_name = \"audio-search-demo\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "3b_pV4GGNXJb"
+ },
+ "outputs": [],
+ "source": "# check if the index already exists (it shouldn't if this is the first time)\nif index_name not in pc.list_indexes().names():\n    # if it does not exist, create the index\n    pc.create_index(index_name, dimension=2048, metric=\"cosine\", spec=spec)\n    # wait for the index to be initialized\n    while not pc.describe_index(index_name).status[\"ready\"]:\n        time.sleep(1)\n\n# connect to the index\nindex = pc.Index(index_name)\n# view index stats\nindex.describe_index_stats()"
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "4AhWDbI3NiIi"
+ },
+ "source": [
+ "# Generate Embeddings and Upsert\n",
+ "\n",
+ "Now we generate the embeddings using the audio embedding model. We must do this in batches, as processing all items at once would exhaust machine memory and exceed API request limits."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 118,
+ "referenced_widgets": [
+ "6a9f8ff091324f1581297fc28c713047",
+ "e18fc6710ff341d6b80cb93cd6dc5e19",
+ "0b927de4df4b4f61bbb10721007fb4f8",
+ "74ea1cff7a2d44d08328a714e2cc384f",
+ "5890090cbebe417dbac772b18440eaa5",
+ "27c31780e6c24813a0450aa33605b81f",
+ "c9c22815392c47b3860786ad891445ad",
+ "0f3d7cc7013b4b3aa9ed394876004f7e",
+ "a0dc1d554f2c43869d3f399501d9c689",
+ "713536b27c57438c8e4d193fbc5c4f4c",
+ "8fcd7e3260f2445cbe6d082ed56f0a9d"
+ ]
+ },
+ "id": "l_l0qa-qPCqC",
+ "outputId": "26c363e0-e480-4fbc-a258-e99ed73b1eab"
+ },
+ "outputs": [],
+ "source": "# we will use batches of 64\nbatch_size = 64\n\nfor i in tqdm(range(0, len(audios), batch_size)):\n    # find the end of the batch\n    i_end = min(i + batch_size, len(audios))\n    # extract the batch\n    batch = audios[i:i_end]\n    # generate embeddings for all the clips in the batch\n    _, emb = model.inference(batch)\n    # create unique IDs\n    ids = [f\"{idx}\" for idx in range(i, i_end)]\n    # add all to the upsert list\n    to_upsert = list(zip(ids, emb.tolist()))\n    # upsert these records to pinecone\n    _ = index.upsert(vectors=to_upsert)\n\n# check that we have all vectors in the index\nindex.describe_index_stats()"
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "LG-Wg970DKKh"
+ },
+ "source": [
+ "We now have *2000* audio records indexed in Pinecone, so we're ready to begin querying."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "x3BvGOl6TI_Z"
+ },
+ "source": [
+ "# Querying"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "e3UOXzJ-DZvo"
+ },
+ "source": [
+ "Let's first listen to a recording from our dataset. We will generate an embedding for it and use that to find similar recordings in the Pinecone index."
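+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": "If you would like to browse what kinds of sounds you can query with before picking a clip, here is a quick optional look at the label set (using the `data` object loaded earlier):"
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": "# optional: list the ESC-50 category labels to choose a query clip from\ncategories = sorted(set(data[\"category\"]))\nprint(len(categories), \"categories\")\nprint(categories[:10])"
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Here we will go with clip number 400:"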
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 93
+ },
+ "id": "bwEETtYuDYEZ",
+ "outputId": "f556e117-f471-4c4d-bbf7-64428b16bf30"
+ },
+ "outputs": [],
+ "source": "# we set an audio number to select from the dataset\naudio_num = 400\n# get the audio data for that clip\nquery_audio = data[audio_num][\"audio\"][\"array\"]\n# get the category of the clip\ncategory = data[audio_num][\"category\"]\n# print the category and play the audio\nprint(\"Query Audio:\", category)\nAudio(query_audio, rate=44100)"
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "HPxbFSqwFE9R"
+ },
+ "source": [
+ "We've got the sound of a car horn. Let's generate an embedding for this sound."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "jD1qPLRjFzG0",
+ "outputId": "948ac450-28ea-4d8c-c99e-4bbaa5d8ef50"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1, 2048)"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# reshape query audio to a batch of one\n",
+ "query_audio = query_audio[None, :]\n",
+ "# get the embedding for the audio from the model\n",
+ "_, xq = model.inference(query_audio)\n",
+ "xq.shape"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "42V_53GcGaKB"
+ },
+ "source": [
+ "We have now converted the audio into a 2048-dimensional vector, the same way we did for all the other audio we indexed. Let's use this to query our Pinecone index."
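+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": "The model returns a batch of embeddings, so `xq` has shape `(1, 2048)` and `xq[0]` is the actual query vector. As a quick optional look at what we are about to send to Pinecone:"
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": "# optional: inspect the query vector before sending it to pinecone\nquery_vector = xq[0].tolist()  # a plain list of 2048 floats\nprint(len(query_vector))\nprint(query_vector[:5])  # first few components"
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now we run the query, asking for the three nearest neighbors:"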
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "65-In6BEGUw6", + "outputId": "b2550582-38a7-4067-f1a7-fa9e956b0cab" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'matches': [{'id': '400', 'score': 1.0, 'values': []},\n", + " {'id': '1667', 'score': 0.842124522, 'values': []},\n", + " {'id': '1666', 'score': 0.831768811, 'values': []}],\n", + " 'namespace': ''}" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# query pinecone index with the query audio embeddings\n", + "results = index.query(vector=xq.tolist(), top_k=3)\n", + "results" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "q5zJCjxpITgH" + }, + "source": [ + "Notice that the top result is the audio number 400 from our dataset, which is our query audio (the most similar item should always be the query itself). Let's listen to the top three results. " + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 193 + }, + "id": "mVk2BGiVI1_a", + "outputId": "0460c9b4-0496-4029-f724-0a442b3bbb6b" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " ], - "source": [ - "from datasets import load_dataset\n", - "\n", - "# load the dataset from huggingface model hub\n", - "data = load_dataset(\"ashraq/esc50\", split=\"train\")\n", - "data" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "QeXLy-dFvIL7" - }, - "source": [ - "The audios in the dataset are sampled at 44100Hz and loaded into NumPy arrays. Let's take a look." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Hcm6KZDsvFZd", - "outputId": "5856454a-03ed-4f9b-cf6d-cabe40e1f39e" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'path': None,\n", - " 'array': array([0., 0., 0., ..., 0., 0., 0.]),\n", - " 'sampling_rate': 44100},\n", - " {'path': None,\n", - " 'array': array([-0.01184082, -0.10336304, -0.14141846, ..., 0.06985474,\n", - " 0.04049683, 0.00274658]),\n", - " 'sampling_rate': 44100},\n", - " {'path': None,\n", - " 'array': array([-0.00695801, -0.01251221, -0.01126099, ..., 0.215271 ,\n", - " -0.00875854, -0.28903198]),\n", - " 'sampling_rate': 44100}]" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " ], - "source": [ - "# select the audio feature and display top three\n", - "audios = data[\"audio\"]\n", - "audios[:3]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "QX94cPW8AtQl" - }, - "source": [ - "We only need the Numpy arrays as these contain all of the audio data. We will later input these Numpy arrays directly into our embedding model to generate audio embeddings." 
- ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "id": "jDrQhpJ5Az0G" - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "\n", - "# select only the audio data from the dataset and store in a numpy array\n", - "audios = np.array([a[\"array\"] for a in data[\"audio\"]])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "QAhNFeEWNb9_" - }, - "source": [ - "# Load Audio Embedding Model" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "HLI4j7798AT9" - }, - "source": [ - "We will use an audio tagging model trained from *PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition* paper to generate our audio embeddings. We use the *panns_inference* Python package, which provides an easy interface to load and use the model." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "PMwze3wAI4Vg", - "outputId": "60e87993-20c1-45b3-f1c9-b2d78e9b6d5e" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Checkpoint path: /root/panns_data/Cnn14_mAP=0.431.pth\n", - "GPU number: 1\n" - ] - } + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " ], - "source": [ - "from panns_inference import AudioTagging\n", - "\n", - "# load the default model into the gpu.\n", - "model = AudioTagging(checkpoint_path=None, device='cuda') # change device to cpu if a gpu is not available" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Initializing the Index\n", - "\n", - "Now we need a place to store these embeddings and enable a efficient vector search through them all. To do that we use Pinecone, we can get a [free API key](https://app.pinecone.io/) and enter it below where we will initialize our connection to Pinecone and create a new index." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "from pinecone import Pinecone\n", - "\n", - "# initialize connection to pinecone (get API key at app.pinecone.io)\n", - "api_key = os.environ.get('PINECONE_API_KEY') or 'PINECONE_API_KEY'\n", - "\n", - "# configure client\n", - "pc = Pinecone(api_key=api_key)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we setup our index specification, this allows us to define the cloud provider and region where we want to deploy our index. You can find a list of all [available providers and regions here](https://docs.pinecone.io/docs/projects)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pinecone import ServerlessSpec\n", - "\n", - "cloud = os.environ.get('PINECONE_CLOUD') or 'aws'\n", - "region = os.environ.get('PINECONE_REGION') or 'us-east-1'\n", - "\n", - "spec = ServerlessSpec(cloud=cloud, region=region)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create the index:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "index_name = \"audio-search-demo\"" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "id": "3b_pV4GGNXJb" - }, - "outputs": [], - "source": [ - "import time\n", - "\n", - "# check if index already exists (it shouldn't if this is first time)\n", - "if index_name not in pc.list_indexes().names():\n", - " # if does not exist, create index\n", - " pc.create_index(\n", - " index_name,\n", - " dimension=2048,\n", - " metric='cosine',\n", - " spec=spec\n", - " )\n", - " # wait for index to be initialized\n", - " while not pc.describe_index(index_name).status['ready']:\n", - " time.sleep(1)\n", - "\n", - "# connect to index\n", - "index = pc.Index(index_name)\n", - "# view index stats\n", - "index.describe_index_stats()" + "text/plain": [ + "" ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4AhWDbI3NiIi" - }, - "source": [ - "# Generate Embeddings and Upsert\n", - "\n", - "Now we generate the embeddings using the audio embedding model. We must do this in batches as processing all items at once will exhaust machine memory limits and API request limits." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 118, - "referenced_widgets": [ - "6a9f8ff091324f1581297fc28c713047", - "e18fc6710ff341d6b80cb93cd6dc5e19", - "0b927de4df4b4f61bbb10721007fb4f8", - "74ea1cff7a2d44d08328a714e2cc384f", - "5890090cbebe417dbac772b18440eaa5", - "27c31780e6c24813a0450aa33605b81f", - "c9c22815392c47b3860786ad891445ad", - "0f3d7cc7013b4b3aa9ed394876004f7e", - "a0dc1d554f2c43869d3f399501d9c689", - "713536b27c57438c8e4d193fbc5c4f4c", - "8fcd7e3260f2445cbe6d082ed56f0a9d" - ] - }, - "id": "l_l0qa-qPCqC", - "outputId": "26c363e0-e480-4fbc-a258-e99ed73b1eab" - }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "6a9f8ff091324f1581297fc28c713047", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/32 [00:00\n", + " \n", + " Your browser does not support the audio element.\n", + " \n", + " " ], - "source": [ - "from tqdm.auto import tqdm\n", - "\n", - "# we will use batches of 64\n", - "batch_size = 64\n", - "\n", - "for i in tqdm(range(0, len(audios), batch_size)):\n", - " # find end of batch\n", - " i_end = min(i+batch_size, len(audios))\n", - " # extract batch\n", - " batch = audios[i:i_end]\n", - " # generate embeddings for all the audios in the batch\n", - " _, emb = model.inference(batch)\n", - " # create unique IDs\n", - " ids = [f\"{idx}\" for idx in range(i, i_end)]\n", - " # add all to upsert list\n", - " to_upsert = list(zip(ids, emb.tolist()))\n", - " # upsert/insert these records to pinecone\n", - " _ = index.upsert(vectors=to_upsert)\n", - "\n", - "# check that we have all vectors in index\n", - "index.describe_index_stats()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "LG-Wg970DKKh" - }, - "source": [ - "We now have *2000* audio records indexed in 
Pinecone, we're ready to begin querying." + "text/plain": [ + "" ] + }, + "metadata": {}, + "output_type": "display_data" }, { - "cell_type": "markdown", - "metadata": { - "id": "x3BvGOl6TI_Z" - }, - "source": [ - "# Querying" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Result:\n" + ] }, { - "cell_type": "markdown", - "metadata": { - "id": "e3UOXzJ-DZvo" - }, - "source": [ - "Let's first listen to an audio from our dataset. We will generate embeddings for the audio and use it to find similar audios from the Pinecone index." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 93 - }, - "id": "bwEETtYuDYEZ", - "outputId": "f556e117-f471-4c4d-bbf7-64428b16bf30" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Query Audio: car_horn\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } + "data": { + "text/html": [ + "\n", + " \n", + " " ], - "source": [ - "from IPython.display import Audio, display\n", - "\n", - "# we set an audio number to select from the dataset\n", - "audio_num = 400\n", - "# get the audio data of the audio number\n", - "query_audio = data[audio_num][\"audio\"][\"array\"]\n", - "# get the category of the audio number\n", - "category = data[audio_num][\"category\"]\n", - "# print the category and play the audio\n", - "print(\"Query Audio:\", category)\n", - "Audio(query_audio, rate=44100)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "HPxbFSqwFE9R" - }, - "source": [ - "We have got the sound of a car horn. Let's generate an embedding for this sound." - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "jD1qPLRjFzG0", - "outputId": "948ac450-28ea-4d8c-c99e-4bbaa5d8ef50" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "(1, 2048)" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " ], - "source": [ - "# reshape query audio\n", - "query_audio = query_audio[None, :]\n", - "# get the embeddings for the audio from the model\n", - "_, xq = model.inference(query_audio)\n", - "xq.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "42V_53GcGaKB" - }, - "source": [ - "We have now converted the audio into a 2048-dimension vector the same way we did for all the other audio we indexed. Let's use this to query our Pinecone index." 
- ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "65-In6BEGUw6", - "outputId": "b2550582-38a7-4067-f1a7-fa9e956b0cab" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'matches': [{'id': '400', 'score': 1.0, 'values': []},\n", - " {'id': '1667', 'score': 0.842124522, 'values': []},\n", - " {'id': '1666', 'score': 0.831768811, 'values': []}],\n", - " 'namespace': ''}" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " ], - "source": [ - "# query pinecone index with the query audio embeddings\n", - "results = index.query(vector=xq.tolist(), top_k=3)\n", - "results" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "q5zJCjxpITgH" - }, - "source": [ - "Notice that the top result is the audio number 400 from our dataset, which is our query audio (the most similar item should always be the query itself). Let's listen to the top three results. " - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 193 - }, - "id": "mVk2BGiVI1_a", - "outputId": "0460c9b4-0496-4029-f724-0a442b3bbb6b" - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " ], - "source": [ - "# play the top 3 similar audios\n", - "for r in results[\"matches\"]:\n", - " # select the audio data from the databse using the id as an index\n", - " a = data[int(r[\"id\"])][\"audio\"][\"array\"]\n", - " display(Audio(a, rate=44100))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tKo2HJp1JwGW" - }, - "source": [ - "We have great results, everything aligns with what seems to be a busy city street with car horns.\n", - "\n", - "Let's write a helper function to run the queries using audio from our dataset easily. We do not need to embed these audio samples again as we have already, they are just stored in Pinecone. So, we specify the `id` of the query audio to search with and tell Pinecone to search with that." 
- ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "id": "Gshw-pX8Wbrn" - }, - "outputs": [], - "source": [ - "def find_similar_audios(id):\n", - " print(\"Query Audio:\")\n", - " # select the audio data from the databse using the id as an index\n", - " query_audio = data[id][\"audio\"][\"array\"]\n", - " # play the query audio\n", - " display(Audio(query_audio, rate=44100))\n", - " # query pinecone index with the query audio id\n", - " result = index.query(id=str(id), top_k=5)\n", - " print(\"Result:\")\n", - " # play the top 5 similar audios\n", - " for r in result[\"matches\"]:\n", - " a = data[int(r[\"id\"])][\"audio\"][\"array\"]\n", - " display(Audio(a, rate=44100))" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 403 - }, - "id": "1Pxkj0JnQgt7", - "outputId": "752494e7-9354-4007-d603-22d7119b45d3" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Query Audio:\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Result:\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " ], - "source": [ - "find_similar_audios(1642)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here we return a set of revving motors (they seem to either be vehicles or lawnmowers)." 
+ "text/plain": [ + "" ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 403 - }, - "id": "jVTpCS-nbCEw", - "outputId": "39d6a32a-ee6e-4d74-c038-b97af0e3829d" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Query Audio:\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Result:\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "find_similar_audios(1642)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here we return a set of revving motors (they seem to either be vehicles or lawnmowers)." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 403 + }, + "id": "jVTpCS-nbCEw", + "outputId": "39d6a32a-ee6e-4d74-c038-b97af0e3829d" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Query Audio:\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " ], - "source": [ - "find_similar_audios(452)" + "text/plain": [ + "" ] + }, + "metadata": {}, + "output_type": "display_data" }, { - "cell_type": "markdown", - "metadata": { - "id": "WiL-cw_LRvXy" - }, - "source": [ - "And now a more relaxing set of birds chirping in nature.\n", - "\n", - "Let's use another audio sample from elsewhere (eg not this dataset) and see how the search performs with this." - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Result:\n" + ] }, { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "btsH4EOkbCHe", - "outputId": "8d40a7cf-171f-45a6-9953-2ecdbfaf37af" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--2022-09-25 20:47:00-- https://storage.googleapis.com/audioset/miaow_16k.wav\n", - "Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.20.128, 108.177.98.128, 74.125.197.128, ...\n", - "Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.20.128|:443... connected.\n", - "HTTP request sent, awaiting response... 
200 OK\n", - "Length: 215546 (210K) [audio/x-wav]\n", - "Saving to: \u2018miaow_16k.wav.1\u2019\n", - "\n", - "\rmiaow_16k.wav.1 0%[ ] 0 --.-KB/s \rmiaow_16k.wav.1 100%[===================>] 210.49K --.-KB/s in 0.004s \n", - "\n", - "2022-09-25 20:47:00 (54.1 MB/s) - \u2018miaow_16k.wav.1\u2019 saved [215546/215546]\n", - "\n" - ] - } + "data": { + "text/html": [ + "\n", + " \n", + " " ], - "source": [ - "!wget https://storage.googleapis.com/audioset/miaow_16k.wav" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JJRZUWCDR73v" - }, - "source": [ - "We can load the audio into a Numpy array as follows:" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 76 - }, - "id": "JZHilxkISLJb", - "outputId": "80aad9ff-b33d-4522-ed98-23b1bf19559f" - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " ], - "source": [ - "import librosa\n", - "\n", - "a, _ = librosa.load(\"miaow_16k.wav\", sr=44100)\n", - "Audio(a, rate=44100)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "mKLknBWQV3Jm" - }, - "source": [ - "Now we generate the embeddings for this audio and query the Pinecone index." - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 193 - }, - "id": "vWKRAB5rSy9m", - "outputId": "dcc856fb-4e90-46ee-f980-d46766556726" - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " ], - "source": [ - "# reshape query audio\n", - "query_audio = a[None, :]\n", - "# get the embeddings for the audio from the model\n", - "_, xq = model.inference(query_audio)\n", - "\n", - "# query pinecone index with the query audio embeddings\n", - "results = index.query(vector=xq.tolist(), top_k=3)\n", - "\n", - "# play the top 3 similar audios\n", - "for r in results[\"matches\"]:\n", - " a = data[int(r[\"id\"])][\"audio\"][\"array\"]\n", - " display(Audio(a, rate=44100))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "g2Ex_gk8Vmee" - }, - "source": [ - "Our audio search application has identified a set of similar cat sounds, which is excellent." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_uNGu-zUTtW6" - }, - "source": [ - "# Delete the Index" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ltMgxnuDTvp6" - }, - "source": [ - "Delete the index once you are sure that you do not want to use it anymore. Once the index is deleted, you cannot use it again." 
- ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "id": "voohGLfrUxHf" - }, - "outputs": [], - "source": [ - "pc.delete_index(index_name)" + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" ] + }, + "metadata": {}, + "output_type": "display_data" } - ], - "metadata": { - "accelerator": "GPU", + ], + "source": [ + "find_similar_audios(452)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WiL-cw_LRvXy" + }, + "source": [ + "And now a more relaxing set of birds chirping in nature.\n", + "\n", + "Let's use another audio sample from elsewhere (eg not this dataset) and see how the search performs with this." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { "colab": { - "collapsed_sections": [], - "provenance": [] + "base_uri": "https://localhost:8080/" }, - "kernelspec": { - "display_name": "Python 3.9.13 64-bit", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.10.7 (main, Sep 14 2022, 22:38:23) [Clang 14.0.0 (clang-1400.0.29.102)]" - }, - "vscode": { - "interpreter": { - "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e" - } + "id": "btsH4EOkbCHe", + "outputId": "8d40a7cf-171f-45a6-9953-2ecdbfaf37af" + }, + "outputs": [], + "source": "# Download the audio file\nurl = \"https://storage.googleapis.com/audioset/miaow_16k.wav\"\nfilename = \"miaow_16k.wav\"\nurllib.request.urlretrieve(url, filename)\nprint(f\"Downloaded {filename}\")" + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JJRZUWCDR73v" + }, + "source": [ + "We can load the audio into a Numpy array as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 76 + }, + "id": "JZHilxkISLJb", + "outputId": "80aad9ff-b33d-4522-ed98-23b1bf19559f" + }, + "outputs": [], + "source": "a, _ = librosa.load(\"miaow_16k.wav\", sr=44100)\nAudio(a, rate=44100)" + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mKLknBWQV3Jm" + }, + "source": [ + "Now we generate the embeddings for this audio and query the Pinecone index." 
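+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": "Note that the downloaded file is a 16 kHz recording (hence the `miaow_16k.wav` name), and `librosa.load(..., sr=44100)` resampled it to 44,100 Hz so it matches the rest of our pipeline. A quick optional look at the loaded array:"
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": "# optional: inspect the loaded array; librosa resampled it to 44.1 kHz\nprint(a.shape, a.dtype)\nprint(f\"duration: {len(a) / 44100:.2f} s\")"
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "With the audio loaded, we embed it and query the index:"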
+ ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 193 + }, + "id": "vWKRAB5rSy9m", + "outputId": "dcc856fb-4e90-46ee-f980-d46766556726" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } + ], + "source": [ + "# reshape query audio\n", + "query_audio = a[None, :]\n", + "# get the embeddings for the audio from the model\n", + "_, xq = model.inference(query_audio)\n", + "\n", + "# query pinecone index with the query audio embeddings\n", + "results = index.query(vector=xq.tolist(), top_k=3)\n", + "\n", + "# play the top 3 similar audios\n", + "for r in results[\"matches\"]:\n", + " a = data[int(r[\"id\"])][\"audio\"][\"array\"]\n", + " display(Audio(a, rate=44100))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "g2Ex_gk8Vmee" + }, + "source": [ + "Our audio search application has identified a set of similar cat sounds, which is excellent." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_uNGu-zUTtW6" + }, + "source": [ + "# Delete the Index" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ltMgxnuDTvp6" + }, + "source": [ + "Delete the index once you are sure that you do not want to use it anymore. Once the index is deleted, you cannot use it again." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "id": "voohGLfrUxHf" + }, + "outputs": [], + "source": [ + "pc.delete_index(index_name)" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3.9.13 64-bit", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.7 (main, Sep 14 2022, 22:38:23) [Clang 14.0.0 (clang-1400.0.29.102)]" }, - "nbformat": 4, - "nbformat_minor": 0 + "vscode": { + "interpreter": { + "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e" + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 } \ No newline at end of file