From 51af2ecee85e132fe54529788a4e563f01e72450 Mon Sep 17 00:00:00 2001
From: Jen Hamon
Date: Thu, 29 Jan 2026 11:05:25 -0500
Subject: [PATCH] fix(learn): modernize semantic-search.ipynb to Pinecone SDK v8

---
 .../semantic-search/semantic-search.ipynb | 228 ++++++++----------
 1 file changed, 104 insertions(+), 124 deletions(-)

diff --git a/learn/search/semantic-search/semantic-search.ipynb b/learn/search/semantic-search/semantic-search.ipynb
index 798c15b5..f2af6a89 100644
--- a/learn/search/semantic-search/semantic-search.ipynb
+++ b/learn/search/semantic-search/semantic-search.ipynb
@@ -17,18 +17,35 @@
 },
 {
 "cell_type": "code",
- "execution_count": null,
 "metadata": {
 "id": "q03L1BYEZQfe"
 },
- "outputs": [],
 "source": [
- "!pip install -qU \\\n",
- " \"pinecone-client[grpc]\"==5.0.1 \\\n",
- " datasets==2.14.6 \\\n",
- " sentence-transformers==2.2.2 \\\n",
- " pinecone-plugin-inference"
- ]
+ "!pip install -qU \"pinecone>=8.0.0\" datasets==2.14.6 sentence-transformers==2.2.2 \"pinecone-plugin-inference>=1.0.0\"\n",
+ "\n",
+ "import os\n",
+ "import time\n",
+ "from getpass import getpass\n",
+ "\n",
+ "from datasets import load_dataset\n",
+ "from pinecone import Pinecone, ServerlessSpec\n",
+ "from tqdm.auto import tqdm\n",
+ "\n",
+ "api_key = os.environ.get(\"PINECONE_API_KEY\") or getpass(\"Enter your Pinecone API key: \")\n",
+ "pc = Pinecone(api_key=api_key)\n",
+ "model_name = \"multilingual-e5-large\""
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {},
+ "source": [
+ "# Setup complete. Connection and model name are ready for the rest of the notebook."
+ ],
+ "execution_count": null,
+ "outputs": []
 },
 {
 "cell_type": "markdown",
@@ -74,17 +91,15 @@
 },
 {
 "cell_type": "code",
- "execution_count": null,
 "metadata": {
 "id": "IeJPWu9P7EtR"
 },
- "outputs": [],
 "source": [
- "from datasets import load_dataset\n",
- "\n",
- "dataset = load_dataset('quora', split='train[240000:290000]')\n",
+ "dataset = load_dataset(\"quora\", split=\"train[240000:290000]\")\n",
 "dataset"
- ]
+ ],
+ "execution_count": null,
+ "outputs": []
 },
 {
 "cell_type": "markdown",
@@ -97,14 +112,14 @@
 },
 {
 "cell_type": "code",
- "execution_count": null,
 "metadata": {
 "id": "CsA67WpW7El4"
 },
- "outputs": [],
 "source": [
 "dataset[:5]"
- ]
+ ],
+ "execution_count": null,
+ "outputs": []
 },
 {
 "cell_type": "markdown",
@@ -117,22 +132,22 @@
 },
 {
 "cell_type": "code",
- "execution_count": null,
 "metadata": {
 "id": "heGUpy_37Eis"
 },
- "outputs": [],
 "source": [
 "questions = []\n",
 "\n",
- "for record in dataset['questions']:\n",
- " questions.extend(record['text'])\n",
+ "for record in dataset[\"questions\"]:\n",
+ " questions.extend(record[\"text\"])\n",
 "\n",
 "# remove duplicates\n",
 "questions = list(set(questions))\n",
- "print('\\n'.join(questions[:5]))\n",
+ "print(\"\\n\".join(questions[:5]))\n",
 "print(len(questions))"
- ]
+ ],
+ "execution_count": null,
+ "outputs": []
 },
 {
 "cell_type": "markdown",
@@ -149,32 +164,21 @@
 },
 {
 "cell_type": "code",
- "execution_count": null,
 "metadata": {
 "id": "uxcGjb9GSEqA"
 },
- "outputs": [],
 "source": [
- "import os\n",
- "from pinecone import Pinecone\n",
- "\n",
- "api_key = os.environ.get('PINECONE_API_KEY') or 'PINECONE_API_KEY'\n",
- "pc = Pinecone(api_key=api_key)\n",
- "\n",
- "model_name = \"multilingual-e5-large\"\n",
- "\n",
- "query = 'which city is the most populated in the world?'\n",
+ "query = \"which city is the most populated in the world?\"\n",
 "\n",
 "xq = pc.inference.embed(\n",
 " model=model_name,\n",
 " inputs=[query],\n",
- " parameters={\n",
- " \"input_type\": \"query\",\n",
- " \"truncate\": \"END\"\n",
- " }\n",
- ").data[0]['values']\n",
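+ " # \"input_type\" tells the model whether it is embedding a search query or a stored passage\n",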
" parameters={\n", - " \"input_type\": \"query\",\n", - " \"truncate\": \"END\"\n", - " }\n", - ").data[0]['values']\n", + " parameters={\"input_type\": \"query\", \"truncate\": \"END\"},\n", + ").data[0][\"values\"]\n", "xq[:5]" - ] + ], + "execution_count": null, + "outputs": [] }, { "cell_type": "markdown", @@ -195,27 +199,24 @@ }, { "cell_type": "code", - "execution_count": null, "metadata": { "id": "Z355VLWFqumq" }, - "outputs": [], "source": [ - "query = 'which city is the most populated in the world?'\n", + "query = \"which city is the most populated in the world?\"\n", "\n", "# create the query vector\n", "xq = pc.inference.embed(\n", " model=model_name,\n", " inputs=[query],\n", - " parameters={\n", - " \"input_type\": \"query\",\n", - " \"truncate\": \"END\"\n", - " }\n", - ").data[0]['values']\n", + " parameters={\"input_type\": \"query\", \"truncate\": \"END\"},\n", + ").data[0][\"values\"]\n", "\n", "# Print the first 5 vectors as a sanity check\n", "xq[:5]" - ] + ], + "execution_count": null, + "outputs": [] }, { "cell_type": "markdown", @@ -230,17 +231,17 @@ }, { "cell_type": "code", - "execution_count": null, "metadata": { "id": "T38HdqxwVg6p" }, - "outputs": [], "source": [ - "_id = '0'\n", - "metadata = {'text': query}\n", + "_id = \"0\"\n", + "metadata = {\"text\": query}\n", "\n", "vectors = [(_id, xq, metadata)]" - ] + ], + "execution_count": null, + "outputs": [] }, { "cell_type": "markdown", @@ -266,20 +267,14 @@ }, { "cell_type": "code", - "execution_count": null, "metadata": { "id": "mc66NEBAcQHY" }, - "outputs": [], "source": [ - "import os\n", - "\n", - "# initialize connection to pinecone (get API key at app.pinecone.io)\n", - "api_key = os.environ.get('PINECONE_API_KEY') or 'PINECONE_API_KEY'\n", - "\n", - "# configure client\n", - "pc = Pinecone(api_key=api_key)" - ] + "# Pinecone connection already established above" + ], + "execution_count": null, + "outputs": [] }, { "cell_type": "markdown", @@ -287,24 +282,22 @@ "id": "hwELn6nBjoaE" }, "source": [ - "Now we setup our index specification, this allows us to define the cloud provider and region where we want to deploy our index. You can find a list of all [available providers and regions here](https://docs.pinecone.io/docs/projects)." + "Now we setup our index specification, this allows us to define the cloud provider and region where we want to deploy our index. You can find a list of all [available providers and regions here](https://docs.pinecone.io/guides/projects/understanding-projects)." 
 ]
 },
 {
 "cell_type": "code",
- "execution_count": null,
 "metadata": {
 "id": "l39IjK9xjoaE"
 },
- "outputs": [],
 "source": [
- "from pinecone import ServerlessSpec\n",
- "\n",
- "cloud = os.environ.get('PINECONE_CLOUD') or 'aws'\n",
- "region = os.environ.get('PINECONE_REGION') or 'us-east-1'\n",
+ "cloud = os.environ.get(\"PINECONE_CLOUD\") or \"aws\"\n",
+ "region = os.environ.get(\"PINECONE_REGION\") or \"us-east-1\"\n",
 "\n",
 "spec = ServerlessSpec(cloud=cloud, region=region)"
- ]
+ ],
+ "execution_count": null,
+ "outputs": []
 },
 {
 "cell_type": "markdown",
@@ -317,46 +310,44 @@
 },
 {
 "cell_type": "code",
- "execution_count": null,
 "metadata": {
 "id": "nzhBBX2PjoaF",
 "tags": [
 "parameters"
 ]
 },
- "outputs": [],
 "source": [
- "index_name = 'semantic-search'"
- ]
+ "index_name = \"semantic-search\""
+ ],
+ "execution_count": null,
+ "outputs": []
 },
 {
 "cell_type": "code",
- "execution_count": null,
 "metadata": {
 "id": "vucJh8sgjoaF"
 },
- "outputs": [],
 "source": [
- "import time\n",
- "\n",
 "# check if index already exists (it shouldn't if this is first time)\n",
 "if index_name not in pc.list_indexes().names():\n",
 " # if does not exist, create index\n",
 " pc.create_index(\n",
- " index_name,\n",
+ " name=index_name,\n",
 " dimension=1024,\n",
- " metric='cosine',\n",
- " spec=spec\n",
+ " metric=\"cosine\",\n",
+ " spec=spec,\n",
 " )\n",
 " # wait for index to be initialized\n",
- " while not pc.describe_index(index_name).status['ready']:\n",
+ " while not pc.describe_index(index_name).status[\"ready\"]:\n",
 " time.sleep(1)\n",
 "\n",
 "# connect to index\n",
 "index = pc.Index(index_name)\n",
 "# view index stats\n",
 "index.describe_index_stats()"
- ]
+ ],
+ "execution_count": null,
+ "outputs": []
 },
 {
 "cell_type": "markdown",
@@ -366,19 +357,15 @@
 },
 "source": [
- "Now we upsert the data, we will do this in batches of `128`.\n",
+ "Now we upsert the data in batches of `96`.\n",
 "\n",
- "_**Note:** On Google Colab with GPU expected runtime is ~7 minutes. If using CPU this will be significantly longer. If you'd like to get this running faster refer to the [fast notebook](https://github.com/pinecone-io/examples/blob/master/docs/semantic-search.ipynb)._"
+ "_**Note:** On Google Colab with a GPU, expected runtime is ~7 minutes; on CPU it will be significantly longer. If you'd like to get this running faster, refer to the [fast notebook](https://github.com/pinecone-io/examples/blob/master/learn/search/semantic-search/semantic-search-fast.ipynb)._"
 ]
 },
@@ -386,26 +373,25 @@
 {
 "cell_type": "code",
- "execution_count": null,
 "metadata": {
 "id": "7o6rdgU8olqd"
 },
- "outputs": [],
 "source": [
- "from tqdm.auto import tqdm\n",
- "\n",
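+ "# NOTE: the hosted embedding model currently accepts at most 96 inputs per request, hence batch_size = 96\n",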
 "batch_size = 96\n",
 "vector_limit = 100000\n",
 "\n",
 "questions = questions[:vector_limit]\n",
 "\n",
 "for i in tqdm(range(0, len(questions), batch_size)):\n",
 " # find end of batch\n",
- " i_end = min(i+batch_size, len(questions))\n",
+ " i_end = min(i + batch_size, len(questions))\n",
 " # create IDs batch\n",
 " ids = [str(x) for x in range(i, i_end)]\n",
 " # create metadata batch\n",
- " metadatas = [{'text': text} for text in questions[i:i_end]]\n",
+ " metadatas = [{\"text\": text} for text in questions[i:i_end]]\n",
 " # create embeddings\n",
 " xc = pc.inference.embed(\n",
 " model=model_name,\n",
 " inputs=questions[i:i_end],\n",
- " parameters={\n",
- " \"input_type\": \"passage\",\n",
- " \"truncate\": \"END\"\n",
- " }\n",
+ " parameters={\"input_type\": \"passage\", \"truncate\": \"END\"},\n",
 " ).data\n",
- " embeddings = [item['values'] for item in xc]\n",
+ " embeddings = [item[\"values\"] for item in xc]\n",
 " # create records list for upsert\n",
 " records = zip(ids, embeddings, metadatas)\n",
 " # upsert to Pinecone\n",
 " index.upsert(vectors=records)"
- ]
+ ],
+ "execution_count": null,
+ "outputs": []
 },
 {
 "cell_type": "markdown",
@@ -427,11 +413,9 @@
 },
 {
 "cell_type": "code",
- "execution_count": null,
 "metadata": {
 "id": "SNa4WHoJ0COt"
 },
- "outputs": [],
 "source": [
 "query = \"which city has the largest population in the world?\"\n",
 "\n",
@@ -439,16 +423,15 @@
 "# create the query vector\n",
 "xq = pc.inference.embed(\n",
 " model=model_name,\n",
 " inputs=[query],\n",
- " parameters={\n",
- " \"input_type\": \"query\",\n",
- " \"truncate\": \"END\"\n",
- " }\n",
- ").data[0]['values']\n",
+ " parameters={\"input_type\": \"query\", \"truncate\": \"END\"},\n",
+ ").data[0][\"values\"]\n",
 "\n",
 "# now query\n",
 "xc = index.query(vector=xq, top_k=5, include_metadata=True)\n",
 "xc"
- ]
+ ],
+ "execution_count": null,
+ "outputs": []
 },
 {
 "cell_type": "markdown",
@@ -461,15 +444,15 @@
 },
 {
 "cell_type": "code",
- "execution_count": null,
 "metadata": {
 "id": "gy7isg_f-vWg"
 },
- "outputs": [],
 "source": [
- "for result in xc['matches']:\n",
+ "for result in xc[\"matches\"]:\n",
 " print(f\"{round(result['score'], 2)}: {result['metadata']['text']}\")"
- ]
+ ],
+ "execution_count": null,
+ "outputs": []
 },
 {
 "cell_type": "markdown",
@@ -482,11 +465,9 @@
 },
 {
 "cell_type": "code",
- "execution_count": null,
 "metadata": {
 "id": "dJbjE-iq_yMr"
 },
- "outputs": [],
 "source": [
 "query = \"which metropolis has the highest number of people?\"\n",
 "\n",
@@ -494,17 +475,16 @@
 "# create the query vector\n",
 "xq = pc.inference.embed(\n",
 " model=model_name,\n",
 " inputs=[query],\n",
- " parameters={\n",
- " \"input_type\": \"query\",\n",
- " \"truncate\": \"END\"\n",
- " }\n",
- ").data[0]['values']\n",
+ " parameters={\"input_type\": \"query\", \"truncate\": \"END\"},\n",
+ ").data[0][\"values\"]\n",
 "\n",
 "# now query\n",
 "xc = index.query(vector=xq, top_k=5, include_metadata=True)\n",
- "for result in xc['matches']:\n",
+ "for result in xc[\"matches\"]:\n",
 " print(f\"{round(result['score'], 2)}: {result['metadata']['text']}\")"
- ]
+ ],
+ "execution_count": null,
+ "outputs": []
 },
 {
 "cell_type": "markdown",
@@ -521,14 +501,14 @@
 },
 {
 "cell_type": "code",
- "execution_count": null,
 "metadata": {
 "id": "-cWdeKzhAtww"
 },
- "outputs": [],
 "source": [
- "pc.delete_index(index_name)"
- ]
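+ "# Deleting the index is permanent and removes all of its vectors\n",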
"pc.delete_index(index_name)" - ] + "pc.delete_index(name=index_name)" + ], + "execution_count": null, + "outputs": [] }, { "cell_type": "markdown", @@ -558,4 +538,4 @@ }, "nbformat": 4, "nbformat_minor": 0 -} +} \ No newline at end of file