diff --git a/docs/pinecone-reranker.ipynb b/docs/pinecone-reranker.ipynb index f6d44ffb..1637fde8 100644 --- a/docs/pinecone-reranker.ipynb +++ b/docs/pinecone-reranker.ipynb @@ -54,7 +54,7 @@ "10. **Rerank Results**\n", "\n", "\n", - "The main dataset we will be using consists of randomly generated doctor’s notes sample data. The original JSON data has been embedded into vectors, which we will load into Pinecone.\n" + "The main dataset we will be using consists of randomly generated doctor\u2019s notes sample data. The original JSON data has been embedded into vectors, which we will load into Pinecone.\n" ] }, { @@ -66,8 +66,8 @@ }, "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ "Requirement already satisfied: pinecone==6.0.1 in /opt/conda/lib/python3.12/site-packages (6.0.1)\n", "Requirement already satisfied: certifi>=2019.11.17 in /opt/conda/lib/python3.12/site-packages (from pinecone==6.0.1) (2025.1.31)\n", @@ -82,8 +82,7 @@ ], "source": [ "# Installation\n", - "!pip install -U pinecone==6.0.1\n", - "!pip install -U pinecone-notebooks" + "!pip install -qU pinecone==8.0.0 pinecone-notebooks" ] }, { @@ -99,6 +98,7 @@ "\n", "if not os.environ.get(\"PINECONE_API_KEY\"):\n", " from pinecone_notebooks.colab import Authenticate\n", + "\n", " Authenticate()" ] }, @@ -111,8 +111,8 @@ }, "outputs": [ { - "name": "stderr", "output_type": "stream", + "name": "stdout", "text": [ "/opt/conda/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" @@ -144,7 +144,7 @@ " \"Apple is known for its innovative products like the iPhone.\",\n", " \"Many people enjoy eating apples as a healthy snack.\",\n", " \"Apple Inc. has revolutionized the tech industry with its sleek designs and user-friendly interfaces.\",\n", - " \"An apple a day keeps the doctor away, as the saying goes.\"\n", + " \"An apple a day keeps the doctor away, as the saying goes.\",\n", "]" ] }, @@ -166,9 +166,9 @@ " 'An apple a day keeps the doctor away, as the saying goes.']" ] }, - "execution_count": 5, + "output_type": "execute_result", "metadata": {}, - "output_type": "execute_result" + "execution_count": 5 } ], "source": [ @@ -192,7 +192,7 @@ " query=query,\n", " documents=documents,\n", " top_n=3,\n", - " return_documents=True\n", + " return_documents=True,\n", ")" ] }, @@ -205,8 +205,8 @@ }, "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ "Query: Tell me about Apple's products\n", "Reranked Results:\n", @@ -225,12 +225,13 @@ "source": [ "def show_reranked_results(query, matches):\n", " \"\"\"A utility function to print our reranked results\"\"\"\n", - " print(f'Query: {query}')\n", - " print('Reranked Results:')\n", + " print(f\"Query: {query}\")\n", + " print(\"Reranked Results:\")\n", " for i, match in enumerate(matches):\n", - " print(f'{str(i+1).rjust(4)}. Score: {match.score}')\n", - " print(f' Document: {match.document.text}')\n", - " print('')\n", + " print(f\"{str(i + 1).rjust(4)}. Score: {match.score}\")\n", + " print(f\" Document: {match.document.text}\")\n", + " print(\"\")\n", + "\n", "\n", "# Note the reranker ranks Apple the company over apple the fruit based on the context of the query\n", "show_reranked_results(query, reranked_results.data)" @@ -256,8 +257,8 @@ "metadata": {}, "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ "Requirement already satisfied: pandas in /opt/conda/lib/python3.12/site-packages (2.2.3)\n", "Requirement already satisfied: torch in /opt/conda/lib/python3.12/site-packages (2.6.0)\n", @@ -306,10 +307,11 @@ "source": [ "import os\n", "import time\n", + "\n", "import pandas as pd\n", + "import torch\n", "from pinecone import Pinecone, ServerlessSpec\n", - "from transformers import AutoTokenizer, AutoModel\n", - "import torch" + "from transformers import AutoModel, AutoTokenizer" ] }, { @@ -322,14 +324,14 @@ "outputs": [], "source": [ "# Get cloud and region settings\n", - "cloud = os.getenv('PINECONE_CLOUD', 'aws')\n", - "region = os.getenv('PINECONE_REGION', 'us-east-1')\n", + "cloud = os.getenv(\"PINECONE_CLOUD\", \"aws\")\n", + "region = os.getenv(\"PINECONE_REGION\", \"us-east-1\")\n", "\n", "# Define serverless specifications\n", "spec = ServerlessSpec(cloud=cloud, region=region)\n", "\n", "# Define index name\n", - "index_name = 'pinecone-reranker'" + "index_name = \"pinecone-reranker\"" ] }, { @@ -364,9 +366,9 @@ "}" ] }, - "execution_count": 11, + "output_type": "execute_result", "metadata": {}, - "output_type": "execute_result" + "execution_count": 11 } ], "source": [ @@ -374,12 +376,7 @@ " pc.delete_index(name=index_name)\n", "\n", "# Create a new index\n", - "pc.create_index(\n", - " name=index_name, \n", - " dimension=384, \n", - " metric='cosine', \n", - " spec=spec\n", - ")" + "pc.create_index(name=index_name, dimension=384, metric=\"cosine\", spec=spec)" ] }, { @@ -477,29 +474,30 @@ "4 {'referral': 'dermatology', 'condition': 'susp... " ] }, - "execution_count": 13, + "output_type": "execute_result", "metadata": {}, - "output_type": "execute_result" + "execution_count": 13 } ], "source": [ - "import requests\n", "import tempfile\n", "\n", + "import requests\n", + "\n", "with tempfile.TemporaryDirectory() as tmpdirname:\n", " # Construct the full path for the file within the temporary directory.\n", " file_path = os.path.join(tmpdirname, \"sample_notes_data.jsonl\")\n", - " \n", + "\n", " # Download the file from github\n", " url = \"https://raw.githubusercontent.com/pinecone-io/examples/refs/heads/master/docs/data/sample_notes_data.jsonl\"\n", " response = requests.get(url)\n", - " response.raise_for_status() # Raise an exception for any HTTP errors.\n", - " \n", + " response.raise_for_status() # Raise an exception for any HTTP errors.\n", + "\n", " # Write the file content to the temporary directory.\n", " with open(file_path, \"wb\") as f:\n", " f.write(response.content)\n", "\n", - " df = pd.read_json(file_path, orient='records', lines=True)\n", + " df = pd.read_json(file_path, orient=\"records\", lines=True)\n", "\n", "# Show head of the DataFrame\n", "df.head()" @@ -524,10 +522,10 @@ }, "outputs": [ { - "name": "stderr", "output_type": "stream", + "name": "stdout", "text": [ - "sending upsert requests: 100%|██████████| 100/100 [00:00<00:00, 200.29it/s]\n" + "sending upsert requests: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 100/100 [00:00<00:00, 200.29it/s]\n" ] }, { @@ -536,9 +534,9 @@ "{'upserted_count': 100}" ] }, - "execution_count": 14, + "output_type": "execute_result", "metadata": {}, - "output_type": "execute_result" + "execution_count": 14 } ], "source": [ @@ -558,8 +556,8 @@ }, "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ "Vector count: 0\n", "Vector count: 0\n", @@ -584,20 +582,19 @@ " 'vector_type': 'dense'}" ] }, - "execution_count": 15, + "output_type": "execute_result", "metadata": {}, - "output_type": "execute_result" + "execution_count": 15 } ], "source": [ - "import time\n", - "\n", "def is_fresh(index):\n", " stats = index.describe_index_stats()\n", " vector_count = stats.total_vector_count\n", - " print(f\"Vector count: \", vector_count)\n", + " print(\"Vector count: \", vector_count)\n", " return vector_count > 0\n", "\n", + "\n", "while not is_fresh(index):\n", " # It takes a few moments for vectors we just upserted\n", " # to become available for querying\n", @@ -627,11 +624,13 @@ "outputs": [], "source": [ "def get_embedding(input_question):\n", - " model_name = 'sentence-transformers/all-MiniLM-L6-v2' # HuggingFace Model\n", + " model_name = \"sentence-transformers/all-MiniLM-L6-v2\" # HuggingFace Model\n", " tokenizer = AutoTokenizer.from_pretrained(model_name)\n", " model = AutoModel.from_pretrained(model_name)\n", "\n", - " encoded_input = tokenizer(input_question, padding=True, truncation=True, return_tensors='pt')\n", + " encoded_input = tokenizer(\n", + " input_question, padding=True, truncation=True, return_tensors=\"pt\"\n", + " )\n", "\n", " with torch.no_grad():\n", " model_output = model(**encoded_input)\n", @@ -659,8 +658,8 @@ }, "outputs": [ { - "name": "stderr", "output_type": "stream", + "name": "stdout", "text": [ "/pytorch/third_party/ideep/mkl-dnn/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h, 451: Can't read MIDR_EL1 sysfs entry\n" ] @@ -675,7 +674,7 @@ "results = index.query(vector=[query], top_k=10, include_metadata=True)\n", "\n", "# Sort results by score in descending order\n", - "sorted_matches = sorted(results['matches'], key=lambda x: x['score'], reverse=True)" + "sorted_matches = sorted(results[\"matches\"], key=lambda x: x[\"score\"], reverse=True)" ] }, { @@ -699,13 +698,13 @@ "source": [ "def show_results(question, matches):\n", " \"\"\"A utility function to print our results\"\"\"\n", - " print(f'Question: \\'{question}\\'')\n", - " print('\\nResults:')\n", + " print(f\"Question: '{question}'\")\n", + " print(\"\\nResults:\")\n", " for i, match in enumerate(matches):\n", - " print(f'{str(i+1).rjust(4)}. ID: {match[\"id\"]}')\n", - " print(f' Score: {match[\"score\"]}')\n", - " print(f' Metadata: {match[\"metadata\"]}')\n", - " print('')" + " print(f\"{str(i + 1).rjust(4)}. ID: {match['id']}\")\n", + " print(f\" Score: {match['score']}\")\n", + " print(f\" Metadata: {match['metadata']}\")\n", + " print(\"\")" ] }, { @@ -715,8 +714,8 @@ "metadata": {}, "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ "Question: 'what if my patient has leg pain'\n", "\n", @@ -790,10 +789,12 @@ "# Create documents with concatenated metadata field as \"reranking_field\" field\n", "transformed_documents = [\n", " {\n", - " 'id': match['id'],\n", - " 'reranking_field': '; '.join([f\"{key}: {value}\" for key, value in match['metadata'].items()])\n", + " \"id\": match[\"id\"],\n", + " \"reranking_field\": \"; \".join(\n", + " [f\"{key}: {value}\" for key, value in match[\"metadata\"].items()]\n", + " ),\n", " }\n", - " for match in results['matches']\n", + " for match in results[\"matches\"]\n", "]" ] }, @@ -829,8 +830,8 @@ }, "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ "Question: 'what if my patient had knee surgery'\n", "\n", @@ -849,13 +850,14 @@ "source": [ "def show_reranked_results(question, matches):\n", " \"\"\"A utility function to print our reranked results\"\"\"\n", - " print(f'Question: \\'{question}\\'')\n", - " print('\\nReranked Results:')\n", + " print(f\"Question: '{question}'\")\n", + " print(\"\\nReranked Results:\")\n", " for i, match in enumerate(matches):\n", - " print(f'{str(i+1).rjust(4)}. ID: {match.document.id}')\n", - " print(f' Score: {match.score}')\n", - " print(f' Reranking Field: {match.document.reranking_field}')\n", - " print('')\n", + " print(f\"{str(i + 1).rjust(4)}. ID: {match.document.id}\")\n", + " print(f\" Score: {match.score}\")\n", + " print(f\" Reranking Field: {match.document.reranking_field}\")\n", + " print(\"\")\n", + "\n", "\n", "show_reranked_results(query, reranked_results_field.data)" ] @@ -907,4 +909,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file