|
| 1 | +# Licensed to the Apache Software Foundation (ASF) under one |
| 2 | +# or more contributor license agreements. See the NOTICE file |
| 3 | +# distributed with this work for additional information |
| 4 | +# regarding copyright ownership. The ASF licenses this file |
| 5 | +# to you under the Apache License, Version 2.0 (the |
| 6 | +# "License"); you may not use this file except in compliance |
| 7 | +# with the License. You may obtain a copy of the License at |
| 8 | +# |
| 9 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | +# |
| 11 | +# Unless required by applicable law or agreed to in writing, |
| 12 | +# software distributed under the License is distributed on an |
| 13 | +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 14 | +# KIND, either express or implied. See the License for the |
| 15 | +# specific language governing permissions and limitations |
| 16 | +# under the License. |
| 17 | + |
| 18 | +"""Hadoop Delegation Token Service (HDTS) file parser. |
| 19 | +
|
| 20 | +Reads delegation tokens from the binary token file pointed to by |
| 21 | +the ``$HADOOP_TOKEN_FILE_LOCATION`` environment variable. |
| 22 | +""" |
| 23 | + |
| 24 | +from __future__ import annotations |
| 25 | + |
| 26 | +import base64 |
| 27 | +import os |
| 28 | +from io import BytesIO |
| 29 | + |
| 30 | +from pyiceberg.exceptions import HiveAuthError |
| 31 | + |
# Name of the environment variable Hadoop uses to point at the token file.
HADOOP_TOKEN_FILE_LOCATION = "HADOOP_TOKEN_FILE_LOCATION"
# Token "kind" string that identifies a Hive delegation token entry.
HIVE_DELEGATION_TOKEN_KIND = "HIVE_DELEGATION_TOKEN"
# Magic bytes at the start of a Hadoop Credentials token file.
HDTS_MAGIC = b"HDTS"
# Only token-file format version 0 is supported by this parser.
HDTS_SUPPORTED_VERSION = 0
| 36 | + |
| 37 | + |
| 38 | +def _read_hadoop_vint(stream: BytesIO) -> int: |
| 39 | + """Decode a Hadoop WritableUtils VInt/VLong from a byte stream.""" |
| 40 | + first = stream.read(1) |
| 41 | + if not first: |
| 42 | + raise HiveAuthError("Unexpected end of token file while reading VInt") |
| 43 | + b = first[0] |
| 44 | + if b <= 0x7F: |
| 45 | + return b |
| 46 | + # Number of additional bytes is encoded in leading 1-bits |
| 47 | + num_extra = 0 |
| 48 | + mask = 0x80 |
| 49 | + while b & mask: |
| 50 | + num_extra += 1 |
| 51 | + mask >>= 1 |
| 52 | + # First byte contributes the remaining bits |
| 53 | + result = b & (mask - 1) |
| 54 | + extra = stream.read(num_extra) |
| 55 | + if len(extra) != num_extra: |
| 56 | + raise HiveAuthError("Unexpected end of token file while reading VInt") |
| 57 | + for byte in extra: |
| 58 | + result = (result << 8) | byte |
| 59 | + # Sign-extend if negative (high bit of decoded value is set) |
| 60 | + if result >= (1 << (8 * num_extra + (8 - num_extra - 1) - 1)): |
| 61 | + result -= 1 << (8 * num_extra + (8 - num_extra - 1)) |
| 62 | + return result |
| 63 | + |
| 64 | + |
def _read_hadoop_bytes(stream: BytesIO) -> bytes:
    """Read a VInt-prefixed byte array from a Hadoop token stream.

    Raises:
        HiveAuthError: If the length prefix is negative or the stream ends
            before the full payload has been read.
    """
    size = _read_hadoop_vint(stream)
    if size < 0:
        raise HiveAuthError(f"Invalid byte array length: {size}")
    payload = stream.read(size)
    if len(payload) < size:
        raise HiveAuthError("Unexpected end of token file while reading byte array")
    return payload
| 74 | + |
| 75 | + |
def _read_hadoop_text(stream: BytesIO) -> str:
    """Read a VInt-prefixed UTF-8 string from a Hadoop token stream."""
    raw = _read_hadoop_bytes(stream)
    return raw.decode("utf-8")
| 79 | + |
| 80 | + |
def read_hive_delegation_token() -> tuple[str, str]:
    """Read a Hive delegation token from ``$HADOOP_TOKEN_FILE_LOCATION``.

    Returns:
        A ``(identifier, password)`` tuple where both values are
        base64-encoded strings suitable for SASL DIGEST-MD5 auth.

    Raises:
        HiveAuthError: If the token file is missing, malformed, or
        does not contain a ``HIVE_DELEGATION_TOKEN``.
    """
    token_file = os.environ.get(HADOOP_TOKEN_FILE_LOCATION)
    if not token_file:
        raise HiveAuthError(
            f"${HADOOP_TOKEN_FILE_LOCATION} environment variable is not set. "
            "A Hadoop delegation token file is required for DIGEST-MD5 authentication."
        )

    try:
        with open(token_file, "rb") as handle:
            buf = BytesIO(handle.read())
    except FileNotFoundError:
        raise HiveAuthError(f"Hadoop token file not found: {token_file}")

    # File header: 4-byte magic followed by a single version byte.
    magic = buf.read(4)
    if magic != HDTS_MAGIC:
        raise HiveAuthError(f"Invalid Hadoop token file magic: expected {HDTS_MAGIC!r}, got {magic!r}")

    version_raw = buf.read(1)
    if not version_raw:
        raise HiveAuthError("Unexpected end of token file while reading version")
    version = version_raw[0]
    if version != HDTS_SUPPORTED_VERSION:
        raise HiveAuthError(f"Unsupported Hadoop token file version: {version}")

    num_tokens = _read_hadoop_vint(buf)

    for _ in range(num_tokens):
        # Each token entry: identifier bytes, password bytes, kind text, service text.
        identifier = _read_hadoop_bytes(buf)
        password = _read_hadoop_bytes(buf)
        kind = _read_hadoop_text(buf)
        _read_hadoop_text(buf)  # service — parsed to advance the stream, not needed here

        if kind != HIVE_DELEGATION_TOKEN_KIND:
            continue
        return (
            base64.b64encode(identifier).decode("ascii"),
            base64.b64encode(password).decode("ascii"),
        )

    raise HiveAuthError(
        f"No {HIVE_DELEGATION_TOKEN_KIND} found in token file: {token_file}. "
        f"File contains {num_tokens} token(s)."
    )
0 commit comments