Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 9 additions & 6 deletions python/pyspark/sql/connect/client/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -1667,13 +1667,16 @@ def handle_response(
for batch in reader:
assert isinstance(batch, pa.RecordBatch)
num_records_in_batch += batch.num_rows
if num_records_in_batch != b.arrow_batch.row_count:
raise SparkConnectException(
f"Expected {b.arrow_batch.row_count} rows in arrow batch but "
+ f"got {num_records_in_batch}."
)
num_records += num_records_in_batch
num_records += batch.num_rows
yield batch
# An Arrow IPC stream ([Schema][RecordBatch]*[EOS]) may carry
# multiple RecordBatches, so validate row_count only once the
# reader is fully consumed.
if num_records_in_batch != b.arrow_batch.row_count:
raise SparkConnectException(
f"Expected {b.arrow_batch.row_count} rows in arrow batch but "
+ f"got {num_records_in_batch}."
)
if b.HasField("create_resource_profile_command_result"):
profile_id = b.create_resource_profile_command_result.profile_id
yield {"create_resource_profile_command_result": profile_id}
Expand Down
38 changes: 38 additions & 0 deletions python/pyspark/sql/tests/connect/client/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,44 @@ def test_user_agent_default(self):
mock.req.client_type, r"^_SPARK_CONNECT_PYTHON spark/[^ ]+ os/[^ ]+ python/[^ ]+$"
)

def test_multiple_record_batches_in_single_arrow_batch(self):
# An Arrow IPC stream may carry multiple RecordBatches; row_count is the total
# across them and must be validated only after the stream is fully consumed.
client = SparkConnectClient("sc://foo/", use_reattachable_execute=False)

class MultiBatchMockService:
def __init__(self, session_id: str):
self._session_id = session_id
self.req: Optional[proto.ExecutePlanRequest] = None

def ExecutePlan(self, req: proto.ExecutePlanRequest, metadata):
self.req = req
resp = proto.ExecutePlanResponse()
resp.session_id = self._session_id
resp.operation_id = req.operation_id

pdf = pd.DataFrame(data={"col1": [1, 2, 3, 4]})
schema = pa.Schema.from_pandas(pdf)
table = pa.Table.from_pandas(pdf)
sink = pa.BufferOutputStream()
writer = pa.ipc.new_stream(sink, schema=schema)
# Two RecordBatches in one IPC stream.
for batch in table.to_batches(max_chunksize=2):
writer.write_batch(batch)
writer.close()

resp.arrow_batch.data = sink.getvalue().to_pybytes()
# row_count is the total across all RecordBatches in the message.
resp.arrow_batch.row_count = 4
return [resp]

mock = MultiBatchMockService(client._session_id)
client._stub = mock

plan = proto.Plan()
table, _, _ = client.to_table(plan, {})
self.assertEqual(table.num_rows, 4)

def test_properties(self):
client = SparkConnectClient("sc://foo/;token=bar", use_reattachable_execute=False)
self.assertEqual(client.token, "bar")
Expand Down