From bd15017b452a8298e8a7ec954837c5992786ed8a Mon Sep 17 00:00:00 2001 From: chinmay-bhat <12948588+chinmay-bhat@users.noreply.github.com> Date: Mon, 27 May 2024 22:55:24 +0530 Subject: [PATCH 1/9] cache manifests --- pyiceberg/catalog/__init__.py | 2 +- pyiceberg/cli/output.py | 2 +- pyiceberg/table/__init__.py | 12 ++++++------ pyiceberg/table/snapshots.py | 10 +++++++--- tests/integration/test_partitioning_key.py | 8 ++++++-- tests/integration/test_rest_manifest.py | 2 +- tests/utils/test_manifest.py | 8 ++++---- 7 files changed, 26 insertions(+), 18 deletions(-) diff --git a/pyiceberg/catalog/__init__.py b/pyiceberg/catalog/__init__.py index 9a951b5c8e..f6c15698de 100644 --- a/pyiceberg/catalog/__init__.py +++ b/pyiceberg/catalog/__init__.py @@ -717,7 +717,7 @@ def purge_table(self, identifier: Union[str, Identifier]) -> None: manifest_lists_to_delete = set() manifests_to_delete: List[ManifestFile] = [] for snapshot in metadata.snapshots: - manifests_to_delete += snapshot.manifests(io) + manifests_to_delete += snapshot.manifests(io, snapshot.manifest_list) if snapshot.manifest_list is not None: manifest_lists_to_delete.add(snapshot.manifest_list) diff --git a/pyiceberg/cli/output.py b/pyiceberg/cli/output.py index 56b544c99f..e9ba377453 100644 --- a/pyiceberg/cli/output.py +++ b/pyiceberg/cli/output.py @@ -144,7 +144,7 @@ def files(self, table: Table, history: bool) -> None: manifest_list_str = f": {snapshot.manifest_list}" if snapshot.manifest_list else "" list_tree = snapshot_tree.add(f"Snapshot {snapshot.snapshot_id}, schema {snapshot.schema_id}{manifest_list_str}") - manifest_list = snapshot.manifests(io) + manifest_list = snapshot.manifests(io, manifest_list_str) for manifest in manifest_list: manifest_tree = list_tree.add(f"Manifest: {manifest.manifest_path}") for manifest_entry in manifest.fetch_manifest_entry(io, discard_deleted=False): diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 2d4b342461..7e4307f499 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -1708,7 +1708,7 @@ def plan_files(self) -> Iterable[FileScanTask]: manifests = [ manifest_file - for manifest_file in snapshot.manifests(self.io) + for manifest_file in snapshot.manifests(self.io, snapshot.manifest_list) if manifest_evaluators[manifest_file.partition_spec_id](manifest_file) ] @@ -2941,7 +2941,7 @@ def _existing_manifests(self) -> List[ManifestFile]: if previous_snapshot is None: raise ValueError(f"Snapshot could not be found: {self._parent_snapshot_id}") - for manifest in previous_snapshot.manifests(io=self._io): + for manifest in previous_snapshot.manifests(io=self._io, manifest_list=previous_snapshot.manifest_list): if manifest.has_added_files() or manifest.has_existing_files() or manifest.added_snapshot_id == self._snapshot_id: existing_manifests.append(manifest) @@ -2992,7 +2992,7 @@ def _get_entries(manifest: ManifestFile) -> List[ManifestEntry]: if entry.data_file.content == DataFileContent.DATA ] - list_of_entries = executor.map(_get_entries, previous_snapshot.manifests(self._io)) + list_of_entries = executor.map(_get_entries, previous_snapshot.manifests(self._io, previous_snapshot.manifest_list)) return list(chain(*list_of_entries)) else: return [] @@ -3384,7 +3384,7 @@ def _readable_metrics_struct(bound_type: PrimitiveType) -> pa.StructType: entries = [] snapshot = self._get_snapshot(snapshot_id) - for manifest in snapshot.manifests(self.tbl.io): + for manifest in snapshot.manifests(self.tbl.io, snapshot.manifest_list): for entry in manifest.fetch_manifest_entry(io=self.tbl.io): column_sizes = entry.data_file.column_sizes or {} value_counts = entry.data_file.value_counts or {} @@ -3546,7 +3546,7 @@ def update_partitions_map( partitions_map: Dict[Tuple[str, Any], Any] = {} snapshot = self._get_snapshot(snapshot_id) - for manifest in snapshot.manifests(self.tbl.io): + for manifest in snapshot.manifests(self.tbl.io, snapshot.manifest_list): for entry in manifest.fetch_manifest_entry(io=self.tbl.io): partition = entry.data_file.partition partition_record_dict = { @@ -3624,7 +3624,7 @@ def _partition_summaries_to_rows( specs = self.tbl.metadata.specs() manifests = [] if snapshot := self.tbl.metadata.current_snapshot(): - for manifest in snapshot.manifests(self.tbl.io): + for manifest in snapshot.manifests(self.tbl.io, snapshot.manifest_list): is_data_file = manifest.content == ManifestContent.DATA is_delete_file = manifest.content == ManifestContent.DELETES manifests.append({ diff --git a/pyiceberg/table/snapshots.py b/pyiceberg/table/snapshots.py index b21a0f5613..6e8096f95c 100644 --- a/pyiceberg/table/snapshots.py +++ b/pyiceberg/table/snapshots.py @@ -19,6 +19,7 @@ import time from collections import defaultdict from enum import Enum +from functools import lru_cache from typing import TYPE_CHECKING, Any, DefaultDict, Dict, Iterable, List, Mapping, Optional from pydantic import Field, PrivateAttr, model_serializer @@ -247,9 +248,12 @@ def __str__(self) -> str: result_str = f"{operation}id={self.snapshot_id}{parent_id}{schema_id}" return result_str - def manifests(self, io: FileIO) -> List[ManifestFile]: - if self.manifest_list is not None: - file = io.new_input(self.manifest_list) + @staticmethod + @lru_cache + def manifests(io: FileIO, manifest_list: str) -> List[ManifestFile]: + """Return the manifests for the given snapshot.""" + if manifest_list not in (None, ""): + file = io.new_input(manifest_list) return list(read_manifest_list(file)) return [] diff --git a/tests/integration/test_partitioning_key.py b/tests/integration/test_partitioning_key.py index 29f664909c..f45223a224 100644 --- a/tests/integration/test_partitioning_key.py +++ b/tests/integration/test_partitioning_key.py @@ -763,10 +763,14 @@ def test_partition_key( snapshot = iceberg_table.current_snapshot() assert snapshot spark_partition_for_justification = ( - snapshot.manifests(iceberg_table.io)[0].fetch_manifest_entry(iceberg_table.io)[0].data_file.partition + snapshot.manifests(iceberg_table.io, snapshot.manifest_list)[0] + .fetch_manifest_entry(iceberg_table.io)[0] + .data_file.partition ) spark_path_for_justification = ( - snapshot.manifests(iceberg_table.io)[0].fetch_manifest_entry(iceberg_table.io)[0].data_file.file_path + snapshot.manifests(iceberg_table.io, snapshot.manifest_list)[0] + .fetch_manifest_entry(iceberg_table.io)[0] + .data_file.file_path ) assert spark_partition_for_justification == expected_partition_record assert expected_hive_partition_path_slice in spark_path_for_justification diff --git a/tests/integration/test_rest_manifest.py b/tests/integration/test_rest_manifest.py index 82c41cfd93..8c4b2aaf64 100644 --- a/tests/integration/test_rest_manifest.py +++ b/tests/integration/test_rest_manifest.py @@ -75,7 +75,7 @@ def test_write_sample_manifest(table_test_all_types: Table) -> None: if test_snapshot is None: raise ValueError("Table has no current snapshot, check the docker environment") io = table_test_all_types.io - test_manifest_file = test_snapshot.manifests(io)[0] + test_manifest_file = test_snapshot.manifests(io, test_snapshot.manifest_list)[0] test_manifest_entries = test_manifest_file.fetch_manifest_entry(io) entry = test_manifest_entries[0] test_schema = table_test_all_types.schema() diff --git a/tests/utils/test_manifest.py b/tests/utils/test_manifest.py index 8bb03cd80e..df373bef13 100644 --- a/tests/utils/test_manifest.py +++ b/tests/utils/test_manifest.py @@ -217,7 +217,7 @@ def test_read_manifest_v1(generated_manifest_file_file_v1: str) -> None: summary=Summary(Operation.APPEND), schema_id=3, ) - manifest_list = snapshot.manifests(io)[0] + manifest_list = snapshot.manifests(io, snapshot.manifest_list)[0] assert manifest_list.manifest_length == 7989 assert manifest_list.partition_spec_id == 0 @@ -267,7 +267,7 @@ def test_read_manifest_v2(generated_manifest_file_file_v2: str) -> None: summary=Summary(Operation.APPEND), schema_id=3, ) - manifest_list = snapshot.manifests(io)[0] + manifest_list = snapshot.manifests(io, manifest_list=snapshot.manifest_list)[0] assert manifest_list.manifest_length == 7989 assert manifest_list.partition_spec_id == 0 @@ -319,7 +319,7 @@ def test_write_manifest( summary=Summary(Operation.APPEND), schema_id=3, ) - demo_manifest_file = snapshot.manifests(io)[0] + demo_manifest_file = snapshot.manifests(io, snapshot.manifest_list)[0] manifest_entries = demo_manifest_file.fetch_manifest_entry(io) test_schema = Schema( NestedField(1, "VendorID", IntegerType(), False), NestedField(2, "tpep_pickup_datetime", IntegerType(), False) @@ -491,7 +491,7 @@ def test_write_manifest_list( schema_id=3, ) - demo_manifest_list = snapshot.manifests(io) + demo_manifest_list = snapshot.manifests(io, snapshot.manifest_list) with TemporaryDirectory() as tmp_dir: path = tmp_dir + "/manifest-list.avro" output = io.new_output(path) From e1b1d7c38623d4c2bef03f1aab4dab3986446ef6 Mon Sep 17 00:00:00 2001 From: chinmay-bhat <12948588+chinmay-bhat@users.noreply.github.com> Date: Tue, 4 Jun 2024 12:59:11 +0530 Subject: [PATCH 2/9] update API --- pyiceberg/catalog/__init__.py | 2 +- pyiceberg/cli/output.py | 2 +- pyiceberg/table/__init__.py | 12 ++++++------ pyiceberg/table/snapshots.py | 6 +++++- tests/integration/test_partitioning_key.py | 8 ++------ tests/integration/test_rest_manifest.py | 2 +- tests/utils/test_manifest.py | 8 ++++---- 7 files changed, 20 insertions(+), 20 deletions(-) diff --git a/pyiceberg/catalog/__init__.py b/pyiceberg/catalog/__init__.py index f6c15698de..9a951b5c8e 100644 --- a/pyiceberg/catalog/__init__.py +++ b/pyiceberg/catalog/__init__.py @@ -717,7 +717,7 @@ def purge_table(self, identifier: Union[str, Identifier]) -> None: manifest_lists_to_delete = set() manifests_to_delete: List[ManifestFile] = [] for snapshot in metadata.snapshots: - manifests_to_delete += snapshot.manifests(io, snapshot.manifest_list) + manifests_to_delete += snapshot.manifests(io) if snapshot.manifest_list is not None: manifest_lists_to_delete.add(snapshot.manifest_list) diff --git a/pyiceberg/cli/output.py b/pyiceberg/cli/output.py index e9ba377453..56b544c99f 100644 --- a/pyiceberg/cli/output.py +++ b/pyiceberg/cli/output.py @@ -144,7 +144,7 @@ def files(self, table: Table, history: bool) -> None: manifest_list_str = f": {snapshot.manifest_list}" if snapshot.manifest_list else "" list_tree = snapshot_tree.add(f"Snapshot {snapshot.snapshot_id}, schema {snapshot.schema_id}{manifest_list_str}") - manifest_list = snapshot.manifests(io, manifest_list_str) + manifest_list = snapshot.manifests(io) for manifest in manifest_list: manifest_tree = list_tree.add(f"Manifest: {manifest.manifest_path}") for manifest_entry in manifest.fetch_manifest_entry(io, discard_deleted=False): diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 7e4307f499..2d4b342461 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -1708,7 +1708,7 @@ def plan_files(self) -> Iterable[FileScanTask]: manifests = [ manifest_file - for manifest_file in snapshot.manifests(self.io, snapshot.manifest_list) + for manifest_file in snapshot.manifests(self.io) if manifest_evaluators[manifest_file.partition_spec_id](manifest_file) ] @@ -2941,7 +2941,7 @@ def _existing_manifests(self) -> List[ManifestFile]: if previous_snapshot is None: raise ValueError(f"Snapshot could not be found: {self._parent_snapshot_id}") - for manifest in previous_snapshot.manifests(io=self._io, manifest_list=previous_snapshot.manifest_list): + for manifest in previous_snapshot.manifests(io=self._io): if manifest.has_added_files() or manifest.has_existing_files() or manifest.added_snapshot_id == self._snapshot_id: existing_manifests.append(manifest) @@ -2992,7 +2992,7 @@ def _get_entries(manifest: ManifestFile) -> List[ManifestEntry]: if entry.data_file.content == DataFileContent.DATA ] - list_of_entries = executor.map(_get_entries, previous_snapshot.manifests(self._io, previous_snapshot.manifest_list)) + list_of_entries = executor.map(_get_entries, previous_snapshot.manifests(self._io)) return list(chain(*list_of_entries)) else: return [] @@ -3384,7 +3384,7 @@ def _readable_metrics_struct(bound_type: PrimitiveType) -> pa.StructType: entries = [] snapshot = self._get_snapshot(snapshot_id) - for manifest in snapshot.manifests(self.tbl.io, snapshot.manifest_list): + for manifest in snapshot.manifests(self.tbl.io): for entry in manifest.fetch_manifest_entry(io=self.tbl.io): column_sizes = entry.data_file.column_sizes or {} value_counts = entry.data_file.value_counts or {} @@ -3546,7 +3546,7 @@ def update_partitions_map( partitions_map: Dict[Tuple[str, Any], Any] = {} snapshot = self._get_snapshot(snapshot_id) - for manifest in snapshot.manifests(self.tbl.io, snapshot.manifest_list): + for manifest in snapshot.manifests(self.tbl.io): for entry in manifest.fetch_manifest_entry(io=self.tbl.io): partition = entry.data_file.partition partition_record_dict = { @@ -3624,7 +3624,7 @@ def _partition_summaries_to_rows( specs = self.tbl.metadata.specs() manifests = [] if snapshot := self.tbl.metadata.current_snapshot(): - for manifest in snapshot.manifests(self.tbl.io, snapshot.manifest_list): + for manifest in snapshot.manifests(self.tbl.io): is_data_file = manifest.content == ManifestContent.DATA is_delete_file = manifest.content == ManifestContent.DELETES manifests.append({ diff --git a/pyiceberg/table/snapshots.py b/pyiceberg/table/snapshots.py index 6e8096f95c..d5721f9618 100644 --- a/pyiceberg/table/snapshots.py +++ b/pyiceberg/table/snapshots.py @@ -250,13 +250,17 @@ def __str__(self) -> str: @staticmethod @lru_cache - def manifests(io: FileIO, manifest_list: str) -> List[ManifestFile]: + def _manifests(io: FileIO, manifest_list: str) -> List[ManifestFile]: """Return the manifests for the given snapshot.""" if manifest_list not in (None, ""): file = io.new_input(manifest_list) return list(read_manifest_list(file)) return [] + def manifests(self, io: FileIO) -> List[ManifestFile]: + """Return the manifests for the given snapshot.""" + return Snapshot._manifests(io, self.manifest_list) + class MetadataLogEntry(IcebergBaseModel): metadata_file: str = Field(alias="metadata-file") diff --git a/tests/integration/test_partitioning_key.py b/tests/integration/test_partitioning_key.py index f45223a224..29f664909c 100644 --- a/tests/integration/test_partitioning_key.py +++ b/tests/integration/test_partitioning_key.py @@ -763,14 +763,10 @@ def test_partition_key( snapshot = iceberg_table.current_snapshot() assert snapshot spark_partition_for_justification = ( - snapshot.manifests(iceberg_table.io, snapshot.manifest_list)[0] - .fetch_manifest_entry(iceberg_table.io)[0] - .data_file.partition + snapshot.manifests(iceberg_table.io)[0].fetch_manifest_entry(iceberg_table.io)[0].data_file.partition ) spark_path_for_justification = ( - snapshot.manifests(iceberg_table.io, snapshot.manifest_list)[0] - .fetch_manifest_entry(iceberg_table.io)[0] - .data_file.file_path + snapshot.manifests(iceberg_table.io)[0].fetch_manifest_entry(iceberg_table.io)[0].data_file.file_path ) assert spark_partition_for_justification == expected_partition_record assert expected_hive_partition_path_slice in spark_path_for_justification diff --git a/tests/integration/test_rest_manifest.py b/tests/integration/test_rest_manifest.py index 8c4b2aaf64..82c41cfd93 100644 --- a/tests/integration/test_rest_manifest.py +++ b/tests/integration/test_rest_manifest.py @@ -75,7 +75,7 @@ def test_write_sample_manifest(table_test_all_types: Table) -> None: if test_snapshot is None: raise ValueError("Table has no current snapshot, check the docker environment") io = table_test_all_types.io - test_manifest_file = test_snapshot.manifests(io, test_snapshot.manifest_list)[0] + test_manifest_file = test_snapshot.manifests(io)[0] test_manifest_entries = test_manifest_file.fetch_manifest_entry(io) entry = test_manifest_entries[0] test_schema = table_test_all_types.schema() diff --git a/tests/utils/test_manifest.py b/tests/utils/test_manifest.py index df373bef13..8bb03cd80e 100644 --- a/tests/utils/test_manifest.py +++ b/tests/utils/test_manifest.py @@ -217,7 +217,7 @@ def test_read_manifest_v1(generated_manifest_file_file_v1: str) -> None: summary=Summary(Operation.APPEND), schema_id=3, ) - manifest_list = snapshot.manifests(io, snapshot.manifest_list)[0] + manifest_list = snapshot.manifests(io)[0] assert manifest_list.manifest_length == 7989 assert manifest_list.partition_spec_id == 0 @@ -267,7 +267,7 @@ def test_read_manifest_v2(generated_manifest_file_file_v2: str) -> None: summary=Summary(Operation.APPEND), schema_id=3, ) - manifest_list = snapshot.manifests(io, manifest_list=snapshot.manifest_list)[0] + manifest_list = snapshot.manifests(io)[0] assert manifest_list.manifest_length == 7989 assert manifest_list.partition_spec_id == 0 @@ -319,7 +319,7 @@ def test_write_manifest( summary=Summary(Operation.APPEND), schema_id=3, ) - demo_manifest_file = snapshot.manifests(io, snapshot.manifest_list)[0] + demo_manifest_file = snapshot.manifests(io)[0] manifest_entries = demo_manifest_file.fetch_manifest_entry(io) test_schema = Schema( NestedField(1, "VendorID", IntegerType(), False), NestedField(2, "tpep_pickup_datetime", IntegerType(), False) @@ -491,7 +491,7 @@ def test_write_manifest_list( schema_id=3, ) - demo_manifest_list = snapshot.manifests(io, snapshot.manifest_list) + demo_manifest_list = snapshot.manifests(io) with TemporaryDirectory() as tmp_dir: path = tmp_dir + "/manifest-list.avro" output = io.new_output(path) From afbf0313bac22ee839866139897a161b5d2fa898 Mon Sep 17 00:00:00 2001 From: chinmay-bhat <12948588+chinmay-bhat@users.noreply.github.com> Date: Tue, 4 Jun 2024 13:30:59 +0530 Subject: [PATCH 3/9] small fix --- pyiceberg/table/snapshots.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyiceberg/table/snapshots.py b/pyiceberg/table/snapshots.py index d5721f9618..a810196cc4 100644 --- a/pyiceberg/table/snapshots.py +++ b/pyiceberg/table/snapshots.py @@ -250,10 +250,10 @@ def __str__(self) -> str: @staticmethod @lru_cache - def _manifests(io: FileIO, manifest_list: str) -> List[ManifestFile]: + def _manifests(io: FileIO, manifest_list: Optional[str]) -> List[ManifestFile]: """Return the manifests for the given snapshot.""" if manifest_list not in (None, ""): - file = io.new_input(manifest_list) + file = io.new_input(manifest_list) # type: ignore return list(read_manifest_list(file)) return [] From af6924d643fafba3245a866f914f2073c5072661 Mon Sep 17 00:00:00 2001 From: chinmay-bhat <12948588+chinmay-bhat@users.noreply.github.com> Date: Mon, 10 Jun 2024 10:31:44 +0530 Subject: [PATCH 4/9] move cache to module level --- pyiceberg/table/snapshots.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pyiceberg/table/snapshots.py b/pyiceberg/table/snapshots.py index a810196cc4..07ca23a3b7 100644 --- a/pyiceberg/table/snapshots.py +++ b/pyiceberg/table/snapshots.py @@ -229,6 +229,15 @@ def __eq__(self, other: Any) -> bool: ) +@lru_cache +def _manifests(io: FileIO, manifest_list: Optional[str]) -> List[ManifestFile]: + """Return the manifests from the manifest list.""" + if manifest_list not in (None, ""): + file = io.new_input(manifest_list) # type: ignore + return list(read_manifest_list(file)) + return [] + + class Snapshot(IcebergBaseModel): snapshot_id: int = Field(alias="snapshot-id") parent_snapshot_id: Optional[int] = Field(alias="parent-snapshot-id", default=None) @@ -248,18 +257,9 @@ def __str__(self) -> str: result_str = f"{operation}id={self.snapshot_id}{parent_id}{schema_id}" return result_str - @staticmethod - @lru_cache - def _manifests(io: FileIO, manifest_list: Optional[str]) -> List[ManifestFile]: - """Return the manifests for the given snapshot.""" - if manifest_list not in (None, ""): - file = io.new_input(manifest_list) # type: ignore - return list(read_manifest_list(file)) - return [] - def manifests(self, io: FileIO) -> List[ManifestFile]: """Return the manifests for the given snapshot.""" - return Snapshot._manifests(io, self.manifest_list) + return _manifests(io, self.manifest_list) class MetadataLogEntry(IcebergBaseModel): From 8c2e79a9c62f98c51eb56ae65369a7bf3f6d49f4 Mon Sep 17 00:00:00 2001 From: chinmay-bhat <12948588+chinmay-bhat@users.noreply.github.com> Date: Mon, 10 Jun 2024 10:58:02 +0530 Subject: [PATCH 5/9] update signature and check --- pyiceberg/table/snapshots.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pyiceberg/table/snapshots.py b/pyiceberg/table/snapshots.py index 07ca23a3b7..94df47f53c 100644 --- a/pyiceberg/table/snapshots.py +++ b/pyiceberg/table/snapshots.py @@ -230,12 +230,10 @@ def __eq__(self, other: Any) -> bool: @lru_cache -def _manifests(io: FileIO, manifest_list: Optional[str]) -> List[ManifestFile]: +def _manifests(io: FileIO, manifest_list: str) -> List[ManifestFile]: """Return the manifests from the manifest list.""" - if manifest_list not in (None, ""): - file = io.new_input(manifest_list) # type: ignore - return list(read_manifest_list(file)) - return [] + file = io.new_input(manifest_list) + return list(read_manifest_list(file)) class Snapshot(IcebergBaseModel): @@ -259,7 +257,9 @@ def __str__(self) -> str: def manifests(self, io: FileIO) -> List[ManifestFile]: """Return the manifests for the given snapshot.""" - return _manifests(io, self.manifest_list) + if self.manifest_list: + return _manifests(io, self.manifest_list) + return [] class MetadataLogEntry(IcebergBaseModel): From 22c9704057450562929b9e726e0d07c5edb5d973 Mon Sep 17 00:00:00 2001 From: chinmay-bhat <12948588+chinmay-bhat@users.noreply.github.com> Date: Wed, 12 Jun 2024 21:28:17 +0530 Subject: [PATCH 6/9] test: remove timezone from test --- tests/integration/test_writes/test_writes.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/tests/integration/test_writes/test_writes.py b/tests/integration/test_writes/test_writes.py index e329adcd5c..0e82162e79 100644 --- a/tests/integration/test_writes/test_writes.py +++ b/tests/integration/test_writes/test_writes.py @@ -16,8 +16,6 @@ # under the License. # pylint:disable=redefined-outer-name import math -import os -import time from datetime import date, datetime from pathlib import Path from typing import Any, Dict @@ -26,7 +24,6 @@ import pyarrow as pa import pyarrow.parquet as pq import pytest -import pytz from pyarrow.fs import S3FileSystem from pydantic_core import ValidationError from pyspark.sql import SparkSession @@ -542,9 +539,9 @@ def test_summaries_with_only_nulls( @pytest.mark.integration def test_duckdb_url_import(warehouse: Path, arrow_table_with_null: pa.Table) -> None: - os.environ["TZ"] = "Etc/UTC" - time.tzset() - tz = pytz.timezone(os.environ["TZ"]) + # os.environ["TZ"] = "Etc/UTC" + # time.tzset() + # tz = pytz.timezone(os.environ["TZ"]) catalog = SqlCatalog("test_sql_catalog", uri="sqlite:///:memory:", warehouse=f"/{warehouse}") catalog.create_namespace("default") @@ -573,7 +570,7 @@ def test_duckdb_url_import(warehouse: Path, arrow_table_with_null: pa.Table) -> 0.0, 0.0, datetime(2023, 1, 1, 19, 25), - datetime(2023, 1, 1, 19, 25, tzinfo=tz), + datetime(2023, 1, 1, 19, 25), date(2023, 1, 1), b"\x01", b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", @@ -588,7 +585,7 @@ def test_duckdb_url_import(warehouse: Path, arrow_table_with_null: pa.Table) -> 0.8999999761581421, 0.9, datetime(2023, 3, 1, 19, 25), - datetime(2023, 3, 1, 19, 25, tzinfo=tz), + datetime(2023, 3, 1, 19, 25), date(2023, 3, 1), b"\x12", b"\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11", From c794d805e7dd4593d0dea169a2753d70d6383096 Mon Sep 17 00:00:00 2001 From: chinmay-bhat <12948588+chinmay-bhat@users.noreply.github.com> Date: Wed, 12 Jun 2024 21:45:18 +0530 Subject: [PATCH 7/9] test: remove setting local timezone --- tests/integration/test_writes/test_writes.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/integration/test_writes/test_writes.py b/tests/integration/test_writes/test_writes.py index 0e82162e79..15f84506a5 100644 --- a/tests/integration/test_writes/test_writes.py +++ b/tests/integration/test_writes/test_writes.py @@ -24,6 +24,7 @@ import pyarrow as pa import pyarrow.parquet as pq import pytest +import pytz from pyarrow.fs import S3FileSystem from pydantic_core import ValidationError from pyspark.sql import SparkSession @@ -541,7 +542,7 @@ def test_summaries_with_only_nulls( def test_duckdb_url_import(warehouse: Path, arrow_table_with_null: pa.Table) -> None: # os.environ["TZ"] = "Etc/UTC" # time.tzset() - # tz = pytz.timezone(os.environ["TZ"]) + tz = pytz.timezone("Etc/UTC") catalog = SqlCatalog("test_sql_catalog", uri="sqlite:///:memory:", warehouse=f"/{warehouse}") catalog.create_namespace("default") @@ -570,7 +571,7 @@ def test_duckdb_url_import(warehouse: Path, arrow_table_with_null: pa.Table) -> 0.0, 0.0, datetime(2023, 1, 1, 19, 25), - datetime(2023, 1, 1, 19, 25), + datetime(2023, 1, 1, 19, 25, tzinfo=tz), date(2023, 1, 1), b"\x01", b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", @@ -585,7 +586,7 @@ def test_duckdb_url_import(warehouse: Path, arrow_table_with_null: pa.Table) -> 0.8999999761581421, 0.9, datetime(2023, 3, 1, 19, 25), - datetime(2023, 3, 1, 19, 25), + datetime(2023, 3, 1, 19, 25, tzinfo=tz), date(2023, 3, 1), b"\x12", b"\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11", From 2ba9508ce6a5fc39d7d7eb1fe8889746d92b4588 Mon Sep 17 00:00:00 2001 From: Chinmay Bhat <12948588+chinmay-bhat@users.noreply.github.com> Date: Wed, 12 Jun 2024 23:01:18 +0530 Subject: [PATCH 8/9] Update python-ci.yml Since integration-test seems to run and test-coverage didnt, there might be a discrepancy in environment setup. This commit updates the `install`workflow. --- .github/workflows/python-ci.yml | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml index 2983c890af..6a94e0fbfc 100644 --- a/.github/workflows/python-ci.yml +++ b/.github/workflows/python-ci.yml @@ -38,15 +38,8 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Install poetry - run: make install-poetry - - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python }} - cache: poetry - cache-dependency-path: ./poetry.lock - name: Install - run: make install-dependencies + run: make install - name: Linters run: make lint - name: Tests From b076b4e00179cb606eefb52435e36cfc5af64b3b Mon Sep 17 00:00:00 2001 From: chinmay-bhat <12948588+chinmay-bhat@users.noreply.github.com> Date: Wed, 12 Jun 2024 23:16:51 +0530 Subject: [PATCH 9/9] Revert changes to test_duck_db_url --- tests/integration/test_writes/test_writes.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/integration/test_writes/test_writes.py b/tests/integration/test_writes/test_writes.py index 15f84506a5..e329adcd5c 100644 --- a/tests/integration/test_writes/test_writes.py +++ b/tests/integration/test_writes/test_writes.py @@ -16,6 +16,8 @@ # under the License. # pylint:disable=redefined-outer-name import math +import os +import time from datetime import date, datetime from pathlib import Path from typing import Any, Dict @@ -540,9 +542,9 @@ def test_summaries_with_only_nulls( @pytest.mark.integration def test_duckdb_url_import(warehouse: Path, arrow_table_with_null: pa.Table) -> None: - # os.environ["TZ"] = "Etc/UTC" - # time.tzset() - tz = pytz.timezone("Etc/UTC") + os.environ["TZ"] = "Etc/UTC" + time.tzset() + tz = pytz.timezone(os.environ["TZ"]) catalog = SqlCatalog("test_sql_catalog", uri="sqlite:///:memory:", warehouse=f"/{warehouse}") catalog.create_namespace("default")