diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index 0dfc5eb55a..2e811dbedd 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -203,6 +203,10 @@ MAP_VALUE_NAME = "value" DOC = "doc" UTC_ALIASES = {"UTC", "+00:00", "Etc/UTC", "Z"} +UUID_FILTER_NOT_SUPPORTED_ERROR_MESSAGE = ( + f"Filtering on UUID columns is not supported by the installed PyArrow version ({pa.__version__})" +) + T = TypeVar("T") @@ -1641,7 +1645,12 @@ def _task_to_record_batches( bound_row_filter, file_schema, case_sensitive=case_sensitive, projected_field_values=projected_missing_fields ) bound_file_filter = bind(file_schema, translated_row_filter, case_sensitive=case_sensitive) - pyarrow_filter = expression_to_pyarrow(bound_file_filter, file_schema) + try: + pyarrow_filter = expression_to_pyarrow(bound_file_filter, file_schema) + except pyarrow.lib.ArrowNotImplementedError as e: + if "arrow.uuid" in str(e): + raise NotImplementedError(UUID_FILTER_NOT_SUPPORTED_ERROR_MESSAGE) from e + raise file_project_schema = prune_columns(file_schema, projected_field_ids, select_full_types=False) diff --git a/tests/integration/test_reads.py b/tests/integration/test_reads.py index 6c8b4a20a7..6a60c0bfa2 100644 --- a/tests/integration/test_reads.py +++ b/tests/integration/test_reads.py @@ -820,17 +820,20 @@ def test_partitioned_tables(catalog: Catalog) -> None: @pytest.mark.parametrize("catalog", [lf("session_catalog_hive"), lf("session_catalog")]) def test_unpartitioned_uuid_table(catalog: Catalog) -> None: unpartitioned_uuid = catalog.load_table("default.test_uuid_and_fixed_unpartitioned") - arrow_table_eq = unpartitioned_uuid.scan(row_filter="uuid_col == '102cb62f-e6f8-4eb0-9973-d9b012ff0967'").to_arrow() - assert arrow_table_eq["uuid_col"].to_pylist() == [uuid.UUID("102cb62f-e6f8-4eb0-9973-d9b012ff0967")] - - arrow_table_neq = unpartitioned_uuid.scan( - row_filter="uuid_col != '102cb62f-e6f8-4eb0-9973-d9b012ff0967' and uuid_col != '639cccce-c9d2-494a-a78c-278ab234f024'" - ).to_arrow() - assert arrow_table_neq["uuid_col"].to_pylist() == [ - uuid.UUID("ec33e4b2-a834-4cc3-8c4a-a1d3bfc2f226"), - uuid.UUID("c1b0d8e0-0b0e-4b1e-9b0a-0e0b0d0c0a0b"), - uuid.UUID("923dae77-83d6-47cd-b4b0-d383e64ee57e"), - ] + try: + arrow_table_eq = unpartitioned_uuid.scan(row_filter="uuid_col == '102cb62f-e6f8-4eb0-9973-d9b012ff0967'").to_arrow() + assert arrow_table_eq["uuid_col"].to_pylist() == [uuid.UUID("102cb62f-e6f8-4eb0-9973-d9b012ff0967")] + + arrow_table_neq = unpartitioned_uuid.scan( + row_filter="uuid_col != '102cb62f-e6f8-4eb0-9973-d9b012ff0967' and uuid_col != '639cccce-c9d2-494a-a78c-278ab234f024'" + ).to_arrow() + assert arrow_table_neq["uuid_col"].to_pylist() == [ + uuid.UUID("ec33e4b2-a834-4cc3-8c4a-a1d3bfc2f226"), + uuid.UUID("c1b0d8e0-0b0e-4b1e-9b0a-0e0b0d0c0a0b"), + uuid.UUID("923dae77-83d6-47cd-b4b0-d383e64ee57e"), + ] + except NotImplementedError as e: + assert "Filtering on UUID columns is not supported" in str(e) @pytest.mark.integration @@ -840,14 +843,11 @@ def test_unpartitioned_fixed_table(catalog: Catalog) -> None: arrow_table_eq = fixed_table.scan(row_filter=EqualTo("fixed_col", b"1234567890123456789012345")).to_arrow() assert arrow_table_eq["fixed_col"].to_pylist() == [b"1234567890123456789012345"] - arrow_table_neq = fixed_table.scan( - row_filter=And( - NotEqualTo("fixed_col", b"1234567890123456789012345"), NotEqualTo("uuid_col", "c1b0d8e0-0b0e-4b1e-9b0a-0e0b0d0c0a0b") - ) - ).to_arrow() + arrow_table_neq = fixed_table.scan(row_filter=NotEqualTo("fixed_col", b"1234567890123456789012345")).to_arrow() assert arrow_table_neq["fixed_col"].to_pylist() == [ b"1231231231231231231231231", b"12345678901234567ass12345", + b"asdasasdads12312312312111", b"qweeqwwqq1231231231231111", ]