-
Notifications
You must be signed in to change notification settings - Fork 451
feat: Add table.maintenance.compact() for full-table data file compaction #3124
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
5c8dc67
2774bd3
6420027
dfbde71
9fd51a8
93df231
edf449e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,113 @@ | ||
| # Licensed to the Apache Software Foundation (ASF) under one | ||
| # or more contributor license agreements. See the NOTICE file | ||
| # distributed with this work for additional information | ||
| # regarding copyright ownership. The ASF licenses this file | ||
| # to you under the Apache License, Version 2.0 (the | ||
| # "License"); you may not use this file except in compliance | ||
| # with the License. You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, | ||
| # software distributed under the License is distributed on an | ||
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| # KIND, either express or implied. See the License for the | ||
| # specific language governing permissions and limitations | ||
| # under the License. | ||
| import random | ||
|
|
||
| import pyarrow as pa | ||
|
|
||
| from pyiceberg.catalog import Catalog | ||
| from pyiceberg.exceptions import NoSuchNamespaceError | ||
| from pyiceberg.partitioning import PartitionField, PartitionSpec | ||
| from pyiceberg.schema import Schema | ||
| from pyiceberg.transforms import IdentityTransform | ||
|
|
||
|
|
||
def test_maintenance_compact(catalog: Catalog) -> None:
    """Verify that ``table.maintenance.compact()`` merges small data files.

    Appends 12 small files spread across 3 identity partitions, runs
    compaction, and asserts that:

    * the row count and the data itself are unchanged,
    * the file count drops to one data file per partition,
    * the resulting snapshot summary is tagged as a compaction
      replace operation.

    Args:
        catalog: Catalog fixture used to create the namespace and table.
    """
    from pyiceberg.types import LongType, NestedField, StringType

    schema = Schema(
        NestedField(1, "id", LongType()),
        NestedField(2, "category", StringType()),
        NestedField(3, "value", LongType()),
    )
    spec = PartitionSpec(PartitionField(source_id=2, field_id=1000, transform=IdentityTransform(), name="category"))

    # Create the namespace and table. create_namespace raises
    # NamespaceAlreadyExistsError (not NoSuchNamespaceError) when the
    # namespace is already present, so that is the exception to swallow
    # for the guard to actually make reruns idempotent.
    try:
        catalog.create_namespace("default")
    except NamespaceAlreadyExistsError:
        pass
    table = catalog.create_table(
        "default.test_compaction",
        schema=schema,
        partition_spec=spec,
    )

    # Append many small data files: each append() produces one file,
    # 12 files total, 4 per partition, 10 rows each.
    categories = ["cat1", "cat2", "cat3"]
    for i in range(12):
        table.append(
            pa.table(
                {
                    "id": list(range(i * 10, (i + 1) * 10)),
                    "category": [categories[i % 3]] * 10,
                    "value": [random.randint(1, 100) for _ in range(10)],
                }
            )
        )

    # Verify state before compaction: 12 files, 120 rows.
    before_files = list(table.scan().plan_files())
    assert len(before_files) == 12
    arrow_table_before = table.scan().to_arrow()
    assert arrow_table_before.num_rows == 120

    # Execute compaction.
    table.maintenance.compact()

    # Verify state after compaction.
    table.refresh()
    after_files = list(table.scan().plan_files())
    assert len(after_files) == 3  # Should be 1 optimized data file per partition

    # Data must be byte-for-byte equivalent before/after; sort by the
    # unique "id" key since compaction may reorder rows across files.
    arrow_table_after = table.scan().to_arrow()
    assert arrow_table_after.num_rows == 120
    assert arrow_table_before.column_names == arrow_table_after.column_names
    assert sorted(arrow_table_before.to_pylist(), key=lambda x: x["id"]) == sorted(
        arrow_table_after.to_pylist(), key=lambda x: x["id"]
    )

    # Ensure snapshot properties specify the replace-operation.
    new_snapshot = table.current_snapshot()
    assert new_snapshot is not None
    assert new_snapshot.summary is not None
    assert new_snapshot.summary.get("snapshot-type") == "replace"
    assert new_snapshot.summary.get("replace-operation") == "compaction"
|
|
||
|
|
||
def test_maintenance_compact_empty_table(catalog: Catalog) -> None:
    """Verify that ``table.maintenance.compact()`` is a no-op on an empty table.

    Compacting a table with no data files must return safely without
    committing a new snapshot.

    Args:
        catalog: Catalog fixture used to create the namespace and table.
    """
    from pyiceberg.types import LongType, NestedField, StringType

    schema = Schema(
        NestedField(1, "id", LongType()),
        NestedField(2, "category", StringType()),
    )

    # create_namespace raises NamespaceAlreadyExistsError (not
    # NoSuchNamespaceError) when the namespace already exists; catch the
    # correct exception so repeated runs stay idempotent.
    try:
        catalog.create_namespace("default")
    except NamespaceAlreadyExistsError:
        pass

    table = catalog.create_table("default.test_compaction_empty", schema=schema)
    before_snapshots = len(table.history())

    # Should safely return doing nothing
    table.maintenance.compact()

    table.refresh()
    after_snapshots = len(table.history())
    assert before_snapshots == after_snapshots  # No new snapshot should be made
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let's add `replace` on its own since it's a pretty significant change, and follow up with table compaction. I think there are a few more things we need to add to the `replace` operation. It would be a good idea to look into the Java side. For example, how can we ensure that the table's data remains the same? REPLACE means no data change. If we cannot guarantee that the data remains the same, maybe we should not expose a `replace` function that takes a `df` as a parameter.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hi @kevinjqliu, I created an issue (#3130) and a corresponding PR (#3131) to address the need for a separate PR for `replace`. Once that is approved, we can use it to build on and complete this current PR for compaction. We can move this discussion there and come back when it is finished.