Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 75 additions & 0 deletions pyiceberg/table/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -614,6 +614,54 @@ def overwrite(
for data_file in data_files:
append_files.append_data_file(data_file)

def replace(
    self,
    df: pa.Table,
    files_to_delete: Iterable[DataFile],
    snapshot_properties: dict[str, str] = EMPTY_DICT,
    branch: str | None = MAIN_BRANCH,
) -> None:
    """Replace a set of existing data files with the rows of an Arrow table.

    Produces a REPLACE snapshot: every file in ``files_to_delete`` is
    dropped and the contents of ``df`` are written out as new data files.

    Args:
        df: The Arrow dataframe that will be used to generate the new data files
        files_to_delete: The files to delete
        snapshot_properties: Custom properties to be added to the snapshot summary
        branch: Branch Reference to run the replace operation

    Raises:
        ModuleNotFoundError: If PyArrow is not installed.
        ValueError: If ``df`` is not a PyArrow table.
    """
    # NOTE(review): REPLACE semantically implies no logical data change.
    # This method does not verify that df contains exactly the rows of the
    # files being replaced — callers are responsible for that invariant.
    # (See PR discussion; follow-up tracked in #3130 / #3131.)
    try:
        import pyarrow as pa
    except ModuleNotFoundError as e:
        raise ModuleNotFoundError("For writes PyArrow needs to be installed") from e

    from pyiceberg.io.pyarrow import _check_pyarrow_schema_compatible, _dataframe_to_data_files

    if not isinstance(df, pa.Table):
        raise ValueError(f"Expected PyArrow table, got: {df}")

    # Optionally downcast ns -> us timestamps on write, per config.
    downcast = Config().get_bool(DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE) or False
    _check_pyarrow_schema_compatible(
        self.table_metadata.schema(),
        provided_schema=df.schema,
        downcast_ns_timestamp_to_us=downcast,
        format_version=self.table_metadata.format_version,
    )

    update = self.update_snapshot(snapshot_properties=snapshot_properties, branch=branch)
    with update.replace() as snapshot_update:
        for stale_file in files_to_delete:
            snapshot_update.delete_data_file(stale_file)

        # Only write new files when there are rows to write; an empty df
        # simply removes the listed files.
        if df.num_rows:
            new_files = _dataframe_to_data_files(
                table_metadata=self.table_metadata,
                write_uuid=snapshot_update.commit_uuid,
                df=df,
                io=self._table.io,
            )
            for new_file in new_files:
                snapshot_update.append_data_file(new_file)

def delete(
self,
delete_filter: str | BooleanExpression,
Expand Down Expand Up @@ -1432,6 +1480,33 @@ def overwrite(
branch=branch,
)

def replace(
    self,
    df: pa.Table,
    files_to_delete: Iterable[DataFile],
    snapshot_properties: dict[str, str] = EMPTY_DICT,
    branch: str | None = MAIN_BRANCH,
) -> None:
    """Replace a set of existing data files with the rows of an Arrow table.

    Produces a REPLACE snapshot that drops ``files_to_delete`` and writes
    ``df`` out as new data files, committed as a single transaction.

    Args:
        df: The Arrow dataframe that will be used to generate the new data files
        files_to_delete: The files to delete
        snapshot_properties: Custom properties to be added to the snapshot summary
        branch: Branch Reference to run the replace operation
    """
    # Delegate to the transactional implementation so the delete + append
    # commit atomically.
    with self.transaction() as txn:
        txn.replace(
            df=df,
            files_to_delete=files_to_delete,
            snapshot_properties=snapshot_properties,
            branch=branch,
        )

def delete(
self,
delete_filter: BooleanExpression | str = ALWAYS_TRUE,
Expand Down
28 changes: 28 additions & 0 deletions pyiceberg/table/maintenance.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,31 @@ def expire_snapshots(self) -> ExpireSnapshots:
from pyiceberg.table.update.snapshot import ExpireSnapshots

return ExpireSnapshots(transaction=Transaction(self.tbl, autocommit=True))

def compact(self) -> None:
    """Compact the table's data files by rewriting the entire table.

    Reads all current rows into memory via Arrow and writes them back out
    using the table's target file size settings
    (``write.target-file-size-bytes``), atomically dropping the old data
    files and replacing them with fewer, larger files in one REPLACE
    snapshot.

    Note: This is a full-table compaction that leverages Arrow for
    binpacking. It currently reads the entire table into memory via
    ``.to_arrow()``.
    """
    # Plan the scan exactly once so the rows we read and the files we
    # delete come from the same planned snapshot. The previous version
    # called self.tbl.scan() twice (once for data, once for plan_files
    # inside the transaction); a concurrent refresh between the two calls
    # could delete files whose rows were never read.
    scan = self.tbl.scan()
    files_to_delete = [task.file for task in scan.plan_files()]
    arrow_table = scan.to_arrow()

    # Guard: if the table is completely empty, there's nothing to compact.
    # Doing a replace with an empty table would result in deleting everything.
    if arrow_table.num_rows == 0:
        logger.info("Table contains no rows, skipping compaction.")
        return

    # Replace existing files with the newly compacted files atomically.
    with self.tbl.transaction() as txn:
        txn.replace(
            df=arrow_table,
            files_to_delete=files_to_delete,
            snapshot_properties={"snapshot-type": "replace", "replace-operation": "compaction"},
        )
2 changes: 1 addition & 1 deletion pyiceberg/table/snapshots.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,7 +344,7 @@ def _partition_summary(self, update_metrics: UpdateMetrics) -> str:


def update_snapshot_summaries(summary: Summary, previous_summary: Mapping[str, str] | None = None) -> Summary:
if summary.operation not in {Operation.APPEND, Operation.OVERWRITE, Operation.DELETE}:
if summary.operation not in {Operation.APPEND, Operation.OVERWRITE, Operation.DELETE, Operation.REPLACE}:
raise ValueError(f"Operation not implemented: {summary.operation}")

if not previous_summary:
Expand Down
95 changes: 95 additions & 0 deletions pyiceberg/table/update/snapshot.py
Original file line number Diff line number Diff line change
Expand Up @@ -667,6 +667,91 @@ def _get_entries(manifest: ManifestFile) -> list[ManifestEntry]:
return []


class _RewriteFiles(_SnapshotProducer["_RewriteFiles"]):
    """Rewrites data files in the table, producing a REPLACE snapshot.

    Data files are logically rearranged, but no new logical records are
    added or removed (e.g. compaction).
    """

    def _existing_manifests(self) -> list[ManifestFile]:
        """Return the manifest files that carry over into the new snapshot.

        Manifests untouched by the deletion are kept as-is; manifests that
        reference deleted data files are rewritten without those entries.
        """
        existing_files = []

        # Lazily-built partition evaluators, one per partition spec id.
        manifest_evaluators: dict[int, Callable[[ManifestFile], bool]] = KeyDefaultDict(self._build_manifest_evaluator)
        if snapshot := self._transaction.table_metadata.snapshot_by_name(name=self._target_branch):
            for manifest_file in snapshot.manifests(io=self._io):
                # Manifest does not contain rows that match the files-to-delete
                # partitions: keep it unchanged.
                if not manifest_evaluators[manifest_file.partition_spec_id](manifest_file):
                    existing_files.append(manifest_file)
                    continue

                entries_to_write: set[ManifestEntry] = set()
                found_deleted_entries: set[ManifestEntry] = set()

                # Partition the manifest's live entries into deleted vs kept.
                for entry in manifest_file.fetch_manifest_entry(io=self._io, discard_deleted=True):
                    if entry.data_file in self._deleted_data_files:
                        found_deleted_entries.add(entry)
                    else:
                        entries_to_write.add(entry)

                # Is the intersection with the deleted files empty? Then the
                # manifest is unaffected and can be reused verbatim.
                if len(found_deleted_entries) == 0:
                    existing_files.append(manifest_file)
                    continue

                # Every file in this manifest was deleted: drop the manifest.
                if len(entries_to_write) == 0:
                    continue

                # We have to rewrite the manifest file without the deleted
                # data files; survivors are re-recorded with EXISTING status.
                with self.new_manifest_writer(self.spec(manifest_file.partition_spec_id)) as writer:
                    for entry in entries_to_write:
                        writer.add_entry(
                            ManifestEntry.from_args(
                                status=ManifestEntryStatus.EXISTING,
                                snapshot_id=entry.snapshot_id,
                                sequence_number=entry.sequence_number,
                                file_sequence_number=entry.file_sequence_number,
                                data_file=entry.data_file,
                            )
                        )
                existing_files.append(writer.to_manifest_file())

        return existing_files

    def _deleted_entries(self) -> list[ManifestEntry]:
        """Return DELETED manifest entries for the replaced data files.

        Scans the parent snapshot's manifests in parallel; returns an empty
        list when there is no parent snapshot (nothing to mark deleted).
        """
        if self._parent_snapshot_id is not None:
            previous_snapshot = self._transaction.table_metadata.snapshot_by_id(self._parent_snapshot_id)
            if previous_snapshot is None:
                raise ValueError(f"Could not find the previous snapshot: {self._parent_snapshot_id}")

            executor = ExecutorFactory.get_or_create()
            manifest_evaluators: dict[int, Callable[[ManifestFile], bool]] = KeyDefaultDict(self._build_manifest_evaluator)

            def _get_entries(manifest: ManifestFile) -> list[ManifestEntry]:
                # Skip manifests whose partition range cannot contain any
                # of the deleted files.
                if not manifest_evaluators[manifest.partition_spec_id](manifest):
                    return []

                # Only DATA-content files are marked deleted here; delete
                # files are not rewritten by this operation.
                return [
                    ManifestEntry.from_args(
                        status=ManifestEntryStatus.DELETED,
                        snapshot_id=entry.snapshot_id,
                        sequence_number=entry.sequence_number,
                        file_sequence_number=entry.file_sequence_number,
                        data_file=entry.data_file,
                    )
                    for entry in manifest.fetch_manifest_entry(self._io, discard_deleted=True)
                    if entry.data_file.content == DataFileContent.DATA and entry.data_file in self._deleted_data_files
                ]

            list_of_entries = executor.map(_get_entries, previous_snapshot.manifests(self._io))
            return list(itertools.chain(*list_of_entries))
        else:
            return []


class UpdateSnapshot:
_transaction: Transaction
_io: FileIO
Expand Down Expand Up @@ -715,6 +800,16 @@ def overwrite(self, commit_uuid: uuid.UUID | None = None) -> _OverwriteFiles:
snapshot_properties=self._snapshot_properties,
)

def replace(self, commit_uuid: uuid.UUID | None = None) -> _RewriteFiles:
    """Return a snapshot producer for a REPLACE operation.

    Used to atomically swap one set of data files for another (e.g.
    compaction) without a logical data change.
    """
    producer = _RewriteFiles(
        operation=Operation.REPLACE,
        commit_uuid=commit_uuid,
        transaction=self._transaction,
        io=self._io,
        branch=self._branch,
        snapshot_properties=self._snapshot_properties,
    )
    return producer

def delete(self) -> _DeleteFiles:
return _DeleteFiles(
operation=Operation.DELETE,
Expand Down
113 changes: 113 additions & 0 deletions tests/table/test_maintenance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import random

import pyarrow as pa

from pyiceberg.catalog import Catalog
from pyiceberg.exceptions import NamespaceAlreadyExistsError, NoSuchNamespaceError
from pyiceberg.partitioning import PartitionField, PartitionSpec
from pyiceberg.schema import Schema
from pyiceberg.transforms import IdentityTransform


def test_maintenance_compact(catalog: Catalog) -> None:
    """Full-table compaction shrinks the file count while preserving the data."""
    # Setup schema and partition spec
    from pyiceberg.types import LongType, NestedField, StringType

    schema = Schema(
        NestedField(1, "id", LongType()),
        NestedField(2, "category", StringType()),
        NestedField(3, "value", LongType()),
    )
    spec = PartitionSpec(PartitionField(source_id=2, field_id=1000, transform=IdentityTransform(), name="category"))

    # Create the namespace and table. create_namespace on an existing
    # namespace raises NamespaceAlreadyExistsError (the previous code caught
    # NoSuchNamespaceError, which made reruns against a warm catalog fail).
    try:
        catalog.create_namespace("default")
    except NamespaceAlreadyExistsError:
        pass
    table = catalog.create_table(
        "default.test_compaction",
        schema=schema,
        partition_spec=spec,
    )

    # Append many small data files: 12 appends -> 12 files over 3 partitions
    categories = ["cat1", "cat2", "cat3"]
    for i in range(12):
        table.append(
            pa.table(
                {
                    "id": list(range(i * 10, (i + 1) * 10)),
                    "category": [categories[i % 3]] * 10,
                    "value": [random.randint(1, 100) for _ in range(10)],
                }
            )
        )

    # Verify state before compaction
    before_files = list(table.scan().plan_files())
    assert len(before_files) == 12
    arrow_table_before = table.scan().to_arrow()
    assert arrow_table_before.num_rows == 120

    # Execute compaction
    table.maintenance.compact()

    # Verify state after compaction
    table.refresh()
    after_files = list(table.scan().plan_files())
    assert len(after_files) == 3  # Should be 1 optimized data file per partition

    # Since it's a small result set, verify the data itself is unchanged
    # (rows are compared sorted by id because compaction may reorder them).
    arrow_table_after = table.scan().to_arrow()
    assert arrow_table_after.num_rows == 120
    assert arrow_table_before.column_names == arrow_table_after.column_names
    assert sorted(arrow_table_before.to_pylist(), key=lambda x: x["id"]) == sorted(
        arrow_table_after.to_pylist(), key=lambda x: x["id"]
    )

    # Ensure snapshot properties specify the replace-operation
    new_snapshot = table.current_snapshot()
    assert new_snapshot is not None
    assert new_snapshot.summary is not None
    assert new_snapshot.summary.get("snapshot-type") == "replace"
    assert new_snapshot.summary.get("replace-operation") == "compaction"


def test_maintenance_compact_empty_table(catalog: Catalog) -> None:
    """Compacting an empty table is a no-op and must not create a snapshot."""
    from pyiceberg.types import LongType, NestedField, StringType

    schema = Schema(
        NestedField(1, "id", LongType()),
        NestedField(2, "category", StringType()),
    )

    # create_namespace on an existing namespace raises
    # NamespaceAlreadyExistsError (not NoSuchNamespaceError as previously
    # caught — that guard never fired, so reruns against a warm catalog failed).
    try:
        catalog.create_namespace("default")
    except NamespaceAlreadyExistsError:
        pass

    table = catalog.create_table("default.test_compaction_empty", schema=schema)
    before_snapshots = len(table.history())

    # Should safely return doing nothing
    table.maintenance.compact()

    table.refresh()
    after_snapshots = len(table.history())
    assert before_snapshots == after_snapshots  # No new snapshot should be made
4 changes: 2 additions & 2 deletions tests/table/test_snapshots.py
Original file line number Diff line number Diff line change
Expand Up @@ -398,8 +398,8 @@ def test_merge_snapshot_summaries_overwrite_summary() -> None:

def test_invalid_operation() -> None:
    """An unknown operation string must fail at Operation construction.

    REPLACE is now a supported operation in update_snapshot_summaries, so the
    failure case is constructing the enum itself. (The visible span contained
    both the pre- and post-diff lines; this keeps only the post-change version.)
    """
    with pytest.raises(ValueError) as e:
        update_snapshot_summaries(summary=Summary(Operation("invalid")))
    assert "'invalid' is not a valid Operation" in str(e.value)


def test_invalid_type() -> None:
Expand Down