-
Notifications
You must be signed in to change notification settings - Fork 545
Closed
Labels
binding/rust (Issues for the Rust crate), bug (Something isn't working), fix-awaiting-release (Issues which have a fix merged or pending), road-to-1.0 (Going from zero to one)
Description
Environment
Delta-rs version:
0.19.1
(Tested 0.19.0 and it failed as well)
Environment:
- Cloud provider: Local (and Google Cloud)
- OS: M2 Macbook (and amd64 Linux containers)
Bug
What happened:
In 0.19.1, it seems we can no longer write to Delta tables using schemas that have descriptions set on individual columns.
What you expected to happen:
We include a description per column so that it is available downstream (OpenLineage, etc.). In 0.18.2 and earlier versions we were able to write this data successfully.
How to reproduce it:
import json
from datetime import UTC, datetime
import deltalake
import pyarrow as pa
# upgrade deltalake to 0.19.1
# Installed 1 package in 5ms
# - deltalake==0.18.2
# + deltalake==0.19.1
# Arrow schema used by the reproduction. Four of the five fields carry a
# "description" entry in their field metadata; idle_seconds carries none.
schema = pa.schema(
    [
        pa.field(column, dtype, nullable=nullable, metadata=field_metadata)
        for column, dtype, nullable, field_metadata in (
            (
                "month_id",
                pa.int32(),
                False,
                {"description": "The month in integer format (YYYYMM)"},
            ),
            (
                "date_id",
                pa.int32(),
                False,
                {"description": "The date in integer format (YYYYMMDD)"},
            ),
            (
                "unique_row_hash",
                pa.string(),
                False,
                {"description": "A hash of the row to identify duplicates"},
            ),
            (
                "time_on_system_seconds",
                pa.int32(),
                True,
                {"description": "Total time on system in seconds"},
            ),
            # No metadata at all on this field.
            ("idle_seconds", pa.int32(), True, None),
        )
    ]
)
# Five sample rows stored column-wise; each key is one of the schema's fields.
data = dict(
    month_id=[202408, 202408, 202409, 202409, 202409],
    date_id=[20240831, 20240831, 20240901, 20240901, 20240902],
    unique_row_hash=["hash_1", "hash_2", "hash_3", "hash_4", "hash_5"],
    time_on_system_seconds=[25200, 18000, 28800, 21600, 27000],
    idle_seconds=[1800, 900, 2700, 1200, 3000],
)

# Build the Arrow table with the explicit schema defined earlier in the script.
table = pa.Table.from_pydict(data, schema=schema)
# Table location and properties, reused by both the create call and the write.
uri = f"test_error_{deltalake.__version__}"  # one directory per library version
name = "test"
description = "This is a test table."
partitions = ["month_id"]
metadata = {"unique_constraint": json.dumps(["id"])}
storage_options = None

# Create the initial (empty) table from the schema before any data is written.
deltalake.DeltaTable.create(
    table_uri=uri,
    schema=schema,
    mode="overwrite",
    partition_by=partitions,
    name=name,
    description=description,
    configuration={
        "delta.dataSkippingStatsColumns": "month_id",
        "delta.checkpoint.writeStatsAsStruct": "true",
        "delta.checkpoint.writeStatsAsJson": "true",
    },
    storage_options=storage_options,
    custom_metadata=metadata,
)
print(f"Created {uri}")
def write_data():
    """Overwrite the table at ``uri`` with ``table`` and print its history.

    Uses the module-level ``uri``, ``table``, ``schema``, ``partitions``,
    ``storage_options``, ``name``, ``description`` and ``metadata`` values,
    and writes through the ``rust`` engine.
    """
    deltalake.write_deltalake(
        table_or_uri=uri,
        data=table,
        schema=schema,
        mode="overwrite",
        engine="rust",
        partition_by=partitions,
        name=name,
        description=description,
        storage_options=storage_options,
        custom_metadata=metadata,
    )
    # Re-open the table after the write so the printed history includes it.
    print(deltalake.DeltaTable(uri).history())
# this fails
write_data()
Error:
--> 303 write_deltalake_rust(
304 table_uri=table_uri,
305 data=data,
306 partition_by=partition_by,
307 mode=mode,
308 table=table._table if table is not None else None,
309 schema_mode=schema_mode,
310 predicate=predicate,
311 name=name,
312 description=description,
313 configuration=configuration,
314 storage_options=storage_options,
315 writer_properties=writer_properties,
316 custom_metadata=custom_metadata,
317 post_commithook_properties=post_commithook_properties.__dict__
318 if post_commithook_properties
319 else None,
320 )
321 if table:
322 table.update_incremental()
SchemaMismatchError: Schema error: Cannot merge metadata with different values for key description
More details:
This works if you downgrade back to 0.18.2:
# Installed 1 package in 7ms
# - deltalake==0.19.1
# + deltalake==0.18.2
[{'timestamp': 1725640131418,
'operation': 'WRITE',
'operationParameters': {'partitionBy': '["month_id"]', 'mode': 'Overwrite'},
'clientVersion': 'delta-rs.0.18.1',
'unique_constraint': '["id"]',
'version': 2},
{'timestamp': 1725640131413,
'operation': 'WRITE',
'operationParameters': {'partitionBy': '["month_id"]', 'mode': 'Overwrite'},
'unique_constraint': '["id"]',
'clientVersion': 'delta-rs.0.18.1',
'version': 1},
{'timestamp': 1725640131374,
'operation': 'CREATE OR REPLACE TABLE',
'operationParameters': {'protocol': '{"minReaderVersion":1,"minWriterVersion":2}',
'mode': 'Overwrite',
'metadata': '{"configuration":{"delta.checkpoint.writeStatsAsJson":"true","delta.checkpoint.writeStatsAsStruct":"true","delta.dataSkippingStatsColumns":"month_id"},"createdTime":1725640131371,"description":"This is a test table.","format":{"options":{},"provider":"parquet"},"id":"4715309f-90e1-428d-bc17-ec1d9a468fd7","name":"test","partitionColumns":["month_id"],"schemaString":"{\\"type\\":\\"struct\\",\\"fields\\":[{\\"name\\":\\"month_id\\",\\"type\\":\\"integer\\",\\"nullable\\":false,\\"metadata\\":{\\"description\\":\\"The month in integer format (YYYYMM)\\"}},{\\"name\\":\\"date_id\\",\\"type\\":\\"integer\\",\\"nullable\\":false,\\"metadata\\":{\\"description\\":\\"The date in integer format (YYYYMMDD)\\"}},{\\"name\\":\\"unique_row_hash\\",\\"type\\":\\"string\\",\\"nullable\\":false,\\"metadata\\":{\\"description\\":\\"A hash of the row to identify duplicates\\"}},{\\"name\\":\\"time_on_system_seconds\\",\\"type\\":\\"integer\\",\\"nullable\\":true,\\"metadata\\":{\\"description\\":\\"Total time on system in seconds\\"}},{\\"name\\":\\"idle_seconds\\",\\"type\\":\\"integer\\",\\"nullable\\":true,\\"metadata\\":{}}]}"}',
'location': 'file://test_error_0.18.2'},
'clientVersion': 'delta-rs.0.18.1',
'unique_constraint': '["id"]',
'version': 0}]
Metadata
Metadata
Assignees
Labels
binding/rust (Issues for the Rust crate), bug (Something isn't working), fix-awaiting-release (Issues which have a fix merged or pending), road-to-1.0 (Going from zero to one)