Skip to content

Schema metadata descriptions cause write failures in deltalake >= 0.19.0 #2850

@ldacey

Description

@ldacey

Environment

Delta-rs version:
0.19.1

(Tested 0.19.0 and it failed as well)

Environment:

  • Cloud provider: Local (and Google Cloud)
  • OS: M2 Macbook (and amd64 Linux containers)

Bug

What happened:
In 0.19.1, it seems we can no longer write to Delta tables using schemas that have descriptions set on individual columns.

What you expected to happen:
The write should succeed, as it did in 0.18.2 and previous versions. We include a description per column so that it is available downstream (OpenLineage etc.).

How to reproduce it:

import json
from datetime import UTC, datetime

import deltalake
import pyarrow as pa

# upgrade deltalake to 0.19.1
# Installed 1 package in 5ms
#  - deltalake==0.18.2
#  + deltalake==0.19.1


# Arrow schema used both to create the table and to write to it.
# Four of the five fields carry a "description" entry in their field-level
# metadata; "idle_seconds" deliberately has no metadata at all.
schema = pa.schema(
    [
        pa.field(
            "month_id",
            pa.int32(),
            nullable=False,
            metadata={"description": "The month in integer format (YYYYMM)"},
        ),
        pa.field(
            "date_id",
            pa.int32(),
            nullable=False,
            metadata={"description": "The date in integer format (YYYYMMDD)"},
        ),
        pa.field(
            "unique_row_hash",
            pa.string(),
            nullable=False,
            metadata={"description": "A hash of the row to identify duplicates"},
        ),
        pa.field(
            "time_on_system_seconds",
            pa.int32(),
            metadata={"description": "Total time on system in seconds"},
        ),
        pa.field(
            "idle_seconds", pa.int32()
        ),
    ]
)


# Five sample rows, keyed by column name; the keys match the schema fields.
data = {
    "month_id": [202408, 202408, 202409, 202409, 202409],
    "date_id": [20240831, 20240831, 20240901, 20240901, 20240902],
    "unique_row_hash": ["hash_1", "hash_2", "hash_3", "hash_4", "hash_5"],
    "time_on_system_seconds": [25200, 18000, 28800, 21600, 27000],
    "idle_seconds": [1800, 900, 2700, 1200, 3000],
}


# In-memory Arrow table built from the sample rows with the schema above.
table = pa.Table.from_pydict(data, schema=schema)
# The table path embeds the installed deltalake version string.
uri = f"test_error_{deltalake.__version__}"
name = "test"
description = "This is a test table."
partitions = ["month_id"]
# Passed as custom_metadata below; the value is a JSON-encoded list.
metadata = {"unique_constraint": json.dumps(["id"])}
storage_options = None

# create initial table
# The table is created from the schema only (no data is passed here), using
# the same name, description, partitioning and custom metadata as the later
# write.
deltalake.DeltaTable.create(
    table_uri=uri,
    schema=schema,
    partition_by=partitions,
    storage_options=storage_options,
    name=name,
    description=description,
    mode="overwrite",
    custom_metadata=metadata,
    configuration={
        "delta.dataSkippingStatsColumns": "month_id",
        "delta.checkpoint.writeStatsAsStruct": "true",
        "delta.checkpoint.writeStatsAsJson": "true",
    },
)
print(f"Created {uri}")


def write_data():
    """Overwrite the Delta table at ``uri`` with ``table`` using the Rust engine.

    Every argument is read from the module-level variables defined earlier in
    this script and forwarded unchanged to ``deltalake.write_deltalake``.
    """
    writer = deltalake.write_deltalake
    # Collect the keyword arguments first so the call itself stays short.
    write_options = {
        "table_or_uri": uri,
        "data": table,
        "schema": schema,
        "partition_by": partitions,
        "storage_options": storage_options,
        "name": name,
        "description": description,
        "custom_metadata": metadata,
        "mode": "overwrite",
        "engine": "rust",
    }
    writer(**write_options)


# Open the freshly created table and print its commit history.
dt = deltalake.DeltaTable(uri)
print(dt.history())

# this fails
# On deltalake 0.19.0 / 0.19.1 this raises:
#   SchemaMismatchError: Schema error: Cannot merge metadata with different
#   values for key description
# The same call succeeds on 0.18.2.
write_data()

Error:

--> 303 write_deltalake_rust(
    304     table_uri=table_uri,
    305     data=data,
    306     partition_by=partition_by,
    307     mode=mode,
    308     table=table._table if table is not None else None,
    309     schema_mode=schema_mode,
    310     predicate=predicate,
    311     name=name,
    312     description=description,
    313     configuration=configuration,
    314     storage_options=storage_options,
    315     writer_properties=writer_properties,
    316     custom_metadata=custom_metadata,
    317     post_commithook_properties=post_commithook_properties.__dict__
    318     if post_commithook_properties
    319     else None,
    320 )
    321 if table:
    322     table.update_incremental()

SchemaMismatchError: Schema error: Cannot merge metadata with different values for key description

More details:

This works if you downgrade back to 0.18.2:

# Installed 1 package in 7ms
# - deltalake==0.19.1
# + deltalake==0.18.2


[{'timestamp': 1725640131418,
  'operation': 'WRITE',
  'operationParameters': {'partitionBy': '["month_id"]', 'mode': 'Overwrite'},
  'clientVersion': 'delta-rs.0.18.1',
  'unique_constraint': '["id"]',
  'version': 2},
 {'timestamp': 1725640131413,
  'operation': 'WRITE',
  'operationParameters': {'partitionBy': '["month_id"]', 'mode': 'Overwrite'},
  'unique_constraint': '["id"]',
  'clientVersion': 'delta-rs.0.18.1',
  'version': 1},
 {'timestamp': 1725640131374,
  'operation': 'CREATE OR REPLACE TABLE',
  'operationParameters': {'protocol': '{"minReaderVersion":1,"minWriterVersion":2}',
   'mode': 'Overwrite',
   'metadata': '{"configuration":{"delta.checkpoint.writeStatsAsJson":"true","delta.checkpoint.writeStatsAsStruct":"true","delta.dataSkippingStatsColumns":"month_id"},"createdTime":1725640131371,"description":"This is a test table.","format":{"options":{},"provider":"parquet"},"id":"4715309f-90e1-428d-bc17-ec1d9a468fd7","name":"test","partitionColumns":["month_id"],"schemaString":"{\\"type\\":\\"struct\\",\\"fields\\":[{\\"name\\":\\"month_id\\",\\"type\\":\\"integer\\",\\"nullable\\":false,\\"metadata\\":{\\"description\\":\\"The month in integer format (YYYYMM)\\"}},{\\"name\\":\\"date_id\\",\\"type\\":\\"integer\\",\\"nullable\\":false,\\"metadata\\":{\\"description\\":\\"The date in integer format (YYYYMMDD)\\"}},{\\"name\\":\\"unique_row_hash\\",\\"type\\":\\"string\\",\\"nullable\\":false,\\"metadata\\":{\\"description\\":\\"A hash of the row to identify duplicates\\"}},{\\"name\\":\\"time_on_system_seconds\\",\\"type\\":\\"integer\\",\\"nullable\\":true,\\"metadata\\":{\\"description\\":\\"Total time on system in seconds\\"}},{\\"name\\":\\"idle_seconds\\",\\"type\\":\\"integer\\",\\"nullable\\":true,\\"metadata\\":{}}]}"}',
   'location': 'file://test_error_0.18.2'},
  'clientVersion': 'delta-rs.0.18.1',
  'unique_constraint': '["id"]',
  'version': 0}]

Metadata

Metadata

Assignees

No one assigned

    Labels

    binding/rust (Issues for the Rust crate) · bug (Something isn't working) · fix-awaiting-release (Issues which have a fix merged or pending) · road-to-1.0 (Going from zero to one)

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions