Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 29 additions & 2 deletions pandas/core/dtypes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -2308,8 +2308,35 @@ def kind(self) -> str:

@cache_readonly
def itemsize(self) -> int:
    """
    Return the number of bytes in this dtype.

    For Arrow-backed dtypes:

    - Fixed-width types return the pyarrow bit width divided by 8.
    - Boolean types return the NumPy dtype itemsize.
    - Variable-width and unsupported types fall back to the NumPy dtype
      itemsize.

    Returns
    -------
    int
        Size of a single element in bytes.

    Examples
    --------
    >>> import pyarrow as pa
    >>> import pandas as pd
    >>> dtype = pd.ArrowDtype(pa.int32())
    >>> dtype.itemsize
    4
    >>> dtype = pd.ArrowDtype(pa.bool_())
    >>> dtype.itemsize  # falls back to numpy dtype
    1
    """
    pa_type = self.pyarrow_dtype

    # Booleans are handled before the bit-width path: Arrow bit-packs them,
    # so ``bit_width // 8`` would floor to 0 instead of a usable byte count.
    if pa.types.is_boolean(pa_type):
        return self.numpy_dtype.itemsize

    # Only the attribute access can fail (pyarrow signals a non-fixed-width
    # type by raising), so keep the guarded region to that single lookup.
    try:
        bit_width = pa_type.bit_width
    except (ValueError, AttributeError, NotImplementedError):
        return self.numpy_dtype.itemsize

    # Fixed-width type, e.g. int32 -> 32 bits // 8 = 4 bytes.
    return bit_width // 8

def construct_array_type(self) -> type_t[ArrowExtensionArray]:
"""
Expand Down
66 changes: 66 additions & 0 deletions pandas/tests/extension/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -3702,6 +3702,72 @@ def test_pow_with_all_na_float():
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
    "type_name, expected_size",
    [
        # Integer types
        ("int8", 1),
        ("int16", 2),
        ("int32", 4),
        ("int64", 8),
        ("uint8", 1),
        ("uint16", 2),
        ("uint32", 4),
        ("uint64", 8),
        # Floating point types
        ("float16", 2),
        ("float32", 4),
        ("float64", 8),
        # Boolean
        ("bool_", 1),
        # Date and timestamp types
        ("date32", 4),
        ("date64", 8),
        ("timestamp", 8),
        # Time types
        ("time32", 4),
        ("time64", 8),
        # Decimal types
        ("decimal128", 16),
        ("decimal256", 32),
    ],
)
def test_arrow_dtype_itemsize_fixed_width(type_name, expected_size):
    """Check that fixed-width Arrow dtypes report their byte width as itemsize."""
    # GH 57948

    # Types whose pyarrow factory needs arguments are pre-built here; every
    # other name is resolved as a zero-argument factory on the ``pa`` module.
    prebuilt_types = {
        "timestamp": pa.timestamp("ns"),
        "time32": pa.time32("s"),
        "time64": pa.time64("ns"),
        "decimal128": pa.decimal128(38, 10),
        "decimal256": pa.decimal256(76, 10),
    }

    arrow_type = prebuilt_types.get(type_name)
    if arrow_type is None:
        arrow_type = getattr(pa, type_name)()
    dtype = ArrowDtype(arrow_type)

    # Booleans are compared against the NumPy itemsize rather than the
    # parametrized value.
    if type_name == "bool_":
        expected_size = dtype.numpy_dtype.itemsize

    assert dtype.itemsize == expected_size, (
        f"{type_name} expected {expected_size}, got {dtype.itemsize} "
        f"(bit_width={getattr(dtype.pyarrow_dtype, 'bit_width', 'N/A')})"
    )


@pytest.mark.parametrize("type_name", ["string", "binary", "large_string"])
def test_arrow_dtype_itemsize_variable_width(type_name):
    """Check that variable-width Arrow dtypes report the NumPy dtype itemsize."""
    # GH 57948

    type_factory = getattr(pa, type_name)
    dtype = ArrowDtype(type_factory())

    result = dtype.itemsize
    expected = dtype.numpy_dtype.itemsize
    assert result == expected


def test_cast_pontwise_result_decimal_nan():
# GH#62522 we don't want to get back null[pyarrow] here
ser = pd.Series([], dtype="float64[pyarrow]")
Expand Down
Loading