Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 30 additions & 7 deletions tiledb/core.cc
Original file line number Diff line number Diff line change
Expand Up @@ -368,11 +368,11 @@ class PyAgg {
// Set the result data buffers
auto* res_buf = &result_buffers_[attr_name][agg_name];
if ("count" == agg_name || "null_count" == agg_name ||
"mean" == agg_name) {
"mean" == agg_name || "sum" == agg_name) {
// count and null_count use uint64 and mean uses float64
*res_buf = py::array(py::dtype("uint8"), 8);
} else {
// max, min, and sum use the dtype of the attribute
// for max and min use the dtype of the attribute
py::dtype dt(tiledb_dtype(attr.type(), attr.cell_size()));
*res_buf = py::array(py::dtype("uint8"), dt.itemsize());
}
Expand Down Expand Up @@ -509,11 +509,35 @@ class PyAgg {
if ("count" == agg_name || "null_count" == agg_name)
return py::cast(*((uint64_t*)agg_buf));

// Handle sum operation with upcasting to int64/uint64/double
// This has to be done according to
// https://github.com/TileDB-Inc/TileDB/blob/d3c3e5f6c3b53b9c9153861e56db7363804da5cc/tiledb/sm/query/readers/aggregators/sum_type.h
if ("sum" == agg_name) {
switch (attr.type()) {
case TILEDB_INT8:
case TILEDB_INT16:
case TILEDB_INT32:
case TILEDB_INT64:
return py::cast(*((int64_t*)agg_buf));
case TILEDB_UINT8:
case TILEDB_UINT16:
case TILEDB_UINT32:
case TILEDB_UINT64:
return py::cast(*((uint64_t*)agg_buf));
case TILEDB_FLOAT32:
case TILEDB_FLOAT64:
return py::cast(*((double*)agg_buf));
default:
TPY_ERROR_LOC(
"[_set_result] Invalid tiledb dtype for sum "
"aggregation result")
}
}

// Handle min/max operations with original data types
switch (attr.type()) {
case TILEDB_FLOAT32:
return py::cast(
"sum" == agg_name ? *((double*)agg_buf) :
*((float*)agg_buf));
return py::cast(*((float*)agg_buf));
case TILEDB_FLOAT64:
return py::cast(*((double*)agg_buf));
case TILEDB_INT8:
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Based on the research I did, I have every reason to believe that this is the cause of the random segmentation faults we sometimes encounter during our daily test runs. For sum operations, the sum aggregation should return specific upcasted types source

Expand All @@ -534,8 +558,7 @@ class PyAgg {
return py::cast(*((uint64_t*)agg_buf));
default:
TPY_ERROR_LOC(
"[_cast_agg_result] Invalid tiledb dtype for aggregation "
"result")
"[_set_result] Invalid tiledb dtype for aggregation result")
}
}

Expand Down
131 changes: 84 additions & 47 deletions tiledb/tests/test_aggregates.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

class AggregateTest(DiskTestCase):
@pytest.mark.parametrize("sparse", [True, False])
@pytest.mark.parametrize("use_corner_cases", [False, True])
@pytest.mark.parametrize(
"dtype",
[
Expand All @@ -23,14 +24,31 @@ class AggregateTest(DiskTestCase):
np.float64,
],
)
def test_basic(self, sparse, dtype):
path = self.path("test_basic")
def test_basic(self, sparse, dtype, use_corner_cases):
"""Test aggregation with both random and corner case values."""
path = self.path(
"test_basic" if not use_corner_cases else "test_basic_corner_cases"
)
dom = tiledb.Domain(tiledb.Dim(name="d", domain=(0, 9), dtype=np.int32))
attrs = [tiledb.Attr(name="a", dtype=dtype)]
schema = tiledb.ArraySchema(domain=dom, attrs=attrs, sparse=sparse)
tiledb.Array.create(path, schema)

data = np.random.randint(1, 10, size=10)
def get_test_data(dtype, use_corner_cases, size=10):
"""Generate test data - either normal or corner case values."""
if use_corner_cases:
# For integer types smaller than 64-bit, use corner case values
if dtype in (np.uint8, np.uint16, np.uint32):
# Use maximum value for unsigned types
return np.array([np.iinfo(dtype).max] * size, dtype=dtype)
elif dtype in (np.int8, np.int16, np.int32):
# Use minimum value for signed types
return np.array([np.iinfo(dtype).min] * size, dtype=dtype)

# For float types and non-corner tests, use simple random data
return np.random.randint(1, 10, size=size).astype(dtype)

data = get_test_data(dtype, use_corner_cases)

with tiledb.open(path, "w") as A:
if sparse:
Expand All @@ -54,60 +72,79 @@ def test_basic(self, sparse, dtype):
with pytest.raises(NotImplementedError):
q.agg("count").df[:]

assert q.agg("sum")[:] == sum(expected)
assert q.agg("min")[:] == min(expected)
assert q.agg("max")[:] == max(expected)
assert q.agg("mean")[:] == sum(expected) / len(expected)
assert q.agg("count")[:] == len(expected)
# Test individual aggregations
sum_result = q.agg("sum")[:]
min_result = q.agg("min")[:]
max_result = q.agg("max")[:]
mean_result = q.agg("mean")[:]
count_result = q.agg("count")[:]

assert q.agg({"a": "sum"})[:] == sum(expected)
assert q.agg({"a": "min"})[:] == min(expected)
assert q.agg({"a": "max"})[:] == max(expected)
assert q.agg({"a": "mean"})[:] == sum(expected) / len(expected)
assert q.agg({"a": "count"})[:] == len(expected)
# Count should always work correctly
assert count_result == len(expected)

actual = q.agg(all_aggregates)[:]
assert actual["sum"] == sum(expected)
assert actual["min"] == min(expected)
assert actual["max"] == max(expected)
assert actual["mean"] == sum(expected) / len(expected)
assert actual["count"] == len(expected)
# Min/Max should return the correct values
assert min_result == min(expected)
assert max_result == max(expected)

actual = q.agg({"a": all_aggregates})[:]
assert actual["sum"] == sum(expected)
assert actual["min"] == min(expected)
assert actual["max"] == max(expected)
assert actual["mean"] == sum(expected) / len(expected)
assert actual["count"] == len(expected)
# For sum and mean, handle corner cases differently
expected_sum = np.sum(expected)
expected_mean = np.sum(expected) / len(expected)

# subarray
expected = A[4:7]["a"]
assert sum_result == expected_sum
assert mean_result == expected_mean

# Test dictionary-based aggregation
assert q.agg({"a": "sum"})[:] == sum_result
assert q.agg({"a": "min"})[:] == min_result
assert q.agg({"a": "max"})[:] == max_result
assert q.agg({"a": "count"})[:] == count_result

assert q.agg("sum")[4:7] == sum(expected)
assert q.agg("min")[4:7] == min(expected)
assert q.agg("max")[4:7] == max(expected)
assert q.agg("mean")[4:7] == sum(expected) / len(expected)
assert q.agg("count")[4:7] == len(expected)
actual = q.agg(all_aggregates)[:]
assert actual["sum"] == sum_result
assert actual["min"] == min_result
assert actual["max"] == max_result
assert actual["count"] == count_result

assert q.agg({"a": "sum"})[4:7] == sum(expected)
assert q.agg({"a": "min"})[4:7] == min(expected)
assert q.agg({"a": "max"})[4:7] == max(expected)
assert q.agg({"a": "mean"})[4:7] == sum(expected) / len(expected)
assert q.agg({"a": "count"})[4:7] == len(expected)
actual = q.agg({"a": all_aggregates})[:]
assert actual["sum"] == sum_result
assert actual["min"] == min_result
assert actual["max"] == max_result
assert actual["count"] == count_result

# subarray tests
expected_sub = A[4:7]["a"]

sub_sum = q.agg("sum")[4:7]
sub_min = q.agg("min")[4:7]
sub_max = q.agg("max")[4:7]
sub_mean = q.agg("mean")[4:7]
sub_count = q.agg("count")[4:7]

assert sub_count == len(expected_sub)
assert sub_min == min(expected_sub)
assert sub_max == max(expected_sub)
assert sub_sum == np.sum(expected_sub)
assert sub_mean == np.sum(expected_sub) / len(expected_sub)

assert q.agg({"a": "sum"})[4:7] == sub_sum
assert q.agg({"a": "min"})[4:7] == sub_min
assert q.agg({"a": "max"})[4:7] == sub_max
assert q.agg({"a": "mean"})[4:7] == sub_mean
assert q.agg({"a": "count"})[4:7] == sub_count

actual = q.agg(all_aggregates)[4:7]
assert actual["sum"] == sum(expected)
assert actual["min"] == min(expected)
assert actual["max"] == max(expected)
assert actual["mean"] == sum(expected) / len(expected)
assert actual["count"] == len(expected)
assert actual["sum"] == sub_sum
assert actual["min"] == sub_min
assert actual["max"] == sub_max
assert actual["mean"] == sub_mean
assert actual["count"] == sub_count

actual = q.agg({"a": all_aggregates})[4:7]
assert actual["sum"] == sum(expected)
assert actual["min"] == min(expected)
assert actual["max"] == max(expected)
assert actual["mean"] == sum(expected) / len(expected)
assert actual["count"] == len(expected)
assert actual["sum"] == sub_sum
assert actual["min"] == sub_min
assert actual["max"] == sub_max
assert actual["mean"] == sub_mean
assert actual["count"] == sub_count

@pytest.mark.parametrize("sparse", [True, False])
@pytest.mark.parametrize(
Expand Down
Loading