
Commit 3db4013

fixed pyarrow.RecordBatchFileReader
1 parent 5d17acc commit 3db4013

1 file changed: 22 additions & 5 deletions

larray_editor/arrayadapter.py

@@ -381,6 +381,7 @@ def get_data_values_and_attributes(self, h_start, v_start, h_stop, v_stop):
         chunk_values = self.get_values(h_start, v_start, h_stop, v_stop)
         if isinstance(chunk_values, np.ndarray):
             assert chunk_values.ndim == 2
+            logger.debug(f"  {chunk_values.shape=}")
         elif isinstance(chunk_values, list) and len(chunk_values) == 0:
             chunk_values = [[]]
 
@@ -865,7 +866,8 @@ def update_finite_min_max_values(self, finite_values: np.ndarray,
         # per column => axis=0
         local_vmin = np.nanmin(finite_values, axis=0, initial=np.nan)
         local_vmax = np.nanmax(finite_values, axis=0, initial=np.nan)
-        assert local_vmin.shape == (h_stop - h_start,), f"unexpected shape: {local_vmin.shape}"
+        assert local_vmin.shape == (h_stop - h_start,), \
+            f"unexpected shape: {local_vmin.shape} ({finite_values.shape=}) vs {(h_stop - h_start,)} ({h_start=} {h_stop=})"
         # vmin or self.vmin can both be nan (if the whole section data
         # is/was nan)
         global_vmin = self.vmin
@@ -1819,7 +1821,7 @@ def get_values(self, h_start, v_start, h_stop, v_stop):
         # return list(zip(*[col.to_pylist()
         #                   for col in self.data[v_start:v_stop].itercolumns()]))
 
-# contrary to other Path adapters, this one is both a File and Path adapter
+# Contrary to other Path adapters, this one is both a File *and* Path adapter
 # because it is more efficient to NOT keep the file open (because
 @adapter_for('pyarrow.RecordBatchFileReader')
 class FeatherFileAdapter(AbstractColumnarAdapter):
@@ -1860,6 +1862,18 @@ def _open_file(self, col_indices=None):
         assert isinstance(self.data, ipc.RecordBatchFileReader)
         return self.data
 
+    def _get_batches(self, start_batch, stop_batch, col_indices: list[int]) -> list:
+        """stop_batch is not included"""
+        logger.debug(f"FeatherFileAdapter._get_batches({start_batch}, "
+                     f"{stop_batch}, {col_indices})")
+        batch_indices = range(start_batch, stop_batch)
+        if isinstance(self.data, Path):
+            with self._open_file(col_indices=col_indices) as f:
+                return [f.get_batch(i) for i in batch_indices]
+        else:
+            return [self.data.get_batch(i).select(col_indices)
+                    for i in batch_indices]
+
     def shape2d(self):
         nrows = self._num_rows if self._num_rows is not None else self._estimated_num_rows
         return nrows, self._num_columns
@@ -1910,12 +1924,15 @@ def get_values(self, h_start, v_start, h_stop, v_stop):
         start_batch, stop_batch = np.searchsorted(self._batch_ends,
                                                   v=[v_start, v_stop - 1],
                                                   side='right')
+        # stop_batch is not included
+        stop_batch += 1
         chunk_start = self._batch_ends[start_batch - 1] if start_batch > 0 else 0
         col_indices = list(range(h_start, h_stop))
-        with self._open_file(col_indices=col_indices) as f:
-            batches = [f.get_batch(i)
-                       for i in range(start_batch, stop_batch + 1)]
+        batches = self._get_batches(start_batch, stop_batch, col_indices)
+        if len(batches) > 1:
             combined = pyarrow.concat_batches(batches)
+        else:
+            combined = batches[0]
         return combined[v_start - chunk_start:v_stop - chunk_start].to_pandas().values
 
 
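For context on the get_values change above: _batch_ends holds the cumulative end row of each record batch, so np.searchsorted(..., side='right') maps a half-open row slice [v_start, v_stop) to the half-open range of batch indices that covers it. A minimal sketch of that lookup, not part of the commit (batch_ends and covering_batches are illustration names):

import numpy as np

# cumulative end rows of three record batches (4, 5 and 3 rows => 12 rows)
batch_ends = np.array([4, 9, 12])

def covering_batches(v_start, v_stop):
    """Return the half-open [start_batch, stop_batch) range of batches
    covering rows [v_start, v_stop)."""
    start_batch, stop_batch = np.searchsorted(batch_ends,
                                              v=[v_start, v_stop - 1],
                                              side='right')
    # searchsorted returns the index of the *last* covering batch;
    # like the commit's new `stop_batch += 1`, make the stop exclusive
    return start_batch, stop_batch + 1

assert covering_batches(0, 4) == (0, 1)   # first batch only
assert covering_batches(4, 9) == (1, 2)   # exactly the second batch
assert covering_batches(3, 10) == (0, 3)  # spans all three batches

This exclusive-stop convention is what lets _get_batches use a plain range(start_batch, stop_batch), where the old code needed the error-prone stop_batch + 1 inline.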
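And a hedged round-trip sketch of the pyarrow.ipc calls the adapter relies on; the file path and data are invented for illustration, and RecordBatch.select and pyarrow.concat_batches require a reasonably recent pyarrow:

import pyarrow as pa
import pyarrow.ipc as ipc

# write a small Arrow IPC (Feather v2) file containing several record batches
table = pa.table({"a": list(range(10)), "b": [float(i) for i in range(10)]})
with pa.OSFile("/tmp/example.arrow", "wb") as sink:
    with ipc.new_file(sink, table.schema) as writer:
        writer.write_table(table, max_chunksize=4)  # => 3 batches of <= 4 rows

# read it back batch by batch, the way FeatherFileAdapter does
reader = ipc.open_file("/tmp/example.arrow")
assert isinstance(reader, ipc.RecordBatchFileReader)
batches = [reader.get_batch(i).select([0])  # keep only column "a"
           for i in range(reader.num_record_batches)]
combined = pa.concat_batches(batches)
print(combined.num_rows)  # 10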