
Commit 3db4013

fixed pyarrow.RecordBatchFileReader
1 parent 5d17acc commit 3db4013

1 file changed: 22 additions & 5 deletions

larray_editor/arrayadapter.py

@@ -381,6 +381,7 @@ def get_data_values_and_attributes(self, h_start, v_start, h_stop, v_stop):
         chunk_values = self.get_values(h_start, v_start, h_stop, v_stop)
         if isinstance(chunk_values, np.ndarray):
             assert chunk_values.ndim == 2
+            logger.debug(f"  {chunk_values.shape=}")
         elif isinstance(chunk_values, list) and len(chunk_values) == 0:
             chunk_values = [[]]
 
@@ -865,7 +866,8 @@ def update_finite_min_max_values(self, finite_values: np.ndarray,
         # per column => axis=0
         local_vmin = np.nanmin(finite_values, axis=0, initial=np.nan)
         local_vmax = np.nanmax(finite_values, axis=0, initial=np.nan)
-        assert local_vmin.shape == (h_stop - h_start,), f"unexpected shape: {local_vmin.shape}"
+        assert local_vmin.shape == (h_stop - h_start,), \
+            f"unexpected shape: {local_vmin.shape} ({finite_values.shape=}) vs {(h_stop - h_start,)} ({h_start=} {h_stop=})"
         # vmin or self.vmin can both be nan (if the whole section data
         # is/was nan)
         global_vmin = self.vmin
@@ -1819,7 +1821,7 @@ def get_values(self, h_start, v_start, h_stop, v_stop):
         # return list(zip(*[col.to_pylist()
         #                   for col in self.data[v_start:v_stop].itercolumns()]))
 
-# contrary to other Path adapters, this one is both a File and Path adapter
+# Contrary to other Path adapters, this one is both a File *and* Path adapter
 # because it is more efficient to NOT keep the file open (because
 @adapter_for('pyarrow.RecordBatchFileReader')
 class FeatherFileAdapter(AbstractColumnarAdapter):
@@ -1860,6 +1862,18 @@ def _open_file(self, col_indices=None):
         assert isinstance(self.data, ipc.RecordBatchFileReader)
         return self.data
 
+    def _get_batches(self, start_batch, stop_batch, col_indices: list[int]) -> list:
+        """stop_batch is not included"""
+        logger.debug(f"FeatherFileAdapter._get_batches({start_batch}, "
+                     f"{stop_batch}, {col_indices})")
+        batch_indices = range(start_batch, stop_batch)
+        if isinstance(self.data, Path):
+            with self._open_file(col_indices=col_indices) as f:
+                return [f.get_batch(i) for i in batch_indices]
+        else:
+            return [self.data.get_batch(i).select(col_indices)
+                    for i in batch_indices]
+
     def shape2d(self):
         nrows = self._num_rows if self._num_rows is not None else self._estimated_num_rows
         return nrows, self._num_columns
@@ -1910,12 +1924,15 @@ def get_values(self, h_start, v_start, h_stop, v_stop):
         start_batch, stop_batch = np.searchsorted(self._batch_ends,
                                                   v=[v_start, v_stop - 1],
                                                   side='right')
+        # stop_batch is not included
+        stop_batch += 1
         chunk_start = self._batch_ends[start_batch - 1] if start_batch > 0 else 0
         col_indices = list(range(h_start, h_stop))
-        with self._open_file(col_indices=col_indices) as f:
-            batches = [f.get_batch(i)
-                       for i in range(start_batch, stop_batch + 1)]
+        batches = self._get_batches(start_batch, stop_batch, col_indices)
+        if len(batches) > 1:
             combined = pyarrow.concat_batches(batches)
+        else:
+            combined = batches[0]
         return combined[v_start - chunk_start:v_stop - chunk_start].to_pandas().values
 
 
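For context on the get_values change above: _batch_ends holds the cumulative end row of each record batch, so np.searchsorted(..., side='right') maps a half-open row slice [v_start, v_stop) to the half-open range of batch indices that covers it. A minimal sketch of that lookup, not part of the commit (batch_ends and covering_batches are illustration names):

import numpy as np

# cumulative end rows of three record batches (4, 5 and 3 rows => 12 rows)
batch_ends = np.array([4, 9, 12])

def covering_batches(v_start, v_stop):
    """Return the half-open [start_batch, stop_batch) range of batches
    covering rows [v_start, v_stop)."""
    start_batch, stop_batch = np.searchsorted(batch_ends,
                                              v=[v_start, v_stop - 1],
                                              side='right')
    # searchsorted returns the index of the *last* covering batch;
    # like the commit's new `stop_batch += 1`, make the stop exclusive
    return start_batch, stop_batch + 1

assert covering_batches(0, 4) == (0, 1)   # first batch only
assert covering_batches(4, 9) == (1, 2)   # exactly the second batch
assert covering_batches(3, 10) == (0, 3)  # spans all three batches

This exclusive-stop convention is what lets _get_batches use a plain range(start_batch, stop_batch), where the old code needed the error-prone stop_batch + 1 inline.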
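And a hedged round-trip sketch of the pyarrow.ipc calls the adapter relies on; the file path and data are invented for illustration, and RecordBatch.select and pyarrow.concat_batches require a reasonably recent pyarrow:

import pyarrow as pa
import pyarrow.ipc as ipc

# write a small Arrow IPC (Feather v2) file containing several record batches
table = pa.table({"a": list(range(10)), "b": [float(i) for i in range(10)]})
with pa.OSFile("/tmp/example.arrow", "wb") as sink:
    with ipc.new_file(sink, table.schema) as writer:
        writer.write_table(table, max_chunksize=4)  # => 3 batches of <= 4 rows

# read it back batch by batch, the way FeatherFileAdapter does
reader = ipc.open_file("/tmp/example.arrow")
assert isinstance(reader, ipc.RecordBatchFileReader)
batches = [reader.get_batch(i).select([0])  # keep only column "a"
           for i in range(reader.num_record_batches)]
combined = pa.concat_batches(batches)
print(combined.num_rows)  # 10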