104 changes: 69 additions & 35 deletions datafusion/core/src/physical_plan/sorts/sort.rs
@@ -87,8 +87,7 @@ impl ExternalSorterMetrics {
/// 3. when input is exhausted, merge all in memory batches and spills to get a total order.
struct ExternalSorter {
schema: SchemaRef,
in_mem_batches: Vec<RecordBatch>,
in_mem_batches_sorted: bool,
in_mem_batches: Vec<(bool, RecordBatch)>,
spills: Vec<NamedTempFile>,
/// Sort expressions
expr: Arc<[PhysicalSortExpr]>,
@@ -118,7 +117,6 @@ impl ExternalSorter {
Self {
schema,
in_mem_batches: vec![],
in_mem_batches_sorted: true,
spills: vec![],
expr: expr.into(),
metrics,
@@ -133,11 +131,28 @@ impl ExternalSorter {
/// Appends an unsorted [`RecordBatch`] to `in_mem_batches`
///
/// Updates memory usage metrics, and possibly triggers spilling to disk
async fn insert_batch(&mut self, input: RecordBatch) -> Result<()> {
async fn insert_batch(&mut self, mut input: RecordBatch) -> Result<()> {
if input.num_rows() == 0 {
return Ok(());
}

let mut batch_sorted = false;
if self.fetch.map_or(false, |f| f < input.num_rows()) {
Member

I'm thinking: can we make the heuristic f < input.num_rows() / 10, or some similar magic number, so that we only do the eager sort for small `k`s?

Contributor Author

I was also thinking something similar, but along the lines of f < input.num_rows() && f <= 100, so that we effectively have a hard cut-off for eager sorting after 100 rows.
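
For concreteness, a minimal sketch of such a hard cut-off (the helper name and the value 100 are only illustrative, taken from this discussion rather than from the PR):

fn should_sort_eagerly(fetch: Option<usize>, num_rows: usize) -> bool {
    // Hypothetical threshold floated above: only eagerly sort when the
    // fetch/LIMIT is both smaller than the batch and reasonably small overall.
    const EAGER_SORT_MAX_FETCH: usize = 100;
    fetch.map_or(false, |f| f < num_rows && f <= EAGER_SORT_MAX_FETCH)
}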

// Eagerly sort the batch to potentially reduce the number of rows
// after applying the fetch parameter; first perform a memory reservation
// for the sorting procedure.
let mut reservation =
Contributor

I don't think we need a new consumer here for each batch insertion -- we could just update the main reservation on self.reservation

So, something like: do the sort first, and then update the size based on the limited output:

            input = sort_batch(&input, &self.expr, self.fetch)?;
            reservation.try_grow(input.get_array_memory_size());

Note the accounting is reworked in #7130

Contributor Author (@gruuya, Aug 5, 2023)

I think the reasoning for the new consumer was that we only want to reserve a bit of memory briefly, to account for the overhead of keeping both the sorted and the original batch around at the same time. Given that in this case we drop the old batch asap (due to the input re-assignment) in favor of a smaller/truncated one, I'd agree that a new consumer is not needed here. (The same could be said about the consumer/reservation in sort_batch_stream, though in that case, since there isn't necessarily a LIMIT, we could end up holding two same-sized batches at one point in time.)

That said, reservation.try_grow(input.get_array_memory_size()) does then get called immediately below to check whether the new (sorted/truncated) batch can be kept in memory without breaking the configured limit, so I don't think there's a need for another try_grow prior to that.

Contributor

Ah that makes sense

I think you could use the (very newly added) Reservation::split() to track the short allocation and free it on drop, without needing an entirely new consumer:

https://github.com/apache/arrow-datafusion/blob/23547587c2773ddddb9b16cba1eb8ebf2eebd85a/datafusion/execution/src/memory_pool/mod.rs#L225-L231

perhaps like

let batch_reservation = self.reservation.split(input.get_array_memory_size());
input = sort_batch(&input, &self.expr, self.fetch)?;
// free allocation for previous input
// (it would also be freed on drop so the free isn't necessary but may be clearer)
batch_reservation.free();

Contributor Author (@gruuya, Aug 5, 2023)

Hmm, so I went ahead and added this, but then it seemed to me that the logic of split is backwards (for this case at least)? In particular, the size of the reservation is not free memory but used memory, right? And with split + free we'd effectively deduct a piece of memory from our accounting that was previously allocated elsewhere (without actually freeing that memory).

Atm I went with skipping reservation updates altogether in this case; the reason being that we're already holding the input batch in memory and we are about to (try to) allocate memory for it right after sorting it anyway. Hence, the sole purpose of that reservation seems to be to account for the extra piece of memory needed due to non-in-place sorting (i.e. briefly having both the old and the new batch present at the same time), and that piece of memory is guaranteed to be less than the batch size. Let me know if you'd like me to do something else.

I also changed that test's output. Now all tests should be passing, so it's probably time for the regular benchmarks.

Contributor

> Atm I went with skipping reservation updates altogether in this case;

I think this makes sense -- especially since the extra memory for batches is likely small, given that k is very often small.

MemoryConsumer::new(format!("insert_batch{}", self.partition_id))
.register(&self.runtime.memory_pool);

// TODO: This should probably be try_grow (#5885)
reservation.resize(input.get_array_memory_size());
// Maybe we should perform sorting in a parallel task to unblock the caller
Contributor

If the parallelization is working correctly, all the other cores should be busy doing something useful here -- so sorting in another task may not be warranted, but I am not sure.

Contributor Author

Ah yeah, I think that makes sense; will remove the comment.

input = sort_batch(&input, &self.expr, self.fetch)?;
Contributor (@Dandandan, Aug 8, 2023)

I wonder whether you could recover part of the perf difference by concat + sorting only once every n (say every 10) batches. The selectivity of the limit relative to the total work to be performed is much higher when sorting 81920 vs 8192 rows, and the merging to be performed afterwards will also be over fewer batches.

Contributor Author

Yeah, that's worth trying to benchmark as well. It could also be something like every n unsorted rows, or every `size` unsorted bytes, to accommodate variability in batch row counts.

Contributor

Yes, definitely.

Contributor Author

Fwiw, I did try out a variety of approaches along these lines, but still failed to eliminate the perf difference. There's probably a combination of parameters that is just right and could yield better results, but with the native Top-K operator in the works that likely won't be necessary.
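
For reference, a rough sketch of the batched-concat variant discussed above (the unsorted_buffer field and the threshold of 10 are hypothetical, not part of this PR):

// Inside insert_batch: accumulate raw batches and only concat + sort
// once every N of them, so the fetch limit is applied to a larger input.
const SORT_EVERY_N_BATCHES: usize = 10;

self.unsorted_buffer.push(input);
if self.unsorted_buffer.len() >= SORT_EVERY_N_BATCHES {
    let combined = concat_batches(&self.schema, &self.unsorted_buffer)?;
    self.unsorted_buffer.clear();
    let sorted = sort_batch(&combined, &self.expr, self.fetch)?;
    self.in_mem_batches.push((true, sorted));
}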

reservation.free();
batch_sorted = true;
}

let size = batch_byte_size(&input);
if self.reservation.try_grow(size).is_err() {
let before = self.reservation.size();
@@ -159,8 +174,7 @@ impl ExternalSorter {
}
}

self.in_mem_batches.push(input);
self.in_mem_batches_sorted = false;
self.in_mem_batches.push((batch_sorted, input));
Ok(())
}

@@ -224,7 +238,11 @@ impl ExternalSorter {
self.in_mem_sort().await?;

let spillfile = self.runtime.disk_manager.create_tmp_file("Sorting")?;
let batches = std::mem::take(&mut self.in_mem_batches);

let (sorted, batches): (Vec<bool>, Vec<RecordBatch>) =
std::mem::take(&mut self.in_mem_batches).into_iter().unzip();
assert!(sorted.iter().all(|&s| s));

spill_sorted_batches(batches, spillfile.path(), self.schema.clone()).await?;
let used = self.reservation.free();
self.metrics.spill_count.add(1);
@@ -235,23 +253,25 @@ impl ExternalSorter {

/// Sorts the in_mem_batches in place
async fn in_mem_sort(&mut self) -> Result<()> {
if self.in_mem_batches_sorted {
if self.in_mem_batches.iter().all(|(sorted, _)| *sorted) {
return Ok(());
}

self.in_mem_batches = self
.in_mem_sort_stream(self.metrics.baseline.intermediate())?
.try_collect()
.await?;
.try_collect::<Vec<_>>()
.await?
.into_iter()
.map(|batch| (true, batch))
.collect();

let size: usize = self
.in_mem_batches
.iter()
.map(|x| x.get_array_memory_size())
.map(|(_, x)| x.get_array_memory_size())
.sum();

self.reservation.resize(size);
self.in_mem_batches_sorted = true;
Ok(())
}

@@ -262,8 +282,8 @@ impl ExternalSorter {
) -> Result<SendableRecordBatchStream> {
assert_ne!(self.in_mem_batches.len(), 0);
if self.in_mem_batches.len() == 1 {
let batch = self.in_mem_batches.remove(0);
let stream = self.sort_batch_stream(batch, metrics)?;
let (sorted, batch) = self.in_mem_batches.remove(0);
let stream = self.sort_batch_stream(batch, sorted, metrics)?;
self.in_mem_batches.clear();
return Ok(stream);
}
Expand All @@ -273,16 +293,23 @@ impl ExternalSorter {
// This is a very rough heuristic and likely could be refined further
if self.reservation.size() < 1048576 {
// Concatenate memory batches together and sort
let batch = concat_batches(&self.schema, &self.in_mem_batches)?;
let (_, batches): (Vec<bool>, Vec<RecordBatch>) =
std::mem::take(&mut self.in_mem_batches).into_iter().unzip();
let batch = concat_batches(&self.schema, &batches)?;
self.in_mem_batches.clear();
return self.sort_batch_stream(batch, metrics);
// Even if all individual batches were themselves sorted the resulting concatenated one
// isn't guaranteed to be sorted, so we must perform sorting on the stream.
return self.sort_batch_stream(batch, false, metrics);
Contributor

Another approach might be to not use the concat-in-place heuristic if there are any previously sorted batches -- right now the code only checks whether the overall size is less than 1MB -- it could also check whether there is any true in in_mem_batches, and if there is, use the merge path below.

Contributor Author

Good point; will try to benchmark that change too.

Contributor Author

So I did try to test this approach as well, and saw some improvements that seemed too good to be true. I went and re-ran the benchmarks and the improvements held, until they didn't at some point 🤷🏻‍♂️ (fwiw I'm running the benchmarks on a cloud VM, not dedicated hardware).

In hindsight, the sorting benchmarks actually do not use a memory limit, so there were no spills and this code path wasn't exercised. I did try running the benchmarks with memory limits on, but then I hit a `Dictionary replacement detected when writing IPC file format` arrow error during spilling. It seems to be a general problem, as it happens on the main branch too, though I haven't investigated further.

Either way, I'll add this check now even without doing benchmarking on it because it seems it can only help.
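
A minimal sketch of what that extra check could look like (names are illustrative, not the final implementation):

// Skip the concat-in-place fast path if any batch was already eagerly
// sorted, so that earlier sorting work is preserved by the merge path.
let any_presorted = self.in_mem_batches.iter().any(|(sorted, _)| *sorted);
if self.reservation.size() < 1048576 && !any_presorted {
    // concatenate all in-memory batches and sort them in one go
} else {
    // otherwise fall through to the streaming merge below
}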

}

let streams = std::mem::take(&mut self.in_mem_batches)
.into_iter()
.map(|batch| {
.map(|(sorted, batch)| {
let metrics = self.metrics.baseline.intermediate();
Ok(spawn_buffered(self.sort_batch_stream(batch, metrics)?, 1))
Ok(spawn_buffered(
self.sort_batch_stream(batch, sorted, metrics)?,
1,
))
})
.collect::<Result<_>>()?;

@@ -299,27 +326,34 @@ impl ExternalSorter {
fn sort_batch_stream(
&self,
batch: RecordBatch,
sorted: bool,
metrics: BaselineMetrics,
) -> Result<SendableRecordBatchStream> {
let schema = batch.schema();

let mut reservation =
MemoryConsumer::new(format!("sort_batch_stream{}", self.partition_id))
.register(&self.runtime.memory_pool);

// TODO: This should probably be try_grow (#5885)
reservation.resize(batch.get_array_memory_size());

let fetch = self.fetch;
let expressions = self.expr.clone();
let stream = futures::stream::once(futures::future::lazy(move |_| {
let sorted = sort_batch(&batch, &expressions, fetch)?;
metrics.record_output(sorted.num_rows());
drop(batch);
reservation.free();
Ok(sorted)
}));
Ok(Box::pin(RecordBatchStreamAdapter::new(schema, stream)))
if !sorted {
// Reserve some memory for sorting the batch
let mut reservation =
MemoryConsumer::new(format!("sort_batch_stream{}", self.partition_id))
.register(&self.runtime.memory_pool);

// TODO: This should probably be try_grow (#5885)
reservation.resize(batch.get_array_memory_size());

let fetch = self.fetch;
let expressions = self.expr.clone();
let stream = futures::stream::once(futures::future::lazy(move |_| {
let output = sort_batch(&batch, &expressions, fetch)?;
metrics.record_output(output.num_rows());
drop(batch);
reservation.free();
Ok(output)
}));
Ok(Box::pin(RecordBatchStreamAdapter::new(schema, stream)))
} else {
let stream = futures::stream::once(futures::future::lazy(move |_| Ok(batch)));
Ok(Box::pin(RecordBatchStreamAdapter::new(schema, stream)))
}
}
}
