152 changes: 97 additions & 55 deletions datafusion/core/src/physical_plan/sorts/sort.rs
@@ -85,6 +85,10 @@ impl ExternalSorterMetrics {
///
/// 1. get a non-empty new batch from input
///
/// 1.2. if a `fetch` parameter has been provided, and the batch size
/// is larger than `fetch`, sort the incoming batch in order to
/// reduce its size and thus use less memory.
///
/// 2. check with the memory manager there is sufficient space to
/// buffer the batch in memory
///
/// 2.1 if memory sufficient, buffer batch in memory, go to 1.
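
Condensed into a rough Rust sketch, covering only the steps visible above (`Sorter` and `buffer_batch` are hypothetical stand-ins; `sort_batch`, `batch_byte_size`, and the reservation/spill calls mirror the real code in this diff):

// Hypothetical condensation of steps 1 through 2.1; the actual
// ExternalSorter::insert_batch further down carries more bookkeeping.
async fn buffer_batch(sorter: &mut Sorter, mut input: RecordBatch) -> Result<()> {
    // Step 1: only non-empty batches are buffered
    if input.num_rows() == 0 {
        return Ok(());
    }
    // Step 1.2: with a `fetch` limit, eagerly sort (and truncate) the
    // batch so that it occupies less memory while buffered
    let mut batch_sorted = false;
    if sorter.fetch.map_or(false, |f| f <= input.num_rows()) {
        input = sort_batch(&input, &sorter.expr, sorter.fetch)?;
        batch_sorted = true;
    }
    // Step 2: ask the memory manager for space; if denied, sort and
    // spill the current buffer to disk, then retry
    let size = batch_byte_size(&input);
    if sorter.reservation.try_grow(size).is_err() {
        sorter.spill().await?;
        sorter.reservation.try_grow(size)?;
    }
    // Step 2.1: buffer the batch in memory and return to step 1
    sorter.in_mem_batches.push((batch_sorted, input));
    Ok(())
}
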
@@ -196,9 +200,13 @@ impl ExternalSorterMetrics {
struct ExternalSorter {
/// schema of the output (and the input)
schema: SchemaRef,
/// Potentially unsorted in memory buffer
in_mem_batches: Vec<RecordBatch>,
/// if `Self::in_mem_batches` are sorted
/// A vector of tuples, with each tuple consisting of a flag
/// denoting whether the batch is sorted, and the batch itself
in_mem_batches: Vec<(bool, RecordBatch)>,
/// A flag denoting whether the inter-batch order is guaranteed;
/// note that this is a stronger signal than just having all
/// individual batches sorted—it means that we can stream the
/// entire vector of batches inside one stream for the merge-sort
in_mem_batches_sorted: bool,
/// If data has previously been spilled, the locations of the
/// spill files (in Arrow IPC format)
@@ -238,7 +246,7 @@ impl ExternalSorter {
Self {
schema,
in_mem_batches: vec![],
in_mem_batches_sorted: true,
in_mem_batches_sorted: false,
spills: vec![],
expr: expr.into(),
metrics,
@@ -253,11 +261,23 @@ impl ExternalSorter {
/// Appends an unsorted [`RecordBatch`] to `in_mem_batches`
///
/// Updates memory usage metrics, and possibly triggers spilling to disk
async fn insert_batch(&mut self, input: RecordBatch) -> Result<()> {
async fn insert_batch(&mut self, mut input: RecordBatch) -> Result<()> {
if input.num_rows() == 0 {
return Ok(());
}

let mut batch_sorted = false;
if self
.fetch
.map_or(false, |f| f <= input.num_rows() && f <= 100)
{
// Eagerly sort the batch to potentially reduce the number of rows
// after applying the fetch parameter.
// Currently only applied for fetch of 100 rows or less.
input = sort_batch(&input, &self.expr, self.fetch)?;

Dandandan (Contributor), Aug 8, 2023:
I wonder whether you could recover part of the perf difference by concat + sorting only once every n (say every 10) batches. The selectivity of the limit and the total work to be performed is much bigger for sorting 81920 vs 8192 rows, and the merging to be performed will be over fewer batches.

Contributor Author:
Yeah, that's worth trying to benchmark as well. It could also be something like every n unsorted rows, or every fixed number of unsorted bytes, to accommodate variability in batch row count.

Contributor:
Yes, definitely.

Contributor Author:
Fwiw, I did try out a variety of approaches along these lines, but still failed to eliminate the perf difference. There's probably a combination of parameters that is just right and could yield better results, but with the native Top-K operator in the works that likely won't be necessary.

batch_sorted = true;
}

let size = batch_byte_size(&input);
if self.reservation.try_grow(size).is_err() {
let before = self.reservation.size();
@@ -279,8 +299,8 @@ impl ExternalSorter {
}
}

self.in_mem_batches.push(input);
self.in_mem_batches_sorted = false;
self.in_mem_batches.push((batch_sorted, input));
self.in_mem_batches_sorted = batch_sorted && self.in_mem_batches.len() == 1;
Ok(())
}
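
For reference, the periodic concat-and-sort variant discussed in the thread above could look roughly like the following sketch. It is a hypothetical helper, not part of this PR; `SORT_EVERY_N_BATCHES` is an assumed tuning knob, and `concat_batches`/`sort_batch` are the same helpers already used in this file:

// Sketch of the reviewer's suggestion: rather than sorting every
// incoming batch, accumulate N unsorted batches, then concatenate and
// sort them once, amortizing the sort cost and leaving fewer (larger)
// sorted runs for the final merge.
const SORT_EVERY_N_BATCHES: usize = 10; // assumed tuning knob

fn maybe_compact_pending(
    pending: &mut Vec<RecordBatch>,
    schema: &SchemaRef,
    expr: &[PhysicalSortExpr],
    fetch: Option<usize>,
) -> Result<Option<RecordBatch>> {
    if pending.len() < SORT_EVERY_N_BATCHES {
        // Keep accumulating; nothing to compact yet
        return Ok(None);
    }
    // Concatenate the pending batches and sort them once; with a
    // `fetch` limit the sorted output is truncated to at most `fetch`
    // rows, shrinking memory use in a single step
    let combined = concat_batches(schema, pending.iter())?;
    pending.clear();
    Ok(Some(sort_batch(&combined, expr, fetch)?))
}

As the author's reply notes, the trigger could equally be a count of accumulated unsorted rows or bytes rather than batches.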

@@ -345,7 +365,7 @@ impl ExternalSorter {
}

/// Writes any `in_memory_batches` to a spill file and clears
/// the batches. The contents of the spil file are sorted.
/// the batches. The contents of the spill file are sorted.
///
/// Returns the amount of memory freed.
async fn spill(&mut self) -> Result<usize> {
@@ -359,7 +379,11 @@ impl ExternalSorter {
self.in_mem_sort().await?;

let spillfile = self.runtime.disk_manager.create_tmp_file("Sorting")?;
let batches = std::mem::take(&mut self.in_mem_batches);

let (sorted, batches): (Vec<bool>, Vec<RecordBatch>) =
std::mem::take(&mut self.in_mem_batches).into_iter().unzip();
assert!(sorted.iter().all(|&s| s));

spill_sorted_batches(batches, spillfile.path(), self.schema.clone()).await?;
let used = self.reservation.free();
self.metrics.spill_count.add(1);
@@ -370,23 +394,27 @@ impl ExternalSorter {

/// Sorts the in_mem_batches in place
async fn in_mem_sort(&mut self) -> Result<()> {
if self.in_mem_batches_sorted {
if self.in_mem_batches.is_empty() || self.in_mem_batches_sorted {
return Ok(());
}

self.in_mem_batches = self
.in_mem_sort_stream(self.metrics.baseline.intermediate())?
.try_collect()
.await?;
.try_collect::<Vec<_>>()
.await?
.into_iter()
.map(|batch| (true, batch))
.collect();
// We're now also guaranteed that the inter-batch order holds
self.in_mem_batches_sorted = true;

let size: usize = self
.in_mem_batches
.iter()
.map(|x| x.get_array_memory_size())
.map(|(_, x)| x.get_array_memory_size())
.sum();

self.reservation.resize(size);
self.in_mem_batches_sorted = true;
Ok(())
}

@@ -454,27 +482,33 @@ impl ExternalSorter {
) -> Result<SendableRecordBatchStream> {
assert_ne!(self.in_mem_batches.len(), 0);
if self.in_mem_batches.len() == 1 {
let batch = self.in_mem_batches.remove(0);
let stream = self.sort_batch_stream(batch, metrics)?;
let (sorted, batch) = self.in_mem_batches.remove(0);
let stream = self.sort_batch_stream(batch, sorted, metrics)?;
self.in_mem_batches.clear();
return Ok(stream);
}

// If less than 1MB of in-memory data, concatenate and sort in place
// If less than 1MB of in-memory data and no batch is sorted, concatenate and sort in place
//
// This is a very rough heuristic and likely could be refined further
if self.reservation.size() < 1048576 {
let no_batches_sorted = !self.in_mem_batches.iter().any(|(sorted, _)| *sorted);
if self.reservation.size() < 1048576 && no_batches_sorted {
// Concatenate memory batches together and sort
let batch = concat_batches(&self.schema, &self.in_mem_batches)?;
let (_, batches): (Vec<bool>, Vec<RecordBatch>) =
std::mem::take(&mut self.in_mem_batches).into_iter().unzip();
let batch = concat_batches(&self.schema, &batches)?;
self.in_mem_batches.clear();
return self.sort_batch_stream(batch, metrics);
return self.sort_batch_stream(batch, false, metrics);

Contributor:
Another approach might be to not use the concat-in-place heuristic if there are any previously sorted batches. The code now only checks that the overall size is less than 1MB; it could also check whether there is any `true` in `in_mem_batches` and, if so, use the merge path below.

Contributor Author:
Good point; will try to benchmark that change too.

Contributor Author:
So I did try to test this approach as well, and then saw some improvements that seemed too good to be true. I went and re-ran the benchmarks again and the improvements held, until they didn't at some point 🤷🏻‍♂️ (fwiw I'm running the benchmarks on a cloud VM, not dedicated hardware).

In hindsight, the sorting benchmarks actually do not use a memory limit, so there were no spills and this code path wasn't exercised. I did try running the benchmarks with memory limits on, but then I hit a `Dictionary replacement detected when writing IPC file format` arrow error during spilling. It seems like this is a general problem, as it happens on the main branch too, though I haven't investigated further.

Either way, I'll add this check now even without benchmarking it, because it seems it can only help.

}

let streams = std::mem::take(&mut self.in_mem_batches)
.into_iter()
.map(|batch| {
.map(|(sorted, batch)| {
let metrics = self.metrics.baseline.intermediate();
Ok(spawn_buffered(self.sort_batch_stream(batch, metrics)?, 1))
Ok(spawn_buffered(
self.sort_batch_stream(batch, sorted, metrics)?,
1,
))
})
.collect::<Result<_>>()?;

@@ -492,27 +526,34 @@ impl ExternalSorter {
fn sort_batch_stream(
&self,
batch: RecordBatch,
sorted: bool,
metrics: BaselineMetrics,
) -> Result<SendableRecordBatchStream> {
let schema = batch.schema();

let mut reservation =
MemoryConsumer::new(format!("sort_batch_stream{}", self.partition_id))
.register(&self.runtime.memory_pool);

// TODO: This should probably be try_grow (#5885)
reservation.resize(batch.get_array_memory_size());

let fetch = self.fetch;
let expressions = self.expr.clone();
let stream = futures::stream::once(futures::future::lazy(move |_| {
let sorted = sort_batch(&batch, &expressions, fetch)?;
metrics.record_output(sorted.num_rows());
drop(batch);
reservation.free();
Ok(sorted)
}));
Ok(Box::pin(RecordBatchStreamAdapter::new(schema, stream)))
if !sorted {
// Reserve some memory for sorting the batch
let mut reservation =
MemoryConsumer::new(format!("sort_batch_stream{}", self.partition_id))
.register(&self.runtime.memory_pool);

// TODO: This should probably be try_grow (#5885)
reservation.resize(batch.get_array_memory_size());

let fetch = self.fetch;
let expressions = self.expr.clone();
let stream = futures::stream::once(futures::future::lazy(move |_| {
let output = sort_batch(&batch, &expressions, fetch)?;
metrics.record_output(output.num_rows());
drop(batch);
reservation.free();
Ok(output)
}));
Ok(Box::pin(RecordBatchStreamAdapter::new(schema, stream)))
} else {
let stream = futures::stream::once(futures::future::lazy(move |_| Ok(batch)));
Ok(Box::pin(RecordBatchStreamAdapter::new(schema, stream)))
}
}
}

@@ -562,22 +603,6 @@ async fn spill_sorted_batches(
}
}

fn read_spill_as_stream(
path: NamedTempFile,
schema: SchemaRef,
) -> Result<SendableRecordBatchStream> {
let mut builder = RecordBatchReceiverStream::builder(schema, 2);
let sender = builder.tx();

builder.spawn_blocking(move || {
if let Err(e) = read_spill(sender, path.path()) {
error!("Failure while reading spill file: {:?}. Error: {}", path, e);
}
});

Ok(builder.build())
}

fn write_sorted(
batches: Vec<RecordBatch>,
path: PathBuf,
@@ -597,6 +622,23 @@ fn write_sorted(
Ok(())
}

/// Stream batches from spill files inside a single stream.
fn read_spill_as_stream(
path: NamedTempFile,
schema: SchemaRef,
) -> Result<SendableRecordBatchStream> {
let mut builder = RecordBatchReceiverStream::builder(schema, 2);
let sender = builder.tx();

builder.spawn_blocking(move || {
if let Err(e) = read_spill(sender, path.path()) {
error!("Failure while reading spill file: {:?}. Error: {}", path, e);
}
});

Ok(builder.build())
}

fn read_spill(sender: Sender<Result<RecordBatch>>, path: &Path) -> Result<()> {
let file = BufReader::new(File::open(path)?);
let reader = FileReader::try_new(file, None)?;