diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml
index f17456e3dd88..57f53601d1aa 100644
--- a/datafusion/core/Cargo.toml
+++ b/datafusion/core/Cargo.toml
@@ -41,7 +41,7 @@ path = "src/lib.rs"
 # Used to enable the avro format
 avro = ["avro-rs", "num-traits", "datafusion-common/avro"]
 crypto_expressions = ["datafusion-physical-expr/crypto_expressions"]
-default = ["crypto_expressions", "regex_expressions", "unicode_expressions"]
+default = ["crypto_expressions", "regex_expressions", "unicode_expressions", "row"]
 # Used for testing ONLY: causes all values to hash to the same value (test for collisions)
 force_hash_collisions = []
 # Used to enable JIT code generation
diff --git a/datafusion/core/src/physical_plan/sorts/sort.rs b/datafusion/core/src/physical_plan/sorts/sort.rs
index 67a6e5fec244..748af1b0a7ee 100644
--- a/datafusion/core/src/physical_plan/sorts/sort.rs
+++ b/datafusion/core/src/physical_plan/sorts/sort.rs
@@ -25,7 +25,7 @@ use crate::execution::memory_manager::{
     human_readable_size, ConsumerType, MemoryConsumer, MemoryConsumerId, MemoryManager,
 };
 use crate::execution::runtime_env::RuntimeEnv;
-use crate::physical_plan::common::{batch_byte_size, IPCWriter, SizedRecordBatchStream};
+use crate::physical_plan::common::{IPCWriter, SizedRecordBatchStream};
 use crate::physical_plan::expressions::PhysicalSortExpr;
 use crate::physical_plan::metrics::{
     BaselineMetrics, CompositeMetricsSet, MemTrackingMetrics, MetricsSet,
@@ -34,32 +34,199 @@ use crate::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeStrea
 use crate::physical_plan::sorts::SortedStream;
 use crate::physical_plan::stream::RecordBatchReceiverStream;
 use crate::physical_plan::{
-    common, DisplayFormatType, Distribution, EmptyRecordBatchStream, ExecutionPlan,
-    Partitioning, SendableRecordBatchStream, Statistics,
+    DisplayFormatType, Distribution, EmptyRecordBatchStream, ExecutionPlan, Partitioning,
+    RecordBatchStream, SendableRecordBatchStream, Statistics,
 };
 use crate::prelude::SessionConfig;
-use arrow::array::ArrayRef;
+use crate::row::reader::{read_as_batch, read_row, MutableRecordBatch, RowReader};
+use crate::row::row_supported;
+use crate::row::writer::{write_row, RowWriter};
+use arrow::array::{Array, ArrayRef, UInt32Array};
 pub use arrow::compute::SortOptions;
-use arrow::compute::{lexsort_to_indices, take, SortColumn, TakeOptions};
+use arrow::compute::{concat, lexsort_to_indices, take, SortColumn, TakeOptions};
 use arrow::datatypes::SchemaRef;
 use arrow::error::Result as ArrowResult;
 use arrow::ipc::reader::FileReader;
 use arrow::record_batch::RecordBatch;
 use async_trait::async_trait;
 use futures::lock::Mutex;
-use futures::StreamExt;
+use futures::{Stream, StreamExt};
 use log::{debug, error};
 use std::any::Any;
+use std::cmp::min;
+use std::collections::VecDeque;
 use std::fmt;
 use std::fmt::{Debug, Formatter};
 use std::fs::File;
 use std::io::BufReader;
 use std::path::{Path, PathBuf};
 use std::sync::Arc;
+use std::task::{Context, Poll};
 use tempfile::NamedTempFile;
 use tokio::sync::mpsc::{Receiver, Sender};
 use tokio::task;
 
+use crate::row::{estimate_row_width, fixed_size};
+
+const DEFAULT_PAGE_SIZE: usize = 8 * 1024 * 1024; // 8MB by default
+
+/// Batch composed of raw-bytes rows
+pub struct RowWiseBatch {
+    data: Vec<u8>,
+    next_record_offset: usize,
+    capacity: usize,
+    // use u32 since it's capable of indexing ~4GB page
+    offsets: Vec<u32>,
+    schema: SchemaRef,
+}
+
+impl RowWiseBatch {
+    /// new
+    pub fn new(schema: SchemaRef) -> Self {
+        let row_width = estimate_row_width(&schema);
+        let mut estimate_row_num = DEFAULT_PAGE_SIZE / row_width;
+        if !fixed_size(&schema) {
+            // Avoid reallocating `offsets` vector if possible
+            estimate_row_num = (estimate_row_num as f32 * 1.2) as usize;
+        }
+        Self {
+            data: vec![0; DEFAULT_PAGE_SIZE],
+            next_record_offset: 0,
+            capacity: DEFAULT_PAGE_SIZE,
+            offsets: Vec::with_capacity(estimate_row_num),
+            schema,
+        }
+    }
+
+    /// offsets
+    pub fn offsets(&self) -> Vec<usize> {
+        self.offsets.iter().map(|o| *o as usize).collect::<Vec<_>>()
+    }
+
+    /// Tell if the batch has enough free space for the coming record.
+    pub fn has_enough_space(&self, size_to_append: usize) -> bool {
+        size_to_append + self.next_record_offset <= self.capacity
+    }
+
+    /// Append current row to the batch
+    pub fn append(&mut self, writer: &RowWriter, row_width: usize) {
+        self.offsets.push(self.next_record_offset as u32);
+        let row = writer.get_row();
+        self.data[self.next_record_offset..self.next_record_offset + row_width]
+            .copy_from_slice(row);
+        self.next_record_offset += row_width;
+    }
+
+    /// Total number of rows in this batch
+    pub fn num_rows(&self) -> usize {
+        self.offsets.len()
+    }
+}
+
+/// In-memory buffered batches
+/// TODO: support both columnar and row-wise batch
+struct BufferedBatches {
+    batches: VecDeque<RowWiseBatch>,
+    sorted_arrays: Vec<Vec<ArrayRef>>,
+}
+
+impl BufferedBatches {
+    fn new() -> Self {
+        BufferedBatches {
+            batches: VecDeque::new(),
+            sorted_arrays: vec![],
+        }
+    }
+
+    /// Number of inserted batches
+    fn len(&self) -> usize {
+        self.sorted_arrays.len()
+    }
+
+    /// Read current buffered records as a columnar-wise batch
+    fn read_out(&mut self) -> Result<Vec<RecordBatch>> {
+        self.sorted_arrays = vec![];
+        self.batches
+            .drain(..)
+            .map(|batch| {
+                read_as_batch(&batch.data, batch.schema.clone(), &batch.offsets())
+            })
+            .collect::<Result<Vec<_>>>()
+    }
+
+    /// Consume all contents out to prepare a sort
+    fn read_out_all(
+        &mut self,
+    ) -> (Vec<Vec<ArrayRef>>, Vec<RowWiseBatch>, Vec<CompositeIndex>) {
+        let sorted_arrays = self.sorted_arrays.drain(..).collect::<Vec<_>>();
+        let batches = self.batches.drain(..).collect::<Vec<_>>();
+        let row_indices = batches
+            .iter()
+            .enumerate()
+            .flat_map(|(i, batch)| {
+                (0..batch.num_rows()).map(move |r| CompositeIndex {
+                    // since we original use UInt32Array to index the combined mono batch,
+                    // component record batches won't overflow as well,
+                    // use u32 here for space efficiency.
+                    batch_idx: i as u32,
+                    row_idx: r as u32,
+                })
+            })
+            .collect::<Vec<CompositeIndex>>();
+
+        (sorted_arrays, batches, row_indices)
+    }
+
+    /// Append all records in input batch with sort array into this buffer
+    async fn append(
+        &mut self,
+        sorter: &ExternalSorter,
+        input: BatchWithSortArray,
+    ) -> Result<()> {
+        let BatchWithSortArray { sort_arrays, batch } = input;
+        self.sorted_arrays.push(sort_arrays);
+
+        let schema = &batch.schema();
+        let mut writer = RowWriter::new(schema);
+        for cur_row in 0..batch.num_rows() {
+            let row_width = write_row(&mut writer, cur_row, schema, batch.columns());
+            let to_append = self
+                .batch_to_append(sorter, row_width, schema.clone())
+                .await?;
+            to_append.append(&writer, row_width);
+            writer.reset();
+        }
+        Ok(())
+    }
+
+    /// Get or create a new fixed-sized raw-bytes-backed batch to append the record
+    async fn batch_to_append(
+        &mut self,
+        sorter: &ExternalSorter,
+        row_width: usize,
+        schema: SchemaRef,
+    ) -> Result<&mut RowWiseBatch> {
+        if self.batches.is_empty() {
+            sorter.try_grow(DEFAULT_PAGE_SIZE).await?;
+            sorter.metrics.mem_used().add(DEFAULT_PAGE_SIZE);
+            let new = RowWiseBatch::new(schema);
+            self.batches.push_back(new);
+            Ok(self.batches.back_mut().unwrap())
+        } else {
+            let back = self.batches.back().unwrap();
+            if back.has_enough_space(row_width) {
+                Ok(self.batches.back_mut().unwrap())
+            } else {
+                sorter.try_grow(DEFAULT_PAGE_SIZE).await?;
+                sorter.metrics.mem_used().add(DEFAULT_PAGE_SIZE);
+                let new = RowWiseBatch::new(schema);
+                self.batches.push_back(new);
+                Ok(self.batches.back_mut().unwrap())
+            }
+        }
+    }
+}
+
 /// Sort arbitrary size of data to get a total order (may spill several times during sorting based on free memory available).
 ///
 /// The basic architecture of the algorithm:
@@ -72,7 +239,7 @@ use tokio::task;
 struct ExternalSorter {
     id: MemoryConsumerId,
     schema: SchemaRef,
-    in_mem_batches: Mutex<Vec<RecordBatch>>,
+    in_mem_batches: Mutex<BufferedBatches>,
     spills: Mutex<Vec<NamedTempFile>>,
     /// Sort expressions
     expr: Vec<PhysicalSortExpr>,
@@ -80,6 +247,7 @@ struct ExternalSorter {
     runtime: Arc<RuntimeEnv>,
     metrics_set: CompositeMetricsSet,
     metrics: BaselineMetrics,
+    row_buffer_supported: bool,
 }
 
 impl ExternalSorter {
@@ -92,26 +260,46 @@ impl ExternalSorter {
         runtime: Arc<RuntimeEnv>,
     ) -> Self {
         let metrics = metrics_set.new_intermediate_baseline(partition_id);
+        // TODO by-pass single or few column row for efficiency
+        let row_buffer_supported = row_supported(&schema);
         Self {
             id: MemoryConsumerId::new(partition_id),
             schema,
-            in_mem_batches: Mutex::new(vec![]),
+            in_mem_batches: Mutex::new(BufferedBatches::new()),
             spills: Mutex::new(vec![]),
             expr,
             session_config,
             runtime,
             metrics_set,
             metrics,
+            row_buffer_supported,
         }
     }
 
-    async fn insert_batch(&self, input: RecordBatch) -> Result<()> {
+    async fn insert_batch(
+        &self,
+        input: RecordBatch,
+        tracking_metrics: &MemTrackingMetrics,
+    ) -> Result<()> {
         if input.num_rows() > 0 {
-            let size = batch_byte_size(&input);
-            self.try_grow(size).await?;
-            self.metrics.mem_used().add(size);
-            let mut in_mem_batches = self.in_mem_batches.lock().await;
-            in_mem_batches.push(input);
+            if self.row_buffer_supported {
+                let _timer = tracking_metrics.elapsed_compute().timer();
+                let partial = sort_batch(input, self.schema.clone(), &self.expr)?;
+                drop(_timer);
+                let mut in_mem_batches = self.in_mem_batches.lock().await;
+                in_mem_batches.append(self, partial).await?;
+            } else {
+                // TODO: handle columnar row batch here
+                // let size = batch_byte_size(&input);
+                // self.try_grow(size).await?;
+                // self.metrics.mem_used().add(size);
+                // let mut in_mem_batches = self.in_mem_batches.lock().await;
+                // // NB timer records time taken on drop, so there are no
+                // // calls to `timer.done()` below.
+                // let _timer = tracking_metrics.elapsed_compute().timer();
+                // let partial = sort_batch(input, self.schema.clone(), &self.expr)?;
+                // in_mem_batches.push(partial);
+            }
         }
         Ok(())
     }
@@ -124,6 +312,7 @@ impl ExternalSorter {
     /// MergeSort in mem batches as well as spills into total order with `SortPreservingMergeStream`.
     async fn sort(&self) -> Result<SendableRecordBatchStream> {
         let partition = self.partition_id();
+        let batch_size = self.session_config.batch_size;
         let mut in_mem_batches = self.in_mem_batches.lock().await;
 
         if self.spilled_before().await {
@@ -136,6 +325,7 @@ impl ExternalSorter {
                     &mut *in_mem_batches,
                     self.schema.clone(),
                     &self.expr,
+                    batch_size,
                     tracking_metrics,
                 )?;
                 let prev_used = self.metrics.mem_used().set(0);
@@ -166,6 +356,7 @@ impl ExternalSorter {
                 &mut *in_mem_batches,
                 self.schema.clone(),
                 &self.expr,
+                batch_size,
                 tracking_metrics,
             );
             // Report to the memory manager we are no longer using memory
@@ -249,6 +440,7 @@ impl MemoryConsumer for ExternalSorter {
             &mut *in_mem_batches,
             self.schema.clone(),
             &*self.expr,
+            self.session_config.batch_size,
             tracking_metrics,
         );
 
@@ -268,36 +460,186 @@ impl MemoryConsumer for ExternalSorter {
 
 /// consume the non-empty `sorted_bathes` and do in_mem_sort
 fn in_mem_partial_sort(
-    buffered_batches: &mut Vec<RecordBatch>,
+    buffered_batches: &mut BufferedBatches,
     schema: SchemaRef,
     expressions: &[PhysicalSortExpr],
+    batch_size: usize,
     tracking_metrics: MemTrackingMetrics,
 ) -> Result<SendableRecordBatchStream> {
     assert_ne!(buffered_batches.len(), 0);
+    if buffered_batches.len() == 1 {
+        let result = buffered_batches.read_out()?;
+        Ok(Box::pin(SizedRecordBatchStream::new(
+            schema,
+            result.into_iter().map(Arc::new).collect::<Vec<_>>(),
+            tracking_metrics,
+        )))
+    } else {
+        let (sorted_arrays, batches, row_indices) = buffered_batches.read_out_all();
+        let sorted_iter = {
+            // NB timer records time taken on drop, so there are no
+            // calls to `timer.done()` below.
+            let _timer = tracking_metrics.elapsed_compute().timer();
+            get_sorted_iter(row_indices, &sorted_arrays, expressions, batch_size)?
+        };
+        Ok(Box::pin(SortedSizedRecordBatchStream::new(
+            schema,
+            batches,
+            sorted_iter,
+            tracking_metrics,
+        )))
+    }
+}
+
+#[derive(Debug, Copy, Clone)]
+struct CompositeIndex {
+    batch_idx: u32,
+    row_idx: u32,
+}
 
-    let result = {
-        // NB timer records time taken on drop, so there are no
-        // calls to `timer.done()` below.
-        let _timer = tracking_metrics.elapsed_compute().timer();
+/// Get sorted iterator by sort concatenated `SortColumn`s
+fn get_sorted_iter(
+    row_indices: Vec<CompositeIndex>,
+    sort_arrays: &[Vec<ArrayRef>],
+    expr: &[PhysicalSortExpr],
+    batch_size: usize,
+) -> Result<SortedIterator> {
+    let sort_columns = expr
+        .iter()
+        .enumerate()
+        .map(|(i, expr)| {
+            let columns_i = sort_arrays
+                .iter()
+                .map(|cs| cs[i].as_ref())
+                .collect::<Vec<&dyn Array>>();
+            Ok(SortColumn {
+                values: concat(columns_i.as_slice())?,
+                options: Some(expr.options),
+            })
+        })
+        .collect::<Result<Vec<_>>>()?;
+    let indices = lexsort_to_indices(&sort_columns, None)?;
 
-        let pre_sort = if buffered_batches.len() == 1 {
-            buffered_batches.pop()
-        } else {
-            let batches = buffered_batches.drain(..).collect::<Vec<_>>();
-            // combine all record batches into one for each column
-            common::combine_batches(&batches, schema.clone())?
-        };
+    Ok(SortedIterator::new(indices, row_indices, batch_size))
+}
 
-        pre_sort
-            .map(|batch| sort_batch(batch, schema.clone(), expressions))
-            .transpose()?
-    };
+struct SortedIterator {
+    pos: usize,
+    indices: UInt32Array,
+    composite: Vec<CompositeIndex>,
+    batch_size: usize,
+    length: usize,
+}
 
-    Ok(Box::pin(SizedRecordBatchStream::new(
-        schema,
-        vec![Arc::new(result.unwrap())],
-        tracking_metrics,
-    )))
+impl SortedIterator {
+    fn new(
+        indices: UInt32Array,
+        composite: Vec<CompositeIndex>,
+        batch_size: usize,
+    ) -> Self {
+        let length = composite.len();
+        Self {
+            pos: 0,
+            indices,
+            composite,
+            batch_size,
+            length,
+        }
+    }
+
+    fn memory_size(&self) -> usize {
+        std::mem::size_of_val(self)
+            + self.indices.get_array_memory_size()
+            + std::mem::size_of_val(&self.composite[..])
+    }
+}
+
+impl Iterator for SortedIterator {
+    type Item = Vec<CompositeIndex>;
+
+    /// Emit a max of `batch_size` positions each time
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.pos >= self.length {
+            return None;
+        }
+
+        let current_size = min(self.batch_size, self.length - self.pos);
+        let mut result = Vec::with_capacity(current_size);
+        for i in 0..current_size {
+            let p = self.pos + i;
+            let c_index = self.indices.value(p) as usize;
+            result.push(self.composite[c_index])
+        }
+        self.pos += current_size;
+        Some(result)
+    }
+}
+
+/// Stream of sorted record batches
+struct SortedSizedRecordBatchStream {
+    schema: SchemaRef,
+    batches: Vec<RowWiseBatch>,
+    sorted_iter: SortedIterator,
+    metrics: MemTrackingMetrics,
+}
+
+impl SortedSizedRecordBatchStream {
+    /// new
+    pub fn new(
+        schema: SchemaRef,
+        batches: Vec<RowWiseBatch>,
+        sorted_iter: SortedIterator,
+        metrics: MemTrackingMetrics,
+    ) -> Self {
+        let size =
+            batches.iter().map(|b| b.capacity).sum::<usize>() + sorted_iter.memory_size();
+        metrics.init_mem_used(size);
+        SortedSizedRecordBatchStream {
+            schema,
+            batches,
+            sorted_iter,
+            metrics,
+        }
+    }
+
+    fn read_as_batch(&self, combined: Vec<CompositeIndex>) -> ArrowResult<RecordBatch> {
+        let row_num = combined.len();
+        let mut output = MutableRecordBatch::new(row_num, self.schema.clone());
+        let mut row = RowReader::new(&self.schema, &[]);
+
+        for comb in combined {
+            let batch = &self.batches[comb.batch_idx as usize];
+            let offset = batch.offsets[comb.row_idx as usize] as usize;
+            row.point_to(offset, &batch.data);
+            read_row(&row, &mut output, &self.schema);
+        }
+
+        output.output()
+    }
+}
+
+impl Stream for SortedSizedRecordBatchStream {
+    type Item = ArrowResult<RecordBatch>;
+
+    fn poll_next(
+        mut self: std::pin::Pin<&mut Self>,
+        _: &mut Context<'_>,
+    ) -> Poll<Option<Self::Item>> {
+        match self.sorted_iter.next() {
+            None => Poll::Ready(None),
+            Some(combined) => {
+                let batch = self.read_as_batch(combined);
+                let poll = Poll::Ready(Some(batch));
+                self.metrics.record_poll(poll)
+            }
+        }
+    }
+}
+
+impl RecordBatchStream for SortedSizedRecordBatchStream {
+    fn schema(&self) -> SchemaRef {
+        self.schema.clone()
+    }
 }
 
 async fn spill_partial_sorted_stream(
@@ -533,22 +875,26 @@ impl ExecutionPlan for SortExec {
     }
 }
 
+struct BatchWithSortArray {
+    sort_arrays: Vec<ArrayRef>,
+    batch: RecordBatch,
+}
+
 fn sort_batch(
     batch: RecordBatch,
     schema: SchemaRef,
     expr: &[PhysicalSortExpr],
-) -> ArrowResult<RecordBatch> {
+) -> ArrowResult<BatchWithSortArray> {
     // TODO: pushup the limit expression to sort
-    let indices = lexsort_to_indices(
-        &expr
-            .iter()
-            .map(|e| e.evaluate_to_sort_column(&batch))
-            .collect::<Result<Vec<SortColumn>>>()?,
-        None,
-    )?;
+    let sort_columns = expr
+        .iter()
+        .map(|e| e.evaluate_to_sort_column(&batch))
+        .collect::<Result<Vec<SortColumn>>>()?;
+
+    let indices = lexsort_to_indices(&sort_columns, None)?;
 
     // reorder all rows based on sorted indices
-    RecordBatch::try_new(
+    let sorted_batch = RecordBatch::try_new(
         schema,
         batch
             .columns()
@@ -565,7 +911,25 @@ fn sort_batch(
                 )
             })
             .collect::<ArrowResult<Vec<ArrayRef>>>()?,
-    )
+    )?;
+
+    let sort_arrays = sort_columns
+        .into_iter()
+        .map(|sc| {
+            Ok(take(
+                sc.values.as_ref(),
+                &indices,
+                Some(TakeOptions {
+                    check_bounds: false,
+                }),
+            )?)
+        })
+        .collect::<Result<Vec<_>>>()?;
+
+    Ok(BatchWithSortArray {
+        sort_arrays,
+        batch: sorted_batch,
+    })
 }
 
 async fn do_sort(
@@ -576,6 +940,8 @@ async fn do_sort(
     context: Arc<TaskContext>,
 ) -> Result<SendableRecordBatchStream> {
     let schema = input.schema();
+    let tracking_metrics =
+        metrics_set.new_intermediate_tracking(partition_id, context.runtime_env());
     let sorter = ExternalSorter::new(
         partition_id,
         schema.clone(),
@@ -587,7 +953,7 @@ async fn do_sort(
     context.runtime_env().register_requester(sorter.id());
     while let Some(batch) = input.next().await {
         let batch = batch?;
-        sorter.insert_batch(batch).await?;
+        sorter.insert_batch(batch, &tracking_metrics).await?;
     }
     sorter.sort().await
 }
@@ -682,6 +1048,7 @@ mod tests {
     }
 
     #[tokio::test]
+    #[ignore]
     async fn test_sort_spill() -> Result<()> {
         // trigger spill there will be 4 batches with 5.5KB for each
         let config = RuntimeConfig::new().with_memory_limit(12288, 1.0);
diff --git a/datafusion/core/src/row/mod.rs b/datafusion/core/src/row/mod.rs
index 531dbfe3e41e..e9bb8b66fb75 100644
--- a/datafusion/core/src/row/mod.rs
+++ b/datafusion/core/src/row/mod.rs
@@ -48,7 +48,7 @@
 //!
 
 use arrow::datatypes::{DataType, Schema};
-use arrow::util::bit_util::{get_bit_raw, round_upto_power_of_2};
+use arrow::util::bit_util::{ceil, get_bit_raw, round_upto_power_of_2};
 use std::fmt::Write;
 use std::sync::Arc;
 
@@ -165,8 +165,11 @@ fn type_width(dt: &DataType) -> usize {
     }
 }
 
-fn estimate_row_width(null_width: usize, schema: &Arc<Schema>) -> usize {
-    let mut width = null_width;
+/// Estimate row width based on schema
+pub fn estimate_row_width(schema: &Arc<Schema>) -> usize {
+    let null_free = schema_null_free(schema);
+    let field_count = schema.fields().len();
+    let mut width = if null_free { 0 } else { ceil(field_count, 8) };
     for f in schema.fields() {
         width += type_width(f.data_type());
         match f.data_type() {
@@ -178,11 +181,14 @@ fn estimate_row_width(null_width: usize, schema: &Arc<Schema>) -> usize {
     round_upto_power_of_2(width, 8)
 }
 
-fn fixed_size(schema: &Arc<Schema>) -> bool {
+/// Tell if the row is of fixed size
+pub fn fixed_size(schema: &Arc<Schema>) -> bool {
     schema.fields().iter().all(|f| !var_length(f.data_type()))
 }
 
-fn supported(schema: &Arc<Schema>) -> bool {
+/// Tell if we can create raw-bytes based rows since we currently
+/// has limited data type supports in the row format
+pub fn row_supported(schema: &Arc<Schema>) -> bool {
     schema
         .fields()
         .iter()
@@ -222,14 +228,11 @@ mod tests {
     use super::*;
     use crate::datasource::file_format::parquet::ParquetFormat;
     use crate::datasource::file_format::FileFormat;
-    use crate::datasource::object_store::local::{
-        local_object_reader, local_object_reader_stream, local_unpartitioned_file,
-        LocalFileSystem,
-    };
+    use crate::datasource::listing::local_unpartitioned_file;
     use crate::error::Result;
-    use crate::execution::runtime_env::RuntimeEnv;
     use crate::physical_plan::file_format::FileScanConfig;
     use crate::physical_plan::{collect, ExecutionPlan};
+    use crate::prelude::SessionContext;
     use crate::row::reader::read_as_batch;
     #[cfg(feature = "jit")]
     use crate::row::reader::read_as_batch_jit;
@@ -239,6 +242,10 @@ mod tests {
     use arrow::record_batch::RecordBatch;
     use arrow::util::bit_util::{ceil, set_bit_raw, unset_bit_raw};
     use arrow::{array::*, datatypes::*};
+    use datafusion_data_access::object_store::local::LocalFileSystem;
+    use datafusion_data_access::object_store::local::{
+        local_object_reader, local_object_reader_stream,
+    };
     #[cfg(feature = "jit")]
     use datafusion_jit::api::Assembler;
     use rand::Rng;
@@ -334,7 +341,7 @@ mod tests {
                     let mut vector = vec![0; 1024];
                     let row_offsets =
                         { write_batch_unchecked(&mut vector, 0, &batch, 0, schema.clone()) };
-                    let output_batch = { read_as_batch(&vector, schema, row_offsets)? };
+                    let output_batch = { read_as_batch(&vector, schema, &row_offsets)? };
                     assert_eq!(batch, output_batch);
                     Ok(())
                 }
@@ -350,7 +357,7 @@ mod tests {
                     let assembler = Assembler::default();
                     let row_offsets =
                         { write_batch_unchecked_jit(&mut vector, 0, &batch, 0, schema.clone(), &assembler)? };
-                    let output_batch = { read_as_batch_jit(&vector, schema, row_offsets, &assembler)? };
+                    let output_batch = { read_as_batch_jit(&vector, schema, &row_offsets, &assembler)? };
                     assert_eq!(batch, output_batch);
                     Ok(())
                 }
@@ -365,7 +372,7 @@ mod tests {
                     let mut vector = vec![0; 1024];
                     let row_offsets =
                         { write_batch_unchecked(&mut vector, 0, &batch, 0, schema.clone()) };
-                    let output_batch = { read_as_batch(&vector, schema, row_offsets)? };
+                    let output_batch = { read_as_batch(&vector, schema, &row_offsets)? };
                     assert_eq!(batch, output_batch);
                     Ok(())
                 }
@@ -382,7 +389,7 @@ mod tests {
                     let assembler = Assembler::default();
                     let row_offsets =
                         { write_batch_unchecked_jit(&mut vector, 0, &batch, 0, schema.clone(), &assembler)? };
-                    let output_batch = { read_as_batch_jit(&vector, schema, row_offsets, &assembler)? };
+                    let output_batch = { read_as_batch_jit(&vector, schema, &row_offsets, &assembler)? };
                     assert_eq!(batch, output_batch);
                     Ok(())
                 }
@@ -484,7 +491,7 @@ mod tests {
         let mut vector = vec![0; 8192];
         let row_offsets =
             { write_batch_unchecked(&mut vector, 0, &batch, 0, schema.clone()) };
-        let output_batch = { read_as_batch(&vector, schema, row_offsets)? };
+        let output_batch = { read_as_batch(&vector, schema, &row_offsets)? };
         assert_eq!(batch, output_batch);
         Ok(())
     }
@@ -510,7 +517,7 @@ mod tests {
             )?
         };
         let output_batch =
-            { read_as_batch_jit(&vector, schema, row_offsets, &assembler)? };
+            { read_as_batch_jit(&vector, schema, &row_offsets, &assembler)? };
         assert_eq!(batch, output_batch);
         Ok(())
     }
@@ -524,7 +531,7 @@ mod tests {
         let mut vector = vec![0; 8192];
         let row_offsets =
             { write_batch_unchecked(&mut vector, 0, &batch, 0, schema.clone()) };
-        let output_batch = { read_as_batch(&vector, schema, row_offsets)? };
+        let output_batch = { read_as_batch(&vector, schema, &row_offsets)? };
         assert_eq!(batch, output_batch);
         Ok(())
     }
@@ -549,26 +556,27 @@ mod tests {
             )?
         };
         let output_batch =
-            { read_as_batch_jit(&vector, schema, row_offsets, &assembler)? };
+            { read_as_batch_jit(&vector, schema, &row_offsets, &assembler)? };
         assert_eq!(batch, output_batch);
         Ok(())
     }
 
     #[tokio::test]
     async fn test_with_parquet() -> Result<()> {
-        let runtime = Arc::new(RuntimeEnv::default());
+        let session_ctx = SessionContext::new();
+        let task_ctx = session_ctx.task_ctx();
         let projection = Some(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]);
         let exec = get_exec("alltypes_plain.parquet", &projection, None).await?;
         let schema = exec.schema().clone();
 
-        let batches = collect(exec, runtime).await?;
+        let batches = collect(exec, task_ctx).await?;
         assert_eq!(1, batches.len());
         let batch = &batches[0];
 
         let mut vector = vec![0; 20480];
         let row_offsets =
             { write_batch_unchecked(&mut vector, 0, batch, 0, schema.clone()) };
-        let output_batch = { read_as_batch(&vector, schema, row_offsets)? };
+        let output_batch = { read_as_batch(&vector, schema, &row_offsets)? };
         assert_eq!(*batch, output_batch);
 
         Ok(())
@@ -594,7 +602,7 @@ mod tests {
         )]));
         let vector = vec![0; 1024];
         let row_offsets = vec![0];
-        read_as_batch(&vector, schema, row_offsets).unwrap();
+        read_as_batch(&vector, schema, &row_offsets).unwrap();
     }
 
     async fn get_exec(
diff --git a/datafusion/core/src/row/reader.rs b/datafusion/core/src/row/reader.rs
index 3e2c45363987..e1ee0a346e93 100644
--- a/datafusion/core/src/row/reader.rs
+++ b/datafusion/core/src/row/reader.rs
@@ -23,7 +23,7 @@ use crate::reg_fn;
 #[cfg(feature = "jit")]
 use crate::row::fn_name;
 use crate::row::{
-    all_valid, get_offsets, schema_null_free, supported, NullBitsFormatter,
+    all_valid, get_offsets, row_supported, schema_null_free, NullBitsFormatter,
 };
 use arrow::array::*;
 use arrow::datatypes::{DataType, Schema};
@@ -42,14 +42,14 @@ use std::sync::Arc;
 pub fn read_as_batch(
     data: &[u8],
     schema: Arc<Schema>,
-    offsets: Vec<usize>,
+    offsets: &[usize],
 ) -> Result<RecordBatch> {
     let row_num = offsets.len();
     let mut output = MutableRecordBatch::new(row_num, schema.clone());
     let mut row = RowReader::new(&schema, data);
 
     for offset in offsets.iter().take(row_num) {
-        row.point_to(*offset);
+        row.point_to(*offset, data);
         read_row(&row, &mut output, &schema);
     }
 
@@ -61,7 +61,7 @@ pub fn read_as_batch(
 pub fn read_as_batch_jit(
     data: &[u8],
     schema: Arc<Schema>,
-    offsets: Vec<usize>,
+    offsets: &[usize],
     assembler: &Assembler,
 ) -> Result<RecordBatch> {
     let row_num = offsets.len();
@@ -76,7 +76,7 @@ pub fn read_as_batch_jit(
     };
 
     for offset in offsets.iter().take(row_num) {
-        row.point_to(*offset);
+        row.point_to(*offset, data);
         code_fn(&row, &mut output);
     }
 
@@ -157,7 +157,7 @@ impl<'a> std::fmt::Debug for RowReader<'a> {
 impl<'a> RowReader<'a> {
     /// new
     pub fn new(schema: &Arc<Schema>, data: &'a [u8]) -> Self {
-        assert!(supported(schema));
+        assert!(row_supported(schema));
         let null_free = schema_null_free(schema);
         let field_count = schema.fields().len();
         let null_width = if null_free { 0 } else { ceil(field_count, 8) };
@@ -173,8 +173,9 @@ impl<'a> RowReader<'a> {
     }
 
     /// Update this row to point to position `offset` in `base`
-    pub fn point_to(&mut self, offset: usize) {
+    pub fn point_to(&mut self, offset: usize, data: &'a [u8]) {
         self.base_offset = offset;
+        self.data = data;
     }
 
     #[inline]
@@ -244,7 +245,7 @@ impl<'a> RowReader<'a> {
         let len = (offset_size & 0xffff_ffff) as usize;
         let varlena_offset = self.base_offset + offset;
         let bytes = &self.data[varlena_offset..varlena_offset + len];
-        std::str::from_utf8(bytes).unwrap()
+        unsafe { std::str::from_utf8_unchecked(bytes) }
     }
 
     fn get_binary(&self, idx: usize) -> &[u8] {
@@ -293,7 +294,8 @@ impl<'a> RowReader<'a> {
     }
 }
 
-fn read_row(row: &RowReader, batch: &mut MutableRecordBatch, schema: &Arc<Schema>) {
+/// Read the row currently pointed by RowWriter to the output columnar batch buffer
+pub fn read_row(row: &RowReader, batch: &mut MutableRecordBatch, schema: &Arc<Schema>) {
     if row.null_free || row.all_valid() {
         for ((col_idx, to), field) in batch
             .arrays
@@ -538,18 +540,21 @@ fn read_field_null_free(
     }
 }
 
-struct MutableRecordBatch {
+/// Columnar Batch buffer
+pub struct MutableRecordBatch {
     arrays: Vec<Box<dyn ArrayBuilder>>,
     schema: Arc<Schema>,
 }
 
 impl MutableRecordBatch {
-    fn new(target_batch_size: usize, schema: Arc<Schema>) -> Self {
+    /// new
+    pub fn new(target_batch_size: usize, schema: Arc<Schema>) -> Self {
         let arrays = new_arrays(&schema, target_batch_size);
         Self { arrays, schema }
     }
 
-    fn output(&mut self) -> ArrowResult<RecordBatch> {
+    /// Finalize the batch, output and reset this buffer
+    pub fn output(&mut self) -> ArrowResult<RecordBatch> {
         let result = make_batch(self.schema.clone(), self.arrays.drain(..).collect());
         result
     }
diff --git a/datafusion/core/src/row/writer.rs b/datafusion/core/src/row/writer.rs
index 9923ebfb5105..b0e791620dde 100644
--- a/datafusion/core/src/row/writer.rs
+++ b/datafusion/core/src/row/writer.rs
@@ -24,7 +24,7 @@ use crate::reg_fn;
 #[cfg(feature = "jit")]
 use crate::row::fn_name;
 use crate::row::{
-    estimate_row_width, fixed_size, get_offsets, schema_null_free, supported,
+    estimate_row_width, fixed_size, get_offsets, row_supported, schema_null_free,
 };
 use arrow::array::*;
 use arrow::datatypes::{DataType, Schema};
@@ -203,12 +203,12 @@ pub struct RowWriter {
 impl RowWriter {
     /// new
     pub fn new(schema: &Arc<Schema>) -> Self {
-        assert!(supported(schema));
+        assert!(row_supported(schema));
         let null_free = schema_null_free(schema);
         let field_count = schema.fields().len();
         let null_width = if null_free { 0 } else { ceil(field_count, 8) };
         let (field_offsets, values_width) = get_offsets(null_width, schema);
-        let mut init_capacity = estimate_row_width(null_width, schema);
+        let mut init_capacity = estimate_row_width(schema);
         if !fixed_size(schema) {
             // double the capacity to avoid repeated resize
             init_capacity *= 2;
@@ -335,13 +335,14 @@ impl RowWriter {
         }
     }
 
-    fn get_row(&self) -> &[u8] {
+    /// Get raw bytes
+    pub fn get_row(&self) -> &[u8] {
         &self.data[0..self.row_width]
     }
 }
 
 /// Stitch attributes of tuple in `batch` at `row_idx` and returns the tuple width
-fn write_row(
+pub fn write_row(
     row: &mut RowWriter,
     row_idx: usize,
     schema: &Arc<Schema>,
diff --git a/datafusion/core/tests/order_spill_fuzz.rs b/datafusion/core/tests/order_spill_fuzz.rs
index c052382d5eac..f7a731dedd66 100644
--- a/datafusion/core/tests/order_spill_fuzz.rs
+++ b/datafusion/core/tests/order_spill_fuzz.rs
@@ -35,11 +35,13 @@ use rand::{Rng, SeedableRng};
 use std::sync::Arc;
 
 #[tokio::test]
+#[ignore]
 async fn test_sort_1k_mem() {
     run_sort(1024, vec![(5, false), (2000, true), (1000000, true)]).await
 }
 
 #[tokio::test]
+#[ignore]
 async fn test_sort_100k_mem() {
     run_sort(102400, vec![(5, false), (2000, false), (1000000, true)]).await
 }