// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

//! A comparable row-oriented representation of a collection of [`Array`]

use std::cmp::Ordering;
use std::hash::{Hash, Hasher};
use std::sync::Arc;

use arrow_array::cast::*;
use arrow_array::*;

use crate::compute::SortOptions;
use crate::datatypes::*;
use crate::error::{ArrowError, Result};
use crate::row::dictionary::{
    compute_dictionary_mapping, decode_dictionary, encode_dictionary,
};
use crate::row::fixed::{
    decode_bool, decode_decimal, decode_primitive, RawDecimal, RawDecimal128,
    RawDecimal256,
};
use crate::row::interner::OrderPreservingInterner;
use crate::row::variable::{decode_binary, decode_string};
use crate::{downcast_dictionary_array, downcast_primitive_array};

mod dictionary;
mod fixed;
mod interner;
mod variable;

/// Converts [`ArrayRef`] columns into a row-oriented format that is [normalized for sorting].
///
/// In particular, a byte-wise comparison of the rows, e.g. [`memcmp`], is sufficient
/// to establish the ordering of two rows, allowing for extremely fast comparisons,
/// and permitting the use of [non-comparison sorts] such as [radix sort]
///
/// Comparing [`Rows`] generated by different [`RowConverter`] is not guaranteed to
/// yield a meaningful ordering
///
/// # Format
///
/// The encoding of the row format should not be considered stable, but is documented here
/// for reference.
///
/// ## Unsigned Integer Encoding
///
/// A null integer is encoded as a `0_u8`, followed by a zero-ed number of bytes corresponding
/// to the integer's length
///
/// A valid integer is encoded as `1_u8`, followed by the big-endian representation of the
/// integer
///
/// ## Signed Integer Encoding
///
/// Signed integers have their most significant sign bit flipped, and are then encoded in the
/// same manner as an unsigned integer
///
/// ## Float Encoding
///
/// Floats are converted from IEEE 754 representation to a signed integer representation
/// by flipping all bar the sign bit if they are negative.
///
/// They are then encoded in the same manner as a signed integer
///
/// ## Variable Length Bytes Encoding
///
/// A null is encoded as a `0_u8`
///
/// An empty byte array is encoded as `1_u8`
///
/// A non-null, non-empty byte array is encoded as `2_u8` followed by the byte array
/// encoded using a block based scheme described below.
///
/// The byte array is broken up into 32-byte blocks, each block is written in turn
/// to the output, followed by `0xFF_u8`. The final block is padded to 32-bytes
/// with `0_u8` and written to the output, followed by the un-padded length in bytes
/// of this final block as a `u8`
///
/// This is loosely inspired by [COBS] encoding, and chosen over more traditional
/// [byte stuffing] as it is more amenable to vectorisation, in particular AVX-256.
///
/// ## Dictionary Encoding
///
/// [`RowConverter`] needs to support converting dictionary encoded arrays with unsorted, and
/// potentially distinct dictionaries. One simple mechanism to avoid this would be to reverse
/// the dictionary encoding, and encode the array values directly, however, this would lose
/// the benefits of dictionary encoding to reduce memory and CPU consumption.
///
/// As such the [`RowConverter`] maintains an order-preserving dictionary encoding for each
/// dictionary encoded column. As this is a variable-length encoding, new dictionary values
/// can be added whilst preserving the sort order.
///
/// A null dictionary value is encoded as `0_u8`.
///
/// A non-null dictionary value is encoded as `1_u8` followed by a null-terminated byte array
/// key determined by the order-preserving dictionary encoding
///
/// # Ordering
///
/// ## Float Ordering
///
/// Floats are totally ordered in accordance with the `totalOrder` predicate as defined
/// in the IEEE 754 (2008 revision) floating point standard.
///
/// The ordering established by this does not always agree with the
/// [`PartialOrd`] and [`PartialEq`] implementations of `f32`. For example,
/// they consider negative and positive zero equal, while this does not
///
/// ## Null Ordering
///
/// The encoding described above will order nulls first, this can be inverted by representing
/// nulls as `0xFF_u8` instead of `0_u8`
///
/// ## Reverse Column Ordering
///
/// The order of a given column can be reversed by negating the encoded bytes of non-null values
///
/// ## Reconstruction
///
/// The columnar data can be reconstructed from the row format with
/// [`RowConverter::convert_rows`], provided the rows were produced by the same
/// [`RowConverter`]. The row format can also be used to obtain a sorted list of row
/// indices, which can then be used with [`take`](crate::compute::take) to obtain a
/// sorted [`Array`]
///
/// [non-comparison sorts]: https://en.wikipedia.org/wiki/Sorting_algorithm#Non-comparison_sorts
/// [radix sort]: https://en.wikipedia.org/wiki/Radix_sort
/// [normalized for sorting]: https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.83.1080&rep=rep1&type=pdf
/// [`memcmp`]: https://www.man7.org/linux/man-pages/man3/memcmp.3.html
/// [COBS]: https://en.wikipedia.org/wiki/Consistent_Overhead_Byte_Stuffing
/// [byte stuffing]: https://en.wikipedia.org/wiki/High-Level_Data_Link_Control#Asynchronous_framing
#[derive(Debug)]
pub struct RowConverter {
    /// The data type and sort options of every column, in column order
    ///
    /// A clone of this `Arc` is attached to every [`Rows`] produced by this converter,
    /// and pointer equality on it is used to check that rows being decoded originated here
    fields: Arc<[SortField]>,
    /// Interning state for column `i`, if column `i` is a dictionary
    ///
    /// Entries start as `None` and are populated the first time a dictionary array
    /// is converted for the corresponding column
    interners: Vec<Option<Box<OrderPreservingInterner>>>,
}

/// Configure the data type and sort order for a given column
///
/// One [`SortField`] is provided per column when constructing a [`RowConverter`]
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SortField {
    /// Sort options: controls where nulls are placed and how the column is ordered
    options: SortOptions,
    /// Data type the column must have when passed to the [`RowConverter`]
    data_type: DataType,
}

impl SortField {
    /// Create a new column with the given data type
    pub fn new(data_type: DataType) -> Self {
        Self::new_with_options(data_type, Default::default())
    }

    /// Create a new column with the given data type and [`SortOptions`]
    pub fn new_with_options(data_type: DataType, options: SortOptions) -> Self {
        Self { options, data_type }
    }
}

impl RowConverter {
    /// Create a new [`RowConverter`] with the provided schema
    ///
    /// No dictionary interning state is allocated up front; the interner for a column
    /// is created the first time a dictionary array is converted for it
    pub fn new(fields: Vec<SortField>) -> Self {
        let interners = (0..fields.len()).map(|_| None).collect();
        Self {
            fields: fields.into(),
            interners,
        }
    }

    /// Convert [`ArrayRef`] columns into [`Rows`]
    ///
    /// See [`Row`] for information on when [`Row`] can be compared
    ///
    /// # Errors
    ///
    /// Returns an error if:
    ///
    /// - the number of `columns` differs from the number of fields provided
    ///   to [`RowConverter::new`]
    /// - the `columns` do not all have the same length
    /// - the data type of a column does not match the corresponding field
    /// - a column has a data type that the row format does not support
    ///
    /// Errors raised while computing the dictionary mapping are propagated unchanged
    ///
    /// # Panics
    ///
    /// Panics if the total encoded size of the rows overflows `usize`
    pub fn convert_columns(&mut self, columns: &[ArrayRef]) -> Result<Rows> {
        if columns.len() != self.fields.len() {
            return Err(ArrowError::InvalidArgumentError(format!(
                "Incorrect number of arrays provided to RowConverter, expected {} got {}",
                self.fields.len(),
                columns.len()
            )));
        }

        // The row count is derived from the first column, and every encoder walks the
        // row offsets in lock-step with its column. A column of a different length
        // would therefore be silently truncated or leave the offsets inconsistent,
        // so reject it up front
        let num_rows = columns.first().map_or(0, |column| column.len());
        if let Some((idx, column)) = columns
            .iter()
            .enumerate()
            .find(|(_, column)| column.len() != num_rows)
        {
            return Err(ArrowError::InvalidArgumentError(format!(
                "RowConverter columns must all have the same length, column 0 has length {} but column {} has length {}",
                num_rows,
                idx,
                column.len()
            )));
        }

        // For every dictionary column, map each dictionary value to its normalized,
        // order-preserving key. Non-dictionary columns map to `None`
        let dictionaries = columns
            .iter()
            .zip(&mut self.interners)
            .zip(self.fields.iter())
            .map(|((column, interner), field)| {
                if !column.data_type().equals_datatype(&field.data_type) {
                    return Err(ArrowError::InvalidArgumentError(format!(
                        "RowConverter column schema mismatch, expected {} got {}",
                        field.data_type,
                        column.data_type()
                    )));
                }

                let values = downcast_dictionary_array! {
                    column => column.values(),
                    _ => return Ok(None)
                };

                // Lazily create the interner for this column on first use
                let interner = interner.get_or_insert_with(Default::default);

                let mapping: Vec<_> = compute_dictionary_mapping(interner, values)?
                    .into_iter()
                    .map(|maybe_interned| {
                        maybe_interned.map(|interned| interner.normalized_key(interned))
                    })
                    .collect();

                Ok(Some(mapping))
            })
            .collect::<Result<Vec<_>>>()?;

        let mut rows = new_empty_rows(columns, &dictionaries, Arc::clone(&self.fields))?;

        for ((column, field), dictionary) in
            columns.iter().zip(self.fields.iter()).zip(dictionaries)
        {
            // We encode a column at a time to minimise dispatch overheads
            encode_column(&mut rows, column, field.options, dictionary.as_deref())
        }

        // Sanity check that every row was written in full: the final offset must
        // land exactly on the end of the buffer
        if cfg!(debug_assertions) {
            assert_eq!(*rows.offsets.last().unwrap(), rows.buffer.len());
            rows.offsets
                .windows(2)
                .for_each(|w| assert!(w[0] <= w[1], "offsets should be monotonic"));
        }

        Ok(rows)
    }

    /// Convert an iterator of [`Row`] back into [`ArrayRef`] columns
    ///
    /// One array is returned per field, in the order the fields were provided
    /// to [`RowConverter::new`]
    ///
    /// # Errors
    ///
    /// Returns an error if a field has a data type that cannot be decoded
    ///
    /// # Panics
    ///
    /// Panics if the rows were not produced by this [`RowConverter`]
    pub fn convert_rows<'a, I>(&self, rows: I) -> Result<Vec<ArrayRef>>
    where
        I: IntoIterator<Item = Row<'a>>,
    {
        // Collect the raw bytes of every row, verifying their provenance as we go
        let mut rows: Vec<_> = rows
            .into_iter()
            .map(|row| {
                assert!(
                    Arc::ptr_eq(row.fields, &self.fields),
                    "rows were not produced by this RowConverter"
                );

                row.data
            })
            .collect();

        self.fields
            .iter()
            .zip(&self.interners)
            .map(|(field, interner)| {
                // SAFETY
                // We have validated that the rows came from this [`RowConverter`]
                // and therefore must be valid
                unsafe { decode_column(field, &mut rows, interner.as_deref()) }
            })
            .collect()
    }
}

/// A row-oriented representation of arrow data, that is normalized for comparison
///
/// Individual rows are accessed with [`Rows::row`] or by iterating over `&Rows`
///
/// See [`RowConverter`]
#[derive(Debug)]
pub struct Rows {
    /// Underlying row bytes, with the rows stored back to back
    buffer: Box<[u8]>,
    /// Row `i` has data `&buffer[offsets[i]..offsets[i+1]]`
    ///
    /// This holds one more entry than there are rows
    offsets: Box<[usize]>,
    /// The schema for these rows
    fields: Arc<[SortField]>,
}

impl Rows {
    /// Returns the [`Row`] at index `row`
    ///
    /// # Panics
    ///
    /// Panics if `row` is not less than [`Rows::num_rows`]
    pub fn row(&self, row: usize) -> Row<'_> {
        // Row `i` spans `offsets[i]..offsets[i + 1]` of the buffer
        let (end, start) = (self.offsets[row + 1], self.offsets[row]);
        let data = &self.buffer[start..end];
        Row {
            data,
            fields: &self.fields,
        }
    }

    /// Returns the number of rows
    pub fn num_rows(&self) -> usize {
        // `offsets` stores one boundary more than there are rows
        let boundaries = self.offsets.len();
        boundaries - 1
    }
}

impl<'a> IntoIterator for &'a Rows {
    type Item = Row<'a>;
    type IntoIter = RowsIter<'a>;

    /// Returns an iterator over every [`Row`], from the first row to the last
    fn into_iter(self) -> Self::IntoIter {
        let end = self.num_rows();
        RowsIter {
            rows: self,
            start: 0,
            end,
        }
    }
}

/// An iterator over [`Rows`]
///
/// Yields the rows with index in the half-open range `start..end`
#[derive(Debug)]
pub struct RowsIter<'a> {
    /// The rows being iterated
    rows: &'a Rows,
    /// Index of the next row to yield from the front
    start: usize,
    /// One past the index of the last row not yet yielded
    end: usize,
}

impl<'a> Iterator for RowsIter<'a> {
    type Item = Row<'a>;

    /// Yields the next row from the front, or `None` once the range is exhausted
    fn next(&mut self) -> Option<Self::Item> {
        let idx = self.start;
        if idx == self.end {
            return None;
        }
        let row = self.rows.row(idx);
        self.start = idx + 1;
        Some(row)
    }

    /// The number of remaining rows is known exactly
    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining = ExactSizeIterator::len(self);
        (remaining, Some(remaining))
    }
}

impl<'a> ExactSizeIterator for RowsIter<'a> {
    /// Returns the number of rows in the remaining `start..end` range
    fn len(&self) -> usize {
        self.end - self.start
    }
}

impl<'a> DoubleEndedIterator for RowsIter<'a> {
    /// Yields the next row from the back, or `None` once the range is exhausted
    fn next_back(&mut self) -> Option<Self::Item> {
        if self.end == self.start {
            return None;
        }
        // `end` is exclusive, so the last row not yet yielded lives at `end - 1`.
        // Step back first, then read; reading `row(end)` would index one row past
        // the remaining range
        self.end -= 1;
        Some(self.rows.row(self.end))
    }
}

/// A comparable representation of a row
///
/// Two [`Row`] can be compared if they both belong to [`Rows`] returned by calls to
/// [`RowConverter::convert_columns`] on the same [`RowConverter`]
///
/// Otherwise any ordering established by comparing the [`Row`] is arbitrary
#[derive(Debug, Copy, Clone)]
pub struct Row<'a> {
    /// The encoded bytes of this row; equality, ordering and hashing use only these
    data: &'a [u8],
    /// The schema of the [`Rows`] this row was borrowed from
    fields: &'a Arc<[SortField]>,
}

// These are implemented manually, rather than derived, so that `fields` is ignored

impl<'a> PartialEq for Row<'a> {
    /// Two rows are equal when their encoded bytes are identical
    #[inline]
    fn eq(&self, other: &Self) -> bool {
        self.data == other.data
    }
}

// Byte-wise equality of `data` is a full equivalence relation
impl<'a> Eq for Row<'a> {}

impl<'a> PartialOrd for Row<'a> {
    /// Rows are totally ordered, so this always returns `Some`
    #[inline]
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl<'a> Ord for Row<'a> {
    /// Orders rows by a lexicographic comparison of their encoded bytes
    #[inline]
    fn cmp(&self, other: &Self) -> Ordering {
        Ord::cmp(self.data, other.data)
    }
}

impl<'a> Hash for Row<'a> {
    /// Hashes only the encoded bytes, consistent with the [`PartialEq`] implementation
    #[inline]
    fn hash<H: Hasher>(&self, state: &mut H) {
        Hash::hash(self.data, state)
    }
}

impl<'a> AsRef<[u8]> for Row<'a> {
    /// Returns the encoded bytes of this row
    #[inline]
    fn as_ref(&self) -> &[u8] {
        self.data
    }
}

/// Returns the byte used to mark a null value
///
/// This is `0` when `options.nulls_first` is set, and `0xFF` otherwise
#[inline]
fn null_sentinel(options: SortOptions) -> u8 {
    if options.nulls_first {
        0
    } else {
        0xFF
    }
}

/// Sizes and allocates a zero-filled [`Rows`] able to hold the encoding of `cols`
///
/// The encoded length of every row is accumulated a column at a time. `dictionaries`
/// must hold, for each dictionary column, the normalized key of every dictionary value.
///
/// The returned offsets are deliberately shifted down by one row so that they can
/// double as write cursors while the columns are encoded, see the comment below.
///
/// Returns an error if a column has a data type the row format does not support
fn new_empty_rows(
    cols: &[ArrayRef],
    dictionaries: &[Option<Vec<Option<&[u8]>>>],
    fields: Arc<[SortField]>,
) -> Result<Rows> {
    use fixed::FixedLengthEncoding;

    // The row count is taken from the first column
    let num_rows = cols.first().map_or(0, |col| col.len());
    // Encoded length in bytes of each row, summed across the columns seen so far
    let mut lengths = vec![0_usize; num_rows];

    for (array, dict) in cols.iter().zip(dictionaries) {
        downcast_primitive_array! {
            array => lengths.iter_mut().for_each(|len| *len += fixed::encoded_len(array)),
            DataType::Null => {},
            DataType::Boolean => lengths.iter_mut().for_each(|len| *len += bool::ENCODED_LEN),
            DataType::Decimal128(_, _) => lengths.iter_mut().for_each(|len| *len += RawDecimal128::ENCODED_LEN),
            DataType::Decimal256(_, _) => lengths.iter_mut().for_each(|len| *len += RawDecimal256::ENCODED_LEN),
            DataType::Binary => lengths
                .iter_mut()
                .zip(as_generic_binary_array::<i32>(array).iter())
                .for_each(|(len, bytes)| *len += variable::encoded_len(bytes)),
            DataType::LargeBinary => lengths
                .iter_mut()
                .zip(as_generic_binary_array::<i64>(array).iter())
                .for_each(|(len, bytes)| *len += variable::encoded_len(bytes)),
            DataType::Utf8 => lengths
                .iter_mut()
                .zip(as_string_array(array).iter())
                .for_each(|(len, text)| {
                    *len += variable::encoded_len(text.map(|s| s.as_bytes()))
                }),
            DataType::LargeUtf8 => lengths
                .iter_mut()
                .zip(as_largestring_array(array).iter())
                .for_each(|(len, text)| {
                    *len += variable::encoded_len(text.map(|s| s.as_bytes()))
                }),
            DataType::Dictionary(_, _) => downcast_dictionary_array! {
                array => {
                    let mapping = dict.as_ref().unwrap();
                    for (len, key) in lengths.iter_mut().zip(array.keys().iter()) {
                        // One leading byte in every case, followed by the normalized
                        // key when the value is non-null
                        *len += match key.and_then(|k| mapping[k as usize]) {
                            Some(normalized) => normalized.len() + 1,
                            None => 1,
                        };
                    }
                }
                _ => unreachable!(),
            }
            t => return Err(ArrowError::NotYetImplemented(format!("not yet implemented: {}", t)))
        }
    }

    // The offsets start out shifted down by one row index, and are incremented
    // as each row is written until they line up with the final row boundaries.
    //
    // Take 3 rows of length 3, 4, and 6 respectively.
    // The offsets are initialized to `0, 0, 3, 7`
    //
    // Once the first row is fully written they are `0, 3, 3, 7`
    // After the second, `0, 3, 7, 7`
    // After the third, `0, 3, 7, 13`
    //
    // which are exactly the offsets needed to read the rows back.
    //
    // The offsets therefore act as write cursors during encoding, and as the
    // row boundaries afterwards
    let mut offsets = Vec::with_capacity(num_rows + 1);
    offsets.push(0);

    let mut total_len = 0_usize;
    offsets.extend(lengths.into_iter().map(|row_len| {
        let row_start = total_len;
        total_len = total_len.checked_add(row_len).expect("overflow");
        row_start
    }));

    Ok(Rows {
        buffer: vec![0_u8; total_len].into(),
        offsets: offsets.into(),
        fields,
    })
}

/// Writes the encoding of every value of `column` into `out`, incrementing the
/// offsets of `out` as it progresses
///
/// `dictionary` must be provided for dictionary columns, and holds the normalized
/// key of every dictionary value
///
/// # Panics
///
/// Panics if `column` has an unsupported data type, or if `column` is a dictionary
/// and `dictionary` is `None`
fn encode_column(
    out: &mut Rows,
    column: &ArrayRef,
    opts: SortOptions,
    dictionary: Option<&[Option<&[u8]>]>,
) {
    downcast_primitive_array! {
        column => fixed::encode(out, column, opts),
        DataType::Null => {}
        DataType::Boolean => fixed::encode(out, as_boolean_array(column), opts),
        DataType::Decimal128(_, _) => {
            let decimals = column.as_any().downcast_ref::<Decimal128Array>().unwrap();
            let raw = decimals
                .into_iter()
                .map(|value| value.map(|d| RawDecimal(d.to_le_bytes())));

            fixed::encode(out, raw, opts)
        },
        DataType::Decimal256(_, _) => {
            let decimals = column.as_any().downcast_ref::<Decimal256Array>().unwrap();
            let raw = decimals
                .into_iter()
                .map(|value| value.map(|d| RawDecimal(d.to_le_bytes())));

            fixed::encode(out, raw, opts)
        },
        DataType::Binary => {
            let values = as_generic_binary_array::<i32>(column);
            variable::encode(out, values.iter(), opts)
        }
        DataType::LargeBinary => {
            let values = as_generic_binary_array::<i64>(column);
            variable::encode(out, values.iter(), opts)
        }
        DataType::Utf8 => {
            let values = as_string_array(column);
            variable::encode(
                out,
                values.iter().map(|text| text.map(|s| s.as_bytes())),
                opts,
            )
        }
        DataType::LargeUtf8 => {
            let values = as_largestring_array(column);
            variable::encode(
                out,
                values.iter().map(|text| text.map(|s| s.as_bytes())),
                opts,
            )
        }
        DataType::Dictionary(_, _) => downcast_dictionary_array! {
            column => encode_dictionary(out, column, dictionary.unwrap(), opts),
            _ => unreachable!()
        }
        t => unimplemented!("not yet implemented: {}", t)
    }
}

/// Decodes the provided `field` from `rows`
///
/// Dispatches on the data type of `field` to the matching decoder and returns the
/// decoded values as an [`ArrayRef`] with one element per entry of `rows`.
///
/// `interner` is only used for dictionary fields, where it must be `Some`.
///
/// NOTE(review): `rows` is passed mutably to every decoder; presumably each decoder
/// advances the slices past the bytes it consumed, which would require fields to be
/// decoded in schema order — confirm against the `decode_*` helpers.
///
/// # Errors
///
/// Returns an error if the data type of `field` cannot be decoded, or if a
/// dictionary field has a key type that is not an integer type
///
/// # Panics
///
/// Panics if `field` is a dictionary and `interner` is `None`
///
/// # Safety
///
/// Rows must contain valid data for the provided field
unsafe fn decode_column(
    field: &SortField,
    rows: &mut [&[u8]],
    interner: Option<&OrderPreservingInterner>,
) -> Result<ArrayRef> {
    let options = field.options;
    let array: ArrayRef = match &field.data_type {
        DataType::Null => Arc::new(NullArray::new(rows.len())),
        DataType::Boolean => Arc::new(decode_bool(rows, options)),
        // Fixed-width primitive types
        DataType::Int8 => Arc::new(decode_primitive::<Int8Type>(rows, options)),
        DataType::Int16 => Arc::new(decode_primitive::<Int16Type>(rows, options)),
        DataType::Int32 => Arc::new(decode_primitive::<Int32Type>(rows, options)),
        DataType::Int64 => Arc::new(decode_primitive::<Int64Type>(rows, options)),
        DataType::UInt8 => Arc::new(decode_primitive::<UInt8Type>(rows, options)),
        DataType::UInt16 => Arc::new(decode_primitive::<UInt16Type>(rows, options)),
        DataType::UInt32 => Arc::new(decode_primitive::<UInt32Type>(rows, options)),
        DataType::UInt64 => Arc::new(decode_primitive::<UInt64Type>(rows, options)),
        DataType::Float16 => Arc::new(decode_primitive::<Float16Type>(rows, options)),
        DataType::Float32 => Arc::new(decode_primitive::<Float32Type>(rows, options)),
        DataType::Float64 => Arc::new(decode_primitive::<Float64Type>(rows, options)),
        // Temporal types
        DataType::Timestamp(TimeUnit::Second, _) => {
            Arc::new(decode_primitive::<TimestampSecondType>(rows, options))
        }
        DataType::Timestamp(TimeUnit::Millisecond, _) => {
            Arc::new(decode_primitive::<TimestampMillisecondType>(rows, options))
        }
        DataType::Timestamp(TimeUnit::Microsecond, _) => {
            Arc::new(decode_primitive::<TimestampMicrosecondType>(rows, options))
        }
        DataType::Timestamp(TimeUnit::Nanosecond, _) => {
            Arc::new(decode_primitive::<TimestampNanosecondType>(rows, options))
        }
        DataType::Date32 => Arc::new(decode_primitive::<Date32Type>(rows, options)),
        DataType::Date64 => Arc::new(decode_primitive::<Date64Type>(rows, options)),
        // Only second and millisecond units are handled for `Time32`
        DataType::Time32(t) => match t {
            TimeUnit::Second => {
                Arc::new(decode_primitive::<Time32SecondType>(rows, options))
            }
            TimeUnit::Millisecond => {
                Arc::new(decode_primitive::<Time32MillisecondType>(rows, options))
            }
            _ => unreachable!(),
        },
        // Only microsecond and nanosecond units are handled for `Time64`
        DataType::Time64(t) => match t {
            TimeUnit::Microsecond => {
                Arc::new(decode_primitive::<Time64MicrosecondType>(rows, options))
            }
            TimeUnit::Nanosecond => {
                Arc::new(decode_primitive::<Time64NanosecondType>(rows, options))
            }
            _ => unreachable!(),
        },
        DataType::Duration(TimeUnit::Second) => {
            Arc::new(decode_primitive::<DurationSecondType>(rows, options))
        }
        DataType::Duration(TimeUnit::Millisecond) => {
            Arc::new(decode_primitive::<DurationMillisecondType>(rows, options))
        }
        DataType::Duration(TimeUnit::Microsecond) => {
            Arc::new(decode_primitive::<DurationMicrosecondType>(rows, options))
        }
        DataType::Duration(TimeUnit::Nanosecond) => {
            Arc::new(decode_primitive::<DurationNanosecondType>(rows, options))
        }
        DataType::Interval(IntervalUnit::DayTime) => {
            Arc::new(decode_primitive::<IntervalDayTimeType>(rows, options))
        }
        DataType::Interval(IntervalUnit::MonthDayNano) => {
            Arc::new(decode_primitive::<IntervalMonthDayNanoType>(rows, options))
        }
        DataType::Interval(IntervalUnit::YearMonth) => {
            Arc::new(decode_primitive::<IntervalYearMonthType>(rows, options))
        }
        // Variable-length types, the type parameter is the offset size of the array
        DataType::Binary => Arc::new(decode_binary::<i32>(rows, options)),
        DataType::LargeBinary => Arc::new(decode_binary::<i64>(rows, options)),
        DataType::Utf8 => Arc::new(decode_string::<i32>(rows, options)),
        DataType::LargeUtf8 => Arc::new(decode_string::<i64>(rows, options)),
        // Decimals, the const parameter is the width of the value in bytes
        DataType::Decimal128(p, s) => {
            Arc::new(decode_decimal::<16, Decimal128Type>(rows, options, *p, *s))
        }
        DataType::Decimal256(p, s) => {
            Arc::new(decode_decimal::<32, Decimal256Type>(rows, options, *p, *s))
        }
        // Dictionaries dispatch on the key type `k`, with `v` the value type
        DataType::Dictionary(k, v) => match k.as_ref() {
            DataType::Int8 => Arc::new(decode_dictionary::<Int8Type>(
                interner.unwrap(),
                v.as_ref(),
                options,
                rows,
            )?),
            DataType::Int16 => Arc::new(decode_dictionary::<Int16Type>(
                interner.unwrap(),
                v.as_ref(),
                options,
                rows,
            )?),
            DataType::Int32 => Arc::new(decode_dictionary::<Int32Type>(
                interner.unwrap(),
                v.as_ref(),
                options,
                rows,
            )?),
            DataType::Int64 => Arc::new(decode_dictionary::<Int64Type>(
                interner.unwrap(),
                v.as_ref(),
                options,
                rows,
            )?),
            DataType::UInt8 => Arc::new(decode_dictionary::<UInt8Type>(
                interner.unwrap(),
                v.as_ref(),
                options,
                rows,
            )?),
            DataType::UInt16 => Arc::new(decode_dictionary::<UInt16Type>(
                interner.unwrap(),
                v.as_ref(),
                options,
                rows,
            )?),
            DataType::UInt32 => Arc::new(decode_dictionary::<UInt32Type>(
                interner.unwrap(),
                v.as_ref(),
                options,
                rows,
            )?),
            DataType::UInt64 => Arc::new(decode_dictionary::<UInt64Type>(
                interner.unwrap(),
                v.as_ref(),
                options,
                rows,
            )?),
            _ => {
                return Err(ArrowError::InvalidArgumentError(format!(
                    "{} is not a valid dictionary key type",
                    field.data_type
                )));
            }
        },
        // Types that cannot be decoded
        DataType::FixedSizeBinary(_)
        | DataType::List(_)
        | DataType::FixedSizeList(_, _)
        | DataType::LargeList(_)
        | DataType::Struct(_)
        | DataType::Union(_, _, _)
        | DataType::Map(_, _) => {
            return Err(ArrowError::NotYetImplemented(format!(
                "converting {} row is not supported",
                field.data_type
            )))
        }
    };
    Ok(array)
}

#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use rand::distributions::uniform::SampleUniform;
    use rand::distributions::{Distribution, Standard};
    use rand::{thread_rng, Rng};

    use arrow_array::NullArray;

    use crate::array::{
        BinaryArray, BooleanArray, DictionaryArray, Float32Array, GenericStringArray,
        Int16Array, Int32Array, OffsetSizeTrait, PrimitiveArray,
        PrimitiveDictionaryBuilder, StringArray,
    };
    use crate::compute::{LexicographicalComparator, SortColumn};
    use crate::util::display::array_value_to_string;

    use super::*;

    /// Checks the exact bytes, the ordering and the round trip of an
    /// `Int16` + `Float32` schema with default sort options
    #[test]
    fn test_fixed_width() {
        let cols = [
            Arc::new(Int16Array::from_iter([
                Some(1),
                Some(2),
                None,
                Some(-5),
                Some(2),
                Some(2),
                Some(0),
            ])) as ArrayRef,
            Arc::new(Float32Array::from_iter([
                Some(1.3),
                Some(2.5),
                None,
                Some(4.),
                Some(0.1),
                Some(-4.),
                Some(-0.),
            ])) as ArrayRef,
        ];

        let mut converter = RowConverter::new(vec![
            SortField::new(DataType::Int16),
            SortField::new(DataType::Float32),
        ]);
        let rows = converter.convert_columns(&cols).unwrap();

        // Every row is 8 bytes: 1 + 2 for the `Int16` and 1 + 4 for the `Float32`
        assert_eq!(rows.offsets.as_ref(), &[0, 8, 16, 24, 32, 40, 48, 56]);
        // Each pair of lines below is one row: the `Int16` value then the `Float32` value
        assert_eq!(
            rows.buffer.as_ref(),
            &[
                1, 128, 1, //
                1, 191, 166, 102, 102, //
                1, 128, 2, //
                1, 192, 32, 0, 0, //
                0, 0, 0, //
                0, 0, 0, 0, 0, //
                1, 127, 251, //
                1, 192, 128, 0, 0, //
                1, 128, 2, //
                1, 189, 204, 204, 205, //
                1, 128, 2, //
                1, 63, 127, 255, 255, //
                1, 128, 0, //
                1, 127, 255, 255, 255 //
            ]
        );

        // Rows compare first on the `Int16` column, then on the `Float32` column
        assert!(rows.row(3) < rows.row(6));
        assert!(rows.row(0) < rows.row(1));
        assert!(rows.row(3) < rows.row(0));
        assert!(rows.row(4) < rows.row(1));
        assert!(rows.row(5) < rows.row(4));

        // Decoding the rows must reproduce the original columns
        let back = converter.convert_rows(&rows).unwrap();
        for (expected, actual) in cols.iter().zip(&back) {
            assert_eq!(expected, actual);
        }
    }

    /// Checks the ordering and round trip of a boolean column, first with the
    /// default sort options and then descending with nulls last
    #[test]
    fn test_bool() {
        let mut converter = RowConverter::new(vec![SortField::new(DataType::Boolean)]);

        let col = Arc::new(BooleanArray::from_iter([None, Some(false), Some(true)]))
            as ArrayRef;

        // Default options: null < false < true
        let rows = converter.convert_columns(&[Arc::clone(&col)]).unwrap();
        assert!(rows.row(2) > rows.row(1));
        assert!(rows.row(2) > rows.row(0));
        assert!(rows.row(1) > rows.row(0));

        let cols = converter.convert_rows(&rows).unwrap();
        assert_eq!(&cols[0], &col);

        let mut converter = RowConverter::new(vec![SortField::new_with_options(
            DataType::Boolean,
            SortOptions {
                descending: true,
                nulls_first: false,
            },
        )]);

        // Descending with nulls last: true < false < null
        let rows = converter.convert_columns(&[Arc::clone(&col)]).unwrap();
        assert!(rows.row(2) < rows.row(1));
        assert!(rows.row(2) < rows.row(0));
        assert!(rows.row(1) < rows.row(0));
        let cols = converter.convert_rows(&rows).unwrap();
        assert_eq!(&cols[0], &col);
    }

    /// Checks that a `Null` column contributes no bytes to a row while still
    /// producing one row per element
    #[test]
    fn test_null_encoding() {
        let col = Arc::new(NullArray::new(10));
        let mut converter = RowConverter::new(vec![SortField::new(DataType::Null)]);
        let rows = converter.convert_columns(&[col]).unwrap();
        assert_eq!(rows.num_rows(), 10);
        assert_eq!(rows.row(1).data.len(), 0);
    }

    /// Checks the ordering and round trip of string and binary columns,
    /// including values on either side of the block size boundary
    #[test]
    fn test_variable_width() {
        let col = Arc::new(StringArray::from_iter([
            Some("hello"),
            Some("he"),
            None,
            Some("foo"),
            Some(""),
        ])) as ArrayRef;

        let mut converter = RowConverter::new(vec![SortField::new(DataType::Utf8)]);
        let rows = converter.convert_columns(&[Arc::clone(&col)]).unwrap();

        // "he" < "hello", null < "", "foo" < "hello", "foo" < "he"
        assert!(rows.row(1) < rows.row(0));
        assert!(rows.row(2) < rows.row(4));
        assert!(rows.row(3) < rows.row(0));
        assert!(rows.row(3) < rows.row(1));

        let cols = converter.convert_rows(&rows).unwrap();
        assert_eq!(&cols[0], &col);

        // The values below are listed in ascending order, with lengths that are
        // shorter than, equal to, and longer than a single block
        let col = Arc::new(BinaryArray::from_iter([
            None,
            Some(vec![0_u8; 0]),
            Some(vec![0_u8; 6]),
            Some(vec![0_u8; variable::BLOCK_SIZE]),
            Some(vec![0_u8; variable::BLOCK_SIZE + 1]),
            Some(vec![1_u8; 6]),
            Some(vec![1_u8; variable::BLOCK_SIZE]),
            Some(vec![1_u8; variable::BLOCK_SIZE + 1]),
            Some(vec![0xFF_u8; 6]),
            Some(vec![0xFF_u8; variable::BLOCK_SIZE]),
            Some(vec![0xFF_u8; variable::BLOCK_SIZE + 1]),
        ])) as ArrayRef;

        let mut converter = RowConverter::new(vec![SortField::new(DataType::Binary)]);
        let rows = converter.convert_columns(&[Arc::clone(&col)]).unwrap();

        // Every row must compare less than all the rows that follow it
        for i in 0..rows.num_rows() {
            for j in i + 1..rows.num_rows() {
                assert!(
                    rows.row(i) < rows.row(j),
                    "{} < {} - {:?} < {:?}",
                    i,
                    j,
                    rows.row(i),
                    rows.row(j)
                );
            }
        }

        let cols = converter.convert_rows(&rows).unwrap();
        assert_eq!(&cols[0], &col);

        let mut converter = RowConverter::new(vec![SortField::new_with_options(
            DataType::Binary,
            SortOptions {
                descending: true,
                nulls_first: false,
            },
        )]);
        let rows = converter.convert_columns(&[Arc::clone(&col)]).unwrap();

        // Descending with nulls last reverses every comparison
        for i in 0..rows.num_rows() {
            for j in i + 1..rows.num_rows() {
                assert!(
                    rows.row(i) > rows.row(j),
                    "{} > {} - {:?} > {:?}",
                    i,
                    j,
                    rows.row(i),
                    rows.row(j)
                );
            }
        }

        let cols = converter.convert_rows(&rows).unwrap();
        assert_eq!(&cols[0], &col);
    }

    /// Checks ordering and round-tripping of string dictionary columns.
    ///
    /// Two dictionary arrays are converted with the same [`RowConverter`], and rows
    /// for equal values are expected to be equal across both conversions. The first
    /// array is then converted by a second converter that sorts descending with
    /// nulls last, where every ordering assertion is expected to flip.
    #[test]
    fn test_string_dictionary() {
        let first: ArrayRef = Arc::new(DictionaryArray::<Int32Type>::from_iter([
            Some("foo"),
            Some("hello"),
            Some("he"),
            None,
            Some("hello"),
            Some(""),
            Some("hello"),
            Some("hello"),
        ]));
        let dict_type = first.data_type().clone();

        let mut ascending = RowConverter::new(vec![SortField::new(dict_type.clone())]);
        let first_rows = ascending.convert_columns(&[Arc::clone(&first)]).unwrap();

        // (smaller, larger) row indices: null < "", "he" < "hello",
        // "foo" < "hello" and null < "foo"
        for (lo, hi) in [(3, 5), (2, 1), (0, 1), (3, 0)] {
            assert!(
                first_rows.row(lo) < first_rows.row(hi),
                "row {} should sort before row {}",
                lo,
                hi
            );
        }

        // Every occurrence of "hello" must encode to the same row as index 1
        for repeat in [4, 6, 7] {
            assert_eq!(first_rows.row(1), first_rows.row(repeat));
        }

        let decoded = ascending.convert_rows(&first_rows).unwrap();
        assert_eq!(&decoded[0], &first);

        let second: ArrayRef = Arc::new(DictionaryArray::<Int32Type>::from_iter([
            Some("hello"),
            None,
            Some("cupcakes"),
        ]));

        // Rows from a second conversion are compared against rows from the first
        let second_rows = ascending.convert_columns(&[Arc::clone(&second)]).unwrap();
        assert_eq!(first_rows.row(1), second_rows.row(0));
        assert_eq!(first_rows.row(3), second_rows.row(1));
        assert!(second_rows.row(2) < first_rows.row(0));

        let decoded = ascending.convert_rows(&second_rows).unwrap();
        assert_eq!(&decoded[0], &second);

        let descending_nulls_last = SortOptions {
            descending: true,
            nulls_first: false,
        };
        let mut descending = RowConverter::new(vec![SortField::new_with_options(
            dict_type,
            descending_nulls_last,
        )]);

        // Same index pairs as the ascending case, with the comparison reversed
        let reversed_rows = descending.convert_columns(&[Arc::clone(&first)]).unwrap();
        for (hi, lo) in [(3, 5), (2, 1), (0, 1), (3, 0)] {
            assert!(
                reversed_rows.row(hi) > reversed_rows.row(lo),
                "row {} should sort after row {}",
                hi,
                lo
            );
        }

        let decoded = descending.convert_rows(&reversed_rows).unwrap();
        assert_eq!(&decoded[0], &first);
    }

    /// Checks the relative ordering of rows produced from a dictionary array with
    /// `Int32` keys and `Int32` values, including one null entry.
    #[test]
    fn test_primitive_dictionary() {
        let mut builder = PrimitiveDictionaryBuilder::<Int32Type, Int32Type>::new();
        for entry in [Some(2), Some(3), Some(0), None, Some(5), Some(3), Some(-1)] {
            match entry {
                Some(value) => {
                    builder.append(value).unwrap();
                }
                None => {
                    builder.append_null();
                }
            }
        }
        let dictionary = builder.finish();

        let field = SortField::new(dictionary.data_type().clone());
        let mut converter = RowConverter::new(vec![field]);
        let rows = converter.convert_columns(&[Arc::new(dictionary)]).unwrap();

        // (smaller, larger) row indices: 2 < 3, 0 < 2, null < 0, -1 < 0, null < -1
        for (lo, hi) in [(0, 1), (2, 0), (3, 2), (6, 2), (3, 6)] {
            assert!(
                rows.row(lo) < rows.row(hi),
                "row {} should sort before row {}",
                lo,
                hi
            );
        }
    }

    /// Checks that a null key and keys pointing at null dictionary values all
    /// produce the same row.
    ///
    /// Keys 2 and 4 reference null entries of the values array, and the last key
    /// is itself null.
    #[test]
    fn test_dictionary_nulls() {
        let dict_type =
            DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Int32));

        let dict_values =
            Int32Array::from_iter([Some(1), Some(-1), None, Some(4), None]).into_data();
        let dict_keys =
            Int32Array::from_iter([Some(0), Some(0), Some(1), Some(2), Some(4), None])
                .into_data();

        // Re-type the key data as a dictionary and attach the values as its child
        let dict_data = dict_keys
            .into_builder()
            .data_type(dict_type.clone())
            .child_data(vec![dict_values])
            .build()
            .unwrap();

        let mut converter = RowConverter::new(vec![SortField::new(dict_type)]);
        let column: ArrayRef = Arc::new(DictionaryArray::<Int32Type>::from(dict_data));
        let rows = converter.convert_columns(&[column]).unwrap();

        // Rows 0 and 1 share key 0; rows 3 and 4 point at null values and row 5
        // has a null key
        for (a, b) in [(0, 1), (3, 4), (4, 5)] {
            assert_eq!(rows.row(a), rows.row(b), "rows {} and {} should match", a, b);
        }

        // A null row compares below the row holding the value 1
        assert!(rows.row(3) < rows.row(0));
    }

    /// Decoding rows with a converter other than the one that produced them is
    /// expected to panic, even when both converters were built from the same field.
    #[test]
    #[should_panic(expected = "rows were not produced by this RowConverter")]
    fn test_different_converter() {
        let column: ArrayRef = Arc::new(Int32Array::from_iter([Some(1), Some(-1)]));

        let mut producer = RowConverter::new(vec![SortField::new(DataType::Int32)]);
        let foreign_rows = producer.convert_columns(&[column]).unwrap();

        // Identical field definition, but a distinct converter instance
        let other = RowConverter::new(vec![SortField::new(DataType::Int32)]);
        let _ = other.convert_rows(&foreign_rows);
    }

    /// Generates a random [`PrimitiveArray`] containing `len` elements.
    ///
    /// `valid_percent` is passed to `gen_bool` as the probability that an element
    /// is non-null; non-null values are sampled from the [`Standard`] distribution.
    fn generate_primitive_array<K>(len: usize, valid_percent: f64) -> PrimitiveArray<K>
    where
        K: ArrowPrimitiveType,
        Standard: Distribution<K::Native>,
    {
        let mut rng = thread_rng();
        // Decide validity first so a value is only drawn for non-null slots
        let next_element = || -> Option<K::Native> {
            if rng.gen_bool(valid_percent) {
                Some(rng.gen())
            } else {
                None
            }
        };
        std::iter::repeat_with(next_element).take(len).collect()
    }

    /// Generates a random [`GenericStringArray`] containing `len` elements.
    ///
    /// `valid_percent` is passed to `gen_bool` as the probability that an element
    /// is non-null. Non-null strings hold between 0 and 99 bytes, each in `0..128`.
    fn generate_strings<O: OffsetSizeTrait>(
        len: usize,
        valid_percent: f64,
    ) -> GenericStringArray<O> {
        let mut rng = thread_rng();
        let mut strings = Vec::with_capacity(len);
        for _ in 0..len {
            let element = if rng.gen_bool(valid_percent) {
                let num_bytes = rng.gen_range(0..100);
                // Bytes below 128 are ASCII, so the UTF-8 conversion cannot fail
                let ascii = (0..num_bytes).map(|_| rng.gen_range(0..128)).collect();
                Some(String::from_utf8(ascii).unwrap())
            } else {
                None
            };
            strings.push(element);
        }
        strings.into_iter().collect()
    }

    /// Generates a random [`DictionaryArray`] of `len` keys over `values`.
    ///
    /// `valid_percent` is passed to `gen_bool` as the probability that a key is
    /// non-null; non-null keys are drawn with `gen_range` from `0..values.len()`.
    fn generate_dictionary<K>(
        values: ArrayRef,
        len: usize,
        valid_percent: f64,
    ) -> DictionaryArray<K>
    where
        K: ArrowDictionaryKeyType,
        K::Native: SampleUniform,
    {
        let dict_type = DataType::Dictionary(
            Box::new(K::DATA_TYPE),
            Box::new(values.data_type().clone()),
        );

        let mut rng = thread_rng();
        let first_key = K::Native::from_usize(0).unwrap();
        // Exclusive upper bound: one past the last index into `values`
        let end_key = K::Native::from_usize(values.len()).unwrap();

        let keys = (0..len)
            .map(|_| {
                if rng.gen_bool(valid_percent) {
                    Some(rng.gen_range(first_key..end_key))
                } else {
                    None
                }
            })
            .collect::<PrimitiveArray<K>>();

        // Re-type the key data as a dictionary and attach `values` as its child
        let builder = keys
            .into_data()
            .into_builder()
            .data_type(dict_type)
            .add_child_data(values.data().clone());

        DictionaryArray::from(builder.build().unwrap())
    }

    /// Generates a random column of `len` elements, choosing one of nine kinds:
    /// six primitive types, strings, a string dictionary or an `Int64` dictionary.
    ///
    /// The dictionary kinds draw their number of values from `1..len`, so `len`
    /// must be at least 2 (rand's `gen_range` rejects an empty range).
    fn generate_column(len: usize) -> ArrayRef {
        const VALID_PERCENT: f64 = 0.8;

        let mut rng = thread_rng();
        let kind = rng.gen_range(0..9);
        match kind {
            0 => Arc::new(generate_primitive_array::<Int32Type>(len, VALID_PERCENT)),
            1 => Arc::new(generate_primitive_array::<UInt32Type>(len, VALID_PERCENT)),
            2 => Arc::new(generate_primitive_array::<Int64Type>(len, VALID_PERCENT)),
            3 => Arc::new(generate_primitive_array::<UInt64Type>(len, VALID_PERCENT)),
            4 => Arc::new(generate_primitive_array::<Float32Type>(len, VALID_PERCENT)),
            5 => Arc::new(generate_primitive_array::<Float64Type>(len, VALID_PERCENT)),
            6 => Arc::new(generate_strings::<i32>(len, VALID_PERCENT)),
            7 => {
                // Cannot test dictionaries containing null values because of #2687
                let dict_values = generate_strings::<i32>(rng.gen_range(1..len), 1.0);
                Arc::new(generate_dictionary::<Int64Type>(
                    Arc::new(dict_values),
                    len,
                    VALID_PERCENT,
                ))
            }
            8 => {
                // Cannot test dictionaries containing null values because of #2687
                let dict_values =
                    generate_primitive_array::<Int64Type>(rng.gen_range(1..len), 1.0);
                Arc::new(generate_dictionary::<Int64Type>(
                    Arc::new(dict_values),
                    len,
                    VALID_PERCENT,
                ))
            }
            _ => unreachable!(),
        }
    }

    /// Formats the value at index `row` of every column in `cols` and joins the
    /// results with commas.
    fn print_row(cols: &[SortColumn], row: usize) -> String {
        let mut cells = Vec::with_capacity(cols.len());
        for col in cols {
            cells.push(array_value_to_string(&col.values, row).unwrap());
        }
        cells.join(",")
    }

    /// Formats the data type of every column in `cols` and joins the results with
    /// commas.
    fn print_col_types(cols: &[SortColumn]) -> String {
        cols.iter()
            .map(|col| col.values.data_type().to_string())
            .collect::<Vec<_>>()
            .join(",")
    }

    /// Fuzz test checking the row format against [`LexicographicalComparator`].
    ///
    /// Each of the 100 iterations generates 1 to 4 random columns of 5 to 99 rows
    /// with random sort options, then checks that:
    ///
    /// * comparing any two rows gives the same [`Ordering`] as the comparator
    /// * converting the rows back yields the original columns
    ///
    /// Ignored under miri.
    #[test]
    #[cfg_attr(miri, ignore)]
    fn fuzz_test() {
        for _ in 0..100 {
            let mut rng = thread_rng();
            let num_columns = rng.gen_range(1..5);
            let len = rng.gen_range(5..100);
            let arrays: Vec<_> = (0..num_columns).map(|_| generate_column(len)).collect();

            // Each column gets one set of random sort options, used both by the
            // reference comparator and by the row converter
            let mut sort_columns = Vec::with_capacity(arrays.len());
            let mut fields = Vec::with_capacity(arrays.len());
            for array in &arrays {
                let options = SortOptions {
                    descending: rng.gen_bool(0.5),
                    nulls_first: rng.gen_bool(0.5),
                };
                sort_columns.push(SortColumn {
                    values: Arc::clone(array),
                    options: Some(options),
                });
                fields.push(SortField::new_with_options(
                    array.data_type().clone(),
                    options,
                ));
            }

            let comparator = LexicographicalComparator::try_new(&sort_columns).unwrap();

            let mut converter = RowConverter::new(fields);
            let rows = converter.convert_columns(&arrays).unwrap();

            // Compare every ordered pair of rows, including each row with itself
            for i in 0..len {
                for j in 0..len {
                    let row_i = rows.row(i);
                    let row_j = rows.row(j);
                    assert_eq!(
                        row_i.cmp(&row_j),
                        comparator.compare(&i, &j),
                        "({:?} vs {:?}) vs ({:?} vs {:?}) for types {}",
                        print_row(&sort_columns, i),
                        print_row(&sort_columns, j),
                        row_i,
                        row_j,
                        print_col_types(&sort_columns)
                    );
                }
            }

            let back = converter.convert_rows(&rows).unwrap();
            // `zip` stops at the shorter side, so a missing column would otherwise
            // go unnoticed
            assert_eq!(back.len(), arrays.len());
            for (actual, expected) in back.iter().zip(&arrays) {
                assert_eq!(actual, expected);
            }
        }
    }
}
