Kotlin · Jolanrensen · Oct 31, 2024 · Oct 17, 2024 · Oct 17, 2024 · Oct 17, 2024
diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/DataColumn.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/DataColumn.kt
@@ -20,11 +20,14 @@ import org.jetbrains.kotlinx.dataframe.impl.columns.ColumnGroupImpl
 import org.jetbrains.kotlinx.dataframe.impl.columns.FrameColumnImpl
 import org.jetbrains.kotlinx.dataframe.impl.columns.ValueColumnImpl
 import org.jetbrains.kotlinx.dataframe.impl.columns.addPath
-import org.jetbrains.kotlinx.dataframe.impl.columns.guessColumnType
+import org.jetbrains.kotlinx.dataframe.impl.columns.createColumnGuessingType
 import org.jetbrains.kotlinx.dataframe.impl.columns.toColumnKind
 import org.jetbrains.kotlinx.dataframe.impl.getValuesType
 import org.jetbrains.kotlinx.dataframe.impl.splitByIndices
 import org.jetbrains.kotlinx.dataframe.schema.DataFrameSchema
+import org.jetbrains.kotlinx.dataframe.util.CREATE_FRAME_COLUMN
+import org.jetbrains.kotlinx.dataframe.util.CREATE_FRAME_COLUMN_IMPORT
+import org.jetbrains.kotlinx.dataframe.util.CREATE_FRAME_COLUMN_REPLACE
 import kotlin.reflect.KClass
 import kotlin.reflect.KProperty
 import kotlin.reflect.KType
@@ -45,6 +48,9 @@ public interface DataColumn<out T> : BaseColumn<T> {
         /**
          * Creates [ValueColumn] using given [name], [values] and [type].
          *
+         * Be careful; values are NOT checked to adhere to [type] for efficiency,
+         * unless you specify [infer].
+         *
          * @param name name of the column
          * @param values list of column values
          * @param type type of the column
@@ -56,11 +62,20 @@ public interface DataColumn<out T> : BaseColumn<T> {
             type: KType,
             infer: Infer = Infer.None,
             defaultValue: T? = null,
-        ): ValueColumn<T> = ValueColumnImpl(values, name, getValuesType(values, type, infer), defaultValue)
+        ): ValueColumn<T> =
+            ValueColumnImpl(
+                values = values,
+                name = name,
+                type = getValuesType(values, type, infer),
+                defaultValue = defaultValue,
+            )
 
         /**
          * Creates [ValueColumn] using given [name], [values] and reified column [type].
          *
+         * Be careful; values are NOT checked to adhere to [type] for efficiency,
+         * unless you specify [infer].
+         *
          * Note, that column [type] will be defined at compile-time using [T] argument
          *
          * @param T type of the column
@@ -74,33 +89,92 @@ public interface DataColumn<out T> : BaseColumn<T> {
             infer: Infer = Infer.None,
         ): ValueColumn<T> =
             createValueColumn(
-                name,
-                values,
-                getValuesType(
-                    values,
-                    typeOf<T>(),
-                    infer,
-                ),
+                name = name,
+                values = values,
+                type = typeOf<T>(),
+                infer = infer,
             )
 
+        /**
+         * Creates [ColumnGroup] using the given [name] and [df] representing the group of columns.
+         *
+         * @param name name of the column group
+         * @param df the collection of columns representing the column group
+         */
         public fun <T> createColumnGroup(name: String, df: DataFrame<T>): ColumnGroup<T> = ColumnGroupImpl(name, df)
 
+        @Deprecated(
+            message = CREATE_FRAME_COLUMN,
+            replaceWith = ReplaceWith(CREATE_FRAME_COLUMN_REPLACE, CREATE_FRAME_COLUMN_IMPORT),
+            level = DeprecationLevel.WARNING,
+        )
         public fun <T> createFrameColumn(name: String, df: DataFrame<T>, startIndices: Iterable<Int>): FrameColumn<T> =
             FrameColumnImpl(name, df.splitByIndices(startIndices.asSequence()).toList(), lazy { df.schema() })
 
+        /**
+         * Creates [FrameColumn] using the given [name] and list of dataframes [groups].
+         *
+         * Be careful; [groups] must be a non-null list of [DataFrames][DataFrame].
+         * This is NOT checked at runtime for efficiency, nor is the validity of given [schema].
+         *
+         * @param name name of the frame column
+         * @param groups the dataframes to be put in the column
+         * @param schema an optional (lazily calculated) [DataFrameSchema] representing
+         *   the intersecting schema of [groups]
+         */
         public fun <T> createFrameColumn(
             name: String,
             groups: List<DataFrame<T>>,
             schema: Lazy<DataFrameSchema>? = null,
         ): FrameColumn<T> = FrameColumnImpl(name, groups, schema)
 
+        /**
+         * Creates either a [FrameColumn], [ColumnGroup], or [ValueColumn] by analyzing each value in
+         * [values].
+         * This is safer but less efficient than the other functions.
+         *
+         * Some conversions are done automatically to attempt to unify the values, like:
+         * - `null` -> [DataFrame.empty][DataFrame.empty]`()` and [DataRow] -> single-row [DataFrame] when there are other
+         *   [DataFrames][DataFrame] present in [values]
+         * - [List][List]`<`[DataRow][DataRow]`<*>>` -> [DataFrame]
+         * etc.
+         *
+         * @param name name of the column
+         * @param values the values to represent each row in the column
+         * @param nullable optionally specifies whether [values] contains nulls; if `null`, it is inferred.
+         * @param allColsMakesColGroup if `true`, then, if all values are non-null same-sized columns,
+         *   a column group will be created instead of a [DataColumn][DataColumn]`<`[AnyCol][AnyCol]`>`.
+         */
         public fun <T> createWithTypeInference(
             name: String,
             values: List<T>,
             nullable: Boolean? = null,
-        ): DataColumn<T> = guessColumnType(name, values, nullable = nullable)
+            allColsMakesColGroup: Boolean = false,
+        ): DataColumn<T> =
+            createColumnGuessingType(
+                name = name,
+                values = values,
+                nullable = nullable,
+                allColsMakesColGroup = allColsMakesColGroup,
+            )
 
-        public fun <T> create(
+        /**
+         * Calls [createColumnGroup], [createFrameColumn], or [createValueColumn] based on
+         * [type].
+         *
+         * Be careful; values in [values] are NOT checked to adhere to the given [type], nor
+         * do we check whether there are nulls among the values when the given type is [DataFrame]
+         * (a [FrameColumn] cannot contain `null`; this causes runtime exceptions).
+         * When [type] is `DataFrame<*>?`, a [ValueColumn] is created to avoid this issue.
+         *
+         * This may be unsafe but is more efficient than [createWithTypeInference].
+         *
+         * @param name the name of the column
+         * @param values the values to represent each row in the column
+         * @param type the (unchecked) common type of [values]
+         * @param infer in case a [ValueColumn] is created, this controls how/whether types need to be inferred
+         */
+        public fun <T> createUnsafe(
             name: String,
             values: List<T>,
             type: KType,
@@ -112,9 +186,29 @@ public interface DataColumn<out T> : BaseColumn<T> {
                 ColumnKind.Frame -> createFrameColumn(name, values as List<AnyFrame>).asDataColumn().cast()
             }
 
-        public inline fun <reified T> create(name: String, values: List<T>, infer: Infer = Infer.None): DataColumn<T> =
-            create(name, values, typeOf<T>(), infer)
+        /**
+         * Calls [createColumnGroup], [createFrameColumn], or [createValueColumn] based on
+         * type [T].
+         *
+         * Be careful; values in [values] are NOT checked to adhere to the given type [T], nor
+         * do we check whether there are nulls among the values when the given type is [DataFrame]
+         * (a [FrameColumn] cannot contain `null`; this causes runtime exceptions).
+         * When [T] is `DataFrame<*>?`, a [ValueColumn] is created to avoid this issue.
+         *
+         * This may be unsafe but is more efficient than [createWithTypeInference].
+         *
+         * @param T the (unchecked) common type of [values]
+         * @param name the name of the column
+         * @param values the values to represent each row in the column
+         * @param infer in case a [ValueColumn] is created, this controls how/whether types need to be inferred
+         */
+        public inline fun <reified T> createUnsafe(
+            name: String,
+            values: List<T>,
+            infer: Infer = Infer.None,
+        ): DataColumn<T> = createUnsafe(name, values, typeOf<T>(), infer)
 
+        /** Creates an empty [DataColumn] with given [name]. */
         public fun empty(name: String = ""): AnyCol = createValueColumn(name, emptyList<Unit>(), typeOf<Unit>())
     }
 

diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/TypeConversions.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/TypeConversions.kt
@@ -234,16 +234,22 @@ public enum class Infer {
 
     /**
      * Use reified type argument of an inline [DataFrame] operation as [DataColumn.type].
+     *
+     * This is the most efficient but least safe option.
      */
     None,
 
     /**
-     * Use reified type argument of an inline [DataFrame] operation as [DataColumn.type], but compute [DataColumn.hasNulls] by checking column [DataColumn.values] for an actual presence of *null* values.
+     * Use reified type argument of an inline [DataFrame] operation as [DataColumn.type],
+     * but compute [DataColumn.hasNulls] by checking column [DataColumn.values] for an actual presence of `null` values.
      */
     Nulls,
 
     /**
-     * Infer [DataColumn.type] and [DataColumn.hasNulls] from actual [DataColumn.values] using optionally provided base type as an upper bound.
+     * Infer [DataColumn.type] and [DataColumn.hasNulls] from actual [DataColumn.values] using an optionally provided
+     * base type as an upper bound.
+     *
+     * This is the least efficient but safest option.
      */
     Type,
 
@@ -306,17 +312,17 @@ public inline fun <reified T> Iterable<T>.toColumn(name: String = "", infer: Inf
     if (infer == Infer.Type) {
         DataColumn.createWithTypeInference(name, asList())
     } else {
-        DataColumn.create(name, asList(), typeOf<T>(), infer)
+        DataColumn.createUnsafe(name, asList(), typeOf<T>(), infer)
     }.forceResolve()
 
 public inline fun <reified T> Iterable<*>.toColumnOf(name: String = ""): DataColumn<T> =
-    DataColumn.create(name, asList() as List<T>, typeOf<T>()).forceResolve()
+    DataColumn.createUnsafe(name, asList() as List<T>, typeOf<T>()).forceResolve()
 
 public inline fun <reified T> Iterable<T>.toColumn(ref: ColumnReference<T>): DataColumn<T> =
-    DataColumn.create(ref.name(), asList()).forceResolve()
+    DataColumn.createUnsafe(ref.name(), asList()).forceResolve()
 
 public inline fun <reified T> Iterable<T>.toColumn(property: KProperty<T>): DataColumn<T> =
-    DataColumn.create(property.columnName, asList()).forceResolve()
+    DataColumn.createUnsafe(property.columnName, asList()).forceResolve()
 
 public fun Iterable<String>.toPath(): ColumnPath = ColumnPath(asList())
 

diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/chunked.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/chunked.kt
@@ -6,13 +6,18 @@ import org.jetbrains.kotlinx.dataframe.DataRow
 import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup
 import org.jetbrains.kotlinx.dataframe.columns.FrameColumn
 import org.jetbrains.kotlinx.dataframe.columns.ValueColumn
+import org.jetbrains.kotlinx.dataframe.impl.api.chunkedImpl
 import org.jetbrains.kotlinx.dataframe.impl.getListType
 import org.jetbrains.kotlinx.dataframe.nrow
 import org.jetbrains.kotlinx.dataframe.type
 
+/**
+ * Creates a [FrameColumn] from [this] by splitting the dataframe into
+ * smaller ones, with their number of rows at most [size].
+ */
 public fun <T> DataFrame<T>.chunked(size: Int, name: String = "groups"): FrameColumn<T> {
     val startIndices = (0 until nrow step size)
-    return DataColumn.createFrameColumn(name, this, startIndices)
+    return this.chunkedImpl(startIndices, name)
 }
 
 public fun <T> DataColumn<T>.chunked(size: Int): ValueColumn<List<T>> {

diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/constructors.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/constructors.kt
@@ -24,7 +24,7 @@ import org.jetbrains.kotlinx.dataframe.impl.api.withValuesImpl
 import org.jetbrains.kotlinx.dataframe.impl.asList
 import org.jetbrains.kotlinx.dataframe.impl.columnName
 import org.jetbrains.kotlinx.dataframe.impl.columns.ColumnAccessorImpl
-import org.jetbrains.kotlinx.dataframe.impl.columns.createColumn
+import org.jetbrains.kotlinx.dataframe.impl.columns.createColumnGuessingType
 import org.jetbrains.kotlinx.dataframe.impl.columns.createComputedColumnReference
 import org.jetbrains.kotlinx.dataframe.impl.columns.forceResolve
 import org.jetbrains.kotlinx.dataframe.impl.columns.unbox
@@ -223,7 +223,13 @@ public class ColumnDelegate<T>(private val parent: ColumnGroupReference? = null)
 // region create DataColumn
 
 public inline fun <reified T> columnOf(vararg values: T): DataColumn<T> =
-    createColumn(values.asIterable(), typeOf<T>(), true).forceResolve()
+    createColumnGuessingType(
+        values = values.asIterable(),
+        suggestedType = typeOf<T>(),
+        guessTypeWithSuggestedAsUpperbound = true,
+        listifyValues = false,
+        allColsMakesColGroup = true,
+    ).forceResolve()
 
 public fun columnOf(vararg values: AnyBaseCol): DataColumn<AnyRow> = columnOf(values.asIterable()).forceResolve()
 
@@ -244,7 +250,12 @@ public fun <T> columnOf(frames: Iterable<DataFrame<T>>): FrameColumn<T> =
     ).forceResolve()
 
 public inline fun <reified T> column(values: Iterable<T>): DataColumn<T> =
-    createColumn(values, typeOf<T>(), false).forceResolve()
+    createColumnGuessingType(
+        values = values,
+        suggestedType = typeOf<T>(),
+        guessTypeWithSuggestedAsUpperbound = false,
+        allColsMakesColGroup = true,
+    ).forceResolve()
 
 // endregion
 
@@ -274,8 +285,10 @@ public fun dataFrameOf(vararg columns: AnyBaseCol): DataFrame<*> = dataFrameOf(c
 @Interpretable("DataFrameOf0")
 public fun dataFrameOf(vararg header: String): DataFrameBuilder = dataFrameOf(header.toList())
 
-public inline fun <reified C> dataFrameOf(vararg header: String, fill: (String) -> Iterable<C>): DataFrame<*> =
-    dataFrameOf(header.asIterable(), fill)
+public inline fun <reified C> dataFrameOf(
+    vararg header: String,
+    crossinline fill: (String) -> Iterable<C>,
+): DataFrame<*> = dataFrameOf(header.asIterable()).invoke(fill)
 
 public fun dataFrameOf(header: Iterable<String>): DataFrameBuilder = DataFrameBuilder(header.asList())
 
@@ -289,9 +302,12 @@ public fun dataFrameOf(header: Iterable<String>, values: Iterable<Any?>): DataFr
 
 public inline fun <T, reified C> dataFrameOf(header: Iterable<T>, fill: (T) -> Iterable<C>): DataFrame<*> =
     header.map { value ->
-        fill(value).asList().let {
-            DataColumn.create(value.toString(), it)
-        }
+        createColumnGuessingType(
+            name = value.toString(),
+            values = fill(value).asList(),
+            suggestedType = typeOf<C>(),
+            guessTypeWithSuggestedAsUpperbound = true,
+        )
     }.toDataFrame()
 
 public fun dataFrameOf(header: CharProgression): DataFrameBuilder = dataFrameOf(header.map { it.toString() })
@@ -320,16 +336,19 @@ public class DataFrameBuilder(private val header: List<String>) {
 
     public operator fun invoke(args: Sequence<Any?>): DataFrame<*> = invoke(*args.toList().toTypedArray())
 
-    public fun withColumns(columnBuilder: (String) -> AnyCol): DataFrame<*> = header.map(columnBuilder).toDataFrame()
+    public fun withColumns(columnBuilder: (String) -> AnyCol): DataFrame<*> =
+        header
+            .map { columnBuilder(it) named it } // create the columns and make sure to rename them to the given header
+            .toDataFrame()
 
     public inline operator fun <reified T> invoke(crossinline valuesBuilder: (String) -> Iterable<T>): DataFrame<*> =
         withColumns { name ->
-            valuesBuilder(name).let {
-                DataColumn.create(
-                    name = name,
-                    values = it.asList(),
-                )
-            }
+            createColumnGuessingType(
+                name = name,
+                values = valuesBuilder(name).asList(),
+                suggestedType = typeOf<T>(),
+                guessTypeWithSuggestedAsUpperbound = true,
+            )
         }
 
     public inline fun <reified C> fill(nrow: Int, value: C): DataFrame<*> =
@@ -341,30 +360,39 @@ public class DataFrameBuilder(private val header: List<String>) {
             )
         }
 
+    public fun fill(nrow: Int, dataFrame: AnyFrame): DataFrame<*> =
+        withColumns { name ->
+            DataColumn.createFrameColumn(
+                name = name,
+                groups = List(nrow) { dataFrame },
+                schema = lazy { dataFrame.schema() },
+            )
+        }
+
     public inline fun <reified C> nulls(nrow: Int): DataFrame<*> = fill<C?>(nrow, null)
 
     public inline fun <reified C> fillIndexed(nrow: Int, crossinline init: (Int, String) -> C): DataFrame<*> =
         withColumns { name ->
-            DataColumn.create(
-                name,
-                List(nrow) { init(it, name) },
+            DataColumn.createWithTypeInference(
+                name = name,
+                values = List(nrow) { init(it, name) },
             )
         }
 
     public inline fun <reified C> fill(nrow: Int, crossinline init: (Int) -> C): DataFrame<*> =
         withColumns { name ->
-            DataColumn.create(
+            DataColumn.createWithTypeInference(
                 name = name,
                 values = List(nrow, init),
             )
         }
 
-    private inline fun <reified C> fillNotNull(nrow: Int, crossinline init: (Int) -> C) =
+    private inline fun <reified C> fillNotNull(nrow: Int, crossinline init: (Int) -> C & Any) =
         withColumns { name ->
             DataColumn.createValueColumn(
                 name = name,
                 values = List(nrow, init),
-                type = typeOf<C>(),
+                type = typeOf<C>().withNullability(false),
             )
         }