Skip to content
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 107 additions & 13 deletions core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/DataColumn.kt
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,14 @@ import org.jetbrains.kotlinx.dataframe.impl.columns.ColumnGroupImpl
import org.jetbrains.kotlinx.dataframe.impl.columns.FrameColumnImpl
import org.jetbrains.kotlinx.dataframe.impl.columns.ValueColumnImpl
import org.jetbrains.kotlinx.dataframe.impl.columns.addPath
import org.jetbrains.kotlinx.dataframe.impl.columns.guessColumnType
import org.jetbrains.kotlinx.dataframe.impl.columns.createColumnGuessingType
import org.jetbrains.kotlinx.dataframe.impl.columns.toColumnKind
import org.jetbrains.kotlinx.dataframe.impl.getValuesType
import org.jetbrains.kotlinx.dataframe.impl.splitByIndices
import org.jetbrains.kotlinx.dataframe.schema.DataFrameSchema
import org.jetbrains.kotlinx.dataframe.util.CREATE_FRAME_COLUMN
import org.jetbrains.kotlinx.dataframe.util.CREATE_FRAME_COLUMN_IMPORT
import org.jetbrains.kotlinx.dataframe.util.CREATE_FRAME_COLUMN_REPLACE
import kotlin.reflect.KClass
import kotlin.reflect.KProperty
import kotlin.reflect.KType
Expand All @@ -45,6 +48,9 @@ public interface DataColumn<out T> : BaseColumn<T> {
/**
* Creates [ValueColumn] using given [name], [values] and [type].
*
* Be careful; values are NOT checked to adhere to [type] for efficiency,
* unless you specify [infer].
*
* @param name name of the column
* @param values list of column values
* @param type type of the column
Expand All @@ -56,11 +62,20 @@ public interface DataColumn<out T> : BaseColumn<T> {
type: KType,
infer: Infer = Infer.None,
defaultValue: T? = null,
): ValueColumn<T> = ValueColumnImpl(values, name, getValuesType(values, type, infer), defaultValue)
): ValueColumn<T> =
ValueColumnImpl(
values = values,
name = name,
type = getValuesType(values, type, infer),
defaultValue = defaultValue,
)

/**
* Creates [ValueColumn] using given [name], [values] and reified column [type].
*
* Be careful; values are NOT checked to adhere to [type] for efficiency,
* unless you specify [infer].
*
* Note that the column [type] is determined at compile time from the type argument [T].
*
* @param T type of the column
Expand All @@ -74,33 +89,92 @@ public interface DataColumn<out T> : BaseColumn<T> {
infer: Infer = Infer.None,
): ValueColumn<T> =
createValueColumn(
name,
values,
getValuesType(
values,
typeOf<T>(),
infer,
),
name = name,
values = values,
type = typeOf<T>(),
infer = infer,
)

/**
 * Wraps the columns of [df] in a [ColumnGroup] called [name].
 *
 * @param name the name given to the resulting column group
 * @param df the dataframe whose columns make up the group
 * @return the newly created [ColumnGroup]
 */
public fun <T> createColumnGroup(name: String, df: DataFrame<T>): ColumnGroup<T> {
    return ColumnGroupImpl(name, df)
}

/**
 * Creates a [FrameColumn] called [name] from the dataframes obtained by calling
 * `splitByIndices` on [df] with [startIndices].
 * The schema of the resulting column is taken lazily from [df].
 *
 * @param name name of the frame column
 * @param df the dataframe that is split up
 * @param startIndices the indices handed to `splitByIndices`
 */
@Deprecated(
    message = CREATE_FRAME_COLUMN,
    replaceWith = ReplaceWith(CREATE_FRAME_COLUMN_REPLACE, CREATE_FRAME_COLUMN_IMPORT),
    level = DeprecationLevel.WARNING,
)
public fun <T> createFrameColumn(name: String, df: DataFrame<T>, startIndices: Iterable<Int>): FrameColumn<T> {
    val frames = df.splitByIndices(startIndices.asSequence()).toList()
    // the schema is only computed when it is first requested
    val schema = lazy { df.schema() }
    return FrameColumnImpl(name, frames, schema)
}

/**
 * Builds a [FrameColumn] called [name] whose values are the dataframes in [groups].
 *
 * Be careful; for efficiency nothing is verified at runtime: [groups] must be a
 * non-null list of [DataFrames][DataFrame], and the validity of a given [schema]
 * is not checked either.
 *
 * @param name name of the frame column
 * @param groups the dataframes to be put in the column
 * @param schema an optional (lazily calculated) [DataFrameSchema] representing
 *   the intersecting schema of [groups]; `null` by default
 */
public fun <T> createFrameColumn(
    name: String,
    groups: List<DataFrame<T>>,
    schema: Lazy<DataFrameSchema>? = null,
): FrameColumn<T> {
    return FrameColumnImpl(name, groups, schema)
}

/**
* Creates either a [FrameColumn], [ColumnGroup], or [ValueColumn] by analyzing each value in
* [values].
* This is safer but less efficient than the other functions.
*
* Some conversions are done automatically to attempt to unify the values, like:
* - `null` -> [DataFrame.empty][DataFrame.empty]`()` and [DataRow] -> single-row [DataFrame] when there are other
* [DataFrames][DataFrame] present in [values]
* - [List][List]`<`[DataRow][DataRow]`<*>>` -> [DataFrame]
* etc.
*
* @param name name of the column
* @param values the values to represent each row in the column
* @param nullable optionally you can specify whether [values] contains nulls, if `null` it is inferred.
* @param allColsMakesColGroup if `true`, then, if all values are non-null same-sized columns,
* a column group will be created instead of a [DataColumn][DataColumn]`<`[AnyCol][AnyCol]`>`.
*/
public fun <T> createWithTypeInference(
name: String,
values: List<T>,
nullable: Boolean? = null,
): DataColumn<T> = guessColumnType(name, values, nullable = nullable)
allColsMakesColGroup: Boolean = false,
): DataColumn<T> =
createColumnGuessingType(
name = name,
values = values,
nullable = nullable,
allColsMakesColGroup = allColsMakesColGroup,
)

public fun <T> create(
/**
* Calls [createColumnGroup], [createFrameColumn], or [createValueColumn] based on
* [type].
*
* Be careful; Values in [values] are NOT checked to adhere to the given [type], nor
* do we check whether there are nulls among the values when the given type is [DataFrame]
* (a [FrameColumn] cannot contain `null`, this causes runtime exceptions).
* When [type] is `DataFrame<*>?`, a [ValueColumn] is created to avoid this issue.
*
* This may be unsafe but is more efficient than [createWithTypeInference].
*
* @param name the name of the column
* @param values the values to represent each row in the column
* @param type the (unchecked) common type of [values]
* @param infer in case a [ValueColumn] is created, this controls how/whether types need to be inferred
*/
public fun <T> createUnsafe(
name: String,
values: List<T>,
type: KType,
Expand All @@ -112,9 +186,29 @@ public interface DataColumn<out T> : BaseColumn<T> {
ColumnKind.Frame -> createFrameColumn(name, values as List<AnyFrame>).asDataColumn().cast()
}

public inline fun <reified T> create(name: String, values: List<T>, infer: Infer = Infer.None): DataColumn<T> =
create(name, values, typeOf<T>(), infer)
/**
 * Calls [createColumnGroup], [createFrameColumn], or [createValueColumn] based on
 * the reified type argument [T].
 *
 * Be careful; the elements of [values] are NOT checked to adhere to [T], nor
 * do we check whether there are nulls among the values when [T] is [DataFrame]
 * (a [FrameColumn] cannot contain `null`, this causes runtime exceptions).
 * When [T] is `DataFrame<*>?`, a [ValueColumn] is created to avoid this issue.
 *
 * This may be unsafe but is more efficient than [createWithTypeInference].
 *
 * @param T the (unchecked) common type of [values]
 * @param name the name of the column
 * @param values the values to represent each row in the column
 * @param infer in case a [ValueColumn] is created, this controls how/whether types need to be inferred
 */
public inline fun <reified T> createUnsafe(
    name: String,
    values: List<T>,
    infer: Infer = Infer.None,
): DataColumn<T> {
    // capture the compile-time type and hand over to the KType-based overload
    val type = typeOf<T>()
    return createUnsafe(name, values, type, infer)
}

/**
 * Returns a [DataColumn] called [name] that holds no values.
 * It is created as a value column of type [Unit].
 *
 * @param name the name of the column, an empty string by default
 */
public fun empty(name: String = ""): AnyCol {
    val noValues = emptyList<Unit>()
    return createValueColumn(name = name, values = noValues, type = typeOf<Unit>())
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -234,16 +234,22 @@ public enum class Infer {

/**
* Use reified type argument of an inline [DataFrame] operation as [DataColumn.type].
*
* This is the most efficient but least safe option.
*/
None,

/**
* Use reified type argument of an inline [DataFrame] operation as [DataColumn.type], but compute [DataColumn.hasNulls] by checking column [DataColumn.values] for an actual presence of *null* values.
* Use reified type argument of an inline [DataFrame] operation as [DataColumn.type],
* but compute [DataColumn.hasNulls] by checking column [DataColumn.values] for an actual presence of `null` values.
*/
Nulls,

/**
* Infer [DataColumn.type] and [DataColumn.hasNulls] from actual [DataColumn.values] using optionally provided base type as an upper bound.
* Infer [DataColumn.type] and [DataColumn.hasNulls] from actual [DataColumn.values] using an optionally provided
* base type as an upper bound.
*
* This is the least efficient but safest option.
*/
Type,

Expand Down Expand Up @@ -306,17 +312,17 @@ public inline fun <reified T> Iterable<T>.toColumn(name: String = "", infer: Inf
if (infer == Infer.Type) {
DataColumn.createWithTypeInference(name, asList())
} else {
DataColumn.create(name, asList(), typeOf<T>(), infer)
DataColumn.createUnsafe(name, asList(), typeOf<T>(), infer)
}.forceResolve()

public inline fun <reified T> Iterable<*>.toColumnOf(name: String = ""): DataColumn<T> =
DataColumn.create(name, asList() as List<T>, typeOf<T>()).forceResolve()
DataColumn.createUnsafe(name, asList() as List<T>, typeOf<T>()).forceResolve()

public inline fun <reified T> Iterable<T>.toColumn(ref: ColumnReference<T>): DataColumn<T> =
DataColumn.create(ref.name(), asList()).forceResolve()
DataColumn.createUnsafe(ref.name(), asList()).forceResolve()

public inline fun <reified T> Iterable<T>.toColumn(property: KProperty<T>): DataColumn<T> =
DataColumn.create(property.columnName, asList()).forceResolve()
DataColumn.createUnsafe(property.columnName, asList()).forceResolve()

/** Builds a [ColumnPath] from the strings of this iterable. */
public fun Iterable<String>.toPath(): ColumnPath {
    val names = asList()
    return ColumnPath(names)
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,18 @@ import org.jetbrains.kotlinx.dataframe.DataRow
import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup
import org.jetbrains.kotlinx.dataframe.columns.FrameColumn
import org.jetbrains.kotlinx.dataframe.columns.ValueColumn
import org.jetbrains.kotlinx.dataframe.impl.api.chunkedImpl
import org.jetbrains.kotlinx.dataframe.impl.getListType
import org.jetbrains.kotlinx.dataframe.nrow
import org.jetbrains.kotlinx.dataframe.type

/**
* Creates a [FrameColumn] from [this] by splitting the dataframe into
* smaller dataframes, each containing at most [size] rows.
*/
public fun <T> DataFrame<T>.chunked(size: Int, name: String = "groups"): FrameColumn<T> {
val startIndices = (0 until nrow step size)
return DataColumn.createFrameColumn(name, this, startIndices)
return this.chunkedImpl(startIndices, name)
}

public fun <T> DataColumn<T>.chunked(size: Int): ValueColumn<List<T>> {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ import org.jetbrains.kotlinx.dataframe.impl.api.withValuesImpl
import org.jetbrains.kotlinx.dataframe.impl.asList
import org.jetbrains.kotlinx.dataframe.impl.columnName
import org.jetbrains.kotlinx.dataframe.impl.columns.ColumnAccessorImpl
import org.jetbrains.kotlinx.dataframe.impl.columns.createColumn
import org.jetbrains.kotlinx.dataframe.impl.columns.createColumnGuessingType
import org.jetbrains.kotlinx.dataframe.impl.columns.createComputedColumnReference
import org.jetbrains.kotlinx.dataframe.impl.columns.forceResolve
import org.jetbrains.kotlinx.dataframe.impl.columns.unbox
Expand Down Expand Up @@ -223,7 +223,13 @@ public class ColumnDelegate<T>(private val parent: ColumnGroupReference? = null)
// region create DataColumn

public inline fun <reified T> columnOf(vararg values: T): DataColumn<T> =
createColumn(values.asIterable(), typeOf<T>(), true).forceResolve()
createColumnGuessingType(
values = values.asIterable(),
suggestedType = typeOf<T>(),
guessTypeWithSuggestedAsUpperbound = true,
listifyValues = false,
allColsMakesColGroup = true,
).forceResolve()

/**
 * Vararg convenience overload: forwards [values] as an [Iterable] to the matching
 * `columnOf` overload and applies `forceResolve()` to the result.
 */
public fun columnOf(vararg values: AnyBaseCol): DataColumn<AnyRow> {
    val columns = values.asIterable()
    return columnOf(columns).forceResolve()
}

Expand All @@ -244,7 +250,12 @@ public fun <T> columnOf(frames: Iterable<DataFrame<T>>): FrameColumn<T> =
).forceResolve()

public inline fun <reified T> column(values: Iterable<T>): DataColumn<T> =
createColumn(values, typeOf<T>(), false).forceResolve()
createColumnGuessingType(
values = values,
suggestedType = typeOf<T>(),
guessTypeWithSuggestedAsUpperbound = false,
allColsMakesColGroup = true,
).forceResolve()

// endregion

Expand Down Expand Up @@ -274,8 +285,10 @@ public fun dataFrameOf(vararg columns: AnyBaseCol): DataFrame<*> = dataFrameOf(c
/**
 * Vararg convenience overload: copies [header] into a list and forwards it to the
 * `dataFrameOf` overload taking the header as an [Iterable], yielding a [DataFrameBuilder].
 */
@Interpretable("DataFrameOf0")
public fun dataFrameOf(vararg header: String): DataFrameBuilder {
    val names = header.toList()
    return dataFrameOf(names)
}

public inline fun <reified C> dataFrameOf(vararg header: String, fill: (String) -> Iterable<C>): DataFrame<*> =
dataFrameOf(header.asIterable(), fill)
public inline fun <reified C> dataFrameOf(
vararg header: String,
crossinline fill: (String) -> Iterable<C>,
): DataFrame<*> = dataFrameOf(header.asIterable()).invoke(fill)

/** Creates a [DataFrameBuilder] whose header consists of the given names. */
public fun dataFrameOf(header: Iterable<String>): DataFrameBuilder {
    val names = header.asList()
    return DataFrameBuilder(names)
}

Expand All @@ -289,9 +302,12 @@ public fun dataFrameOf(header: Iterable<String>, values: Iterable<Any?>): DataFr

public inline fun <T, reified C> dataFrameOf(header: Iterable<T>, fill: (T) -> Iterable<C>): DataFrame<*> =
header.map { value ->
fill(value).asList().let {
DataColumn.create(value.toString(), it)
}
createColumnGuessingType(
name = value.toString(),
values = fill(value).asList(),
suggestedType = typeOf<C>(),
guessTypeWithSuggestedAsUpperbound = true,
)
}.toDataFrame()

/**
 * Creates a [DataFrameBuilder] from a progression of characters;
 * every character is turned into a one-character header name.
 */
public fun dataFrameOf(header: CharProgression): DataFrameBuilder {
    val names = header.map(Char::toString)
    return dataFrameOf(names)
}
Expand Down Expand Up @@ -320,16 +336,19 @@ public class DataFrameBuilder(private val header: List<String>) {

/** Materializes [args] into an array and forwards it, spread, to the vararg `invoke`. */
public operator fun invoke(args: Sequence<Any?>): DataFrame<*> {
    val asArray = args.toList().toTypedArray()
    return invoke(*asArray)
}

public fun withColumns(columnBuilder: (String) -> AnyCol): DataFrame<*> = header.map(columnBuilder).toDataFrame()
/**
 * Builds a dataframe with one column per header entry, each produced by [columnBuilder].
 *
 * @param columnBuilder creates the column for a given header name
 */
public fun withColumns(columnBuilder: (String) -> AnyCol): DataFrame<*> {
    val columns = header.map { headerName ->
        // create the column and make sure it is renamed to the given header
        columnBuilder(headerName) named headerName
    }
    return columns.toDataFrame()
}

public inline operator fun <reified T> invoke(crossinline valuesBuilder: (String) -> Iterable<T>): DataFrame<*> =
withColumns { name ->
valuesBuilder(name).let {
DataColumn.create(
name = name,
values = it.asList(),
)
}
createColumnGuessingType(
name = name,
values = valuesBuilder(name).asList(),
suggestedType = typeOf<T>(),
guessTypeWithSuggestedAsUpperbound = true,
)
}

public inline fun <reified C> fill(nrow: Int, value: C): DataFrame<*> =
Expand All @@ -341,30 +360,39 @@ public class DataFrameBuilder(private val header: List<String>) {
)
}

/**
 * Creates a dataframe in which every header entry becomes a frame column
 * holding [dataFrame] repeated [nrow] times.
 *
 * @param nrow the number of rows of each created column
 * @param dataFrame the dataframe placed in every row
 */
public fun fill(nrow: Int, dataFrame: AnyFrame): DataFrame<*> {
    return withColumns { columnTitle ->
        val repeated = List(nrow) { dataFrame }
        // the schema is only computed when it is first requested
        val lazySchema = lazy { dataFrame.schema() }
        DataColumn.createFrameColumn(
            name = columnTitle,
            groups = repeated,
            schema = lazySchema,
        )
    }
}

/**
 * Creates a dataframe by calling `fill` with the nullable type `C?` and `null` as the value.
 *
 * @param nrow the number of rows passed on to `fill`
 */
public inline fun <reified C> nulls(nrow: Int): DataFrame<*> {
    return fill<C?>(nrow = nrow, value = null)
}

public inline fun <reified C> fillIndexed(nrow: Int, crossinline init: (Int, String) -> C): DataFrame<*> =
withColumns { name ->
DataColumn.create(
name,
List(nrow) { init(it, name) },
DataColumn.createWithTypeInference(
name = name,
values = List(nrow) { init(it, name) },
)
}

public inline fun <reified C> fill(nrow: Int, crossinline init: (Int) -> C): DataFrame<*> =
withColumns { name ->
DataColumn.create(
DataColumn.createWithTypeInference(
name = name,
values = List(nrow, init),
)
}

private inline fun <reified C> fillNotNull(nrow: Int, crossinline init: (Int) -> C) =
private inline fun <reified C> fillNotNull(nrow: Int, crossinline init: (Int) -> C & Any) =
withColumns { name ->
DataColumn.createValueColumn(
name = name,
values = List(nrow, init),
type = typeOf<C>(),
type = typeOf<C>().withNullability(false),
)
}

Expand Down
Loading
Loading