Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@ import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.columns.ColumnAccessor
import org.jetbrains.kotlinx.dataframe.columns.ColumnReference
import org.jetbrains.kotlinx.dataframe.columns.ColumnWithPath
import org.jetbrains.kotlinx.dataframe.columns.FrameColumn
import org.jetbrains.kotlinx.dataframe.columns.toColumnSet
import org.jetbrains.kotlinx.dataframe.impl.DELIMITED_STRING_REGEX
import org.jetbrains.kotlinx.dataframe.impl.DELIMITERS_REGEX
import org.jetbrains.kotlinx.dataframe.impl.api.renameImpl
import org.jetbrains.kotlinx.dataframe.impl.columnName
import org.jetbrains.kotlinx.dataframe.impl.toCamelCaseByDelimiters
import org.jetbrains.kotlinx.dataframe.util.ITERABLE_COLUMNS_DEPRECATION_MESSAGE
Expand Down Expand Up @@ -42,47 +44,47 @@ public fun <T, C> DataFrame<T>.rename(cols: Iterable<ColumnReference<C>>): Renam

/**
 * Intermediate state of a rename operation.
 *
 * Holds the [DataFrame] whose columns are being renamed ([df]) together with the selector
 * of the [columns] that should receive new names. The rename is completed by calling one of
 * the `into` overloads or `toCamelCase` on this clause.
 */
public data class RenameClause<T, C>(val df: DataFrame<T>, val columns: ColumnsSelector<T, C>)

/**
* ## Rename to camelCase
*
* This function renames all columns to `camelCase` by replacing all [delimiters][DELIMITERS_REGEX]
* and converting the first char to lowercase.
* Even [DataFrames][DataFrame] inside [FrameColumns][FrameColumn] are traversed recursively.
*/
public fun <T> DataFrame<T>.renameToCamelCase(): DataFrame<T> = this
// recursively rename all column groups to camel case
// recursively rename all columns written with delimiters or starting with a capital to camel case
.rename {
groups { it.name() matches DELIMITED_STRING_REGEX }.recursively()
}.toCamelCase()

// recursively rename all other columns to camel case
.rename {
cols { !it.isColumnGroup() && it.name() matches DELIMITED_STRING_REGEX }.recursively()
cols { it.name() matches DELIMITED_STRING_REGEX || it.name[0].isUpperCase() }.recursively()
}.toCamelCase()

// take all frame columns recursively and call renameToCamelCase() on all dataframes inside
.update {
colsOf<AnyFrame>().recursively()
}.with { it.renameToCamelCase() }

// convert all first chars of all columns to the lowercase
.rename {
cols { !it.isColumnGroup() }.recursively()
}.into {
it.name.replaceFirstChar { it.lowercaseChar() }
}

/**
 * Renames the selected columns using the names of the given column references.
 *
 * Only the name of each reference in [newColumns] is used; the call is forwarded to the
 * `String`-based `into` overload.
 */
public fun <T, C> RenameClause<T, C>.into(vararg newColumns: ColumnReference<*>): DataFrame<T> {
    val referenceNames = Array(newColumns.size) { index -> newColumns[index].name() }
    return into(*referenceNames)
}

public fun <T, C> RenameClause<T, C>.into(vararg newNames: String): DataFrame<T> =
df.move(columns).intoIndexed { col, index ->
col.path.dropLast(1) + newNames[index]
}
renameImpl(newNames)

/**
 * Renames the selected columns using the names of the given properties.
 *
 * Only the `name` of each property in [newNames] is used; the call is forwarded to the
 * `String`-based `into` overload.
 */
public fun <T, C> RenameClause<T, C>.into(vararg newNames: KProperty<*>): DataFrame<T> {
    val propertyNames = Array(newNames.size) { index -> newNames[index].name }
    return into(*propertyNames)
}

public fun <T, C> RenameClause<T, C>.into(transform: (ColumnWithPath<C>) -> String): DataFrame<T> =
df.move(columns).into {
it.path.dropLast(1) + transform(it)
}

public fun <T, C> RenameClause<T, C>.toCamelCase(): DataFrame<T> =
into { it.name().toCamelCaseByDelimiters(DELIMITERS_REGEX) }
renameImpl(transform)

/**
 * ## Rename to camelCase
 *
 * Gives every selected column a `camelCase` name: the current name is converted with
 * `toCamelCaseByDelimiters` using [DELIMITERS_REGEX], after which its first character
 * is lowercased.
 */
public fun <T, C> RenameClause<T, C>.toCamelCase(): DataFrame<T> =
    into { column ->
        val camelCased = column.name().toCamelCaseByDelimiters(DELIMITERS_REGEX)
        camelCased.replaceFirstChar { first -> first.lowercaseChar() }
    }

// endregion

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ public fun <T, C> ReplaceClause<T, C>.with(newColumns: List<AnyCol>): DataFrame<
}
}

/* TODO: Issue #418: breaks if running on ColumnGroup and its child */
public fun <T, C> ReplaceClause<T, C>.with(transform: ColumnsContainer<T>.(DataColumn<C>) -> AnyBaseCol): DataFrame<T> {
val removeResult = df.removeImpl(columns = columns)
val toInsert = removeResult.removedColumns.map {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
package org.jetbrains.kotlinx.dataframe.impl.api

import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.api.RenameClause
import org.jetbrains.kotlinx.dataframe.api.cast
import org.jetbrains.kotlinx.dataframe.api.getColumnsWithPaths
import org.jetbrains.kotlinx.dataframe.api.insert
import org.jetbrains.kotlinx.dataframe.api.under
import org.jetbrains.kotlinx.dataframe.columns.ColumnKind
import org.jetbrains.kotlinx.dataframe.columns.ColumnWithPath
import org.jetbrains.kotlinx.dataframe.impl.columns.tree.allChildrenNotNull
import org.jetbrains.kotlinx.dataframe.impl.columns.tree.collectTree
import org.jetbrains.kotlinx.dataframe.kind

/**
 * Renames the selected columns to [newNames], where the name at index `i` is given to the
 * `i`-th column in *selection* order.
 *
 * Each name is bound to the original path of its column before renaming starts, so the
 * result does not depend on the order in which the transform-based `renameImpl` overload
 * visits the columns. A running counter would hand names out in visiting order instead,
 * which mis-assigns them whenever the selection order differs from the column order of
 * the frame (e.g. `rename { "b" and "a" }.into("x", "y")`).
 *
 * Surplus names are ignored.
 *
 * @param newNames new column names, positionally matching the selected columns.
 * @return a [DataFrame] in which the selected columns carry their new names.
 * @throws IllegalArgumentException if fewer names than selected columns are supplied.
 */
internal fun <T, C> RenameClause<T, C>.renameImpl(newNames: Array<out String>): DataFrame<T> {
    val selectedPaths = df.getColumnsWithPaths(columns).map { it.path }
    require(newNames.size >= selectedPaths.size) {
        "Expected at least ${selectedPaths.size} new column names, but got ${newNames.size}"
    }

    // original path -> new name, fixed before any column is renamed
    val newNameByPath = selectedPaths.zip(newNames).toMap()

    return renameImpl { column -> newNameByPath[column.path] ?: column.name }
}

/**
 * Renames the selected columns of [RenameClause.df], asking [transform] for the new name of
 * each one, and rebuilds the frame from the renamed column tree.
 *
 * Selected columns are matched to tree nodes by their *original* path rather than by
 * comparing column data. This avoids element-wise column comparisons and cannot confuse
 * two columns that happen to be equal but live under different groups.
 *
 * @param transform receives a selected column (with its original path) and returns its new name.
 *   It is invoked once per selected column, in tree order.
 * @return a [DataFrame] with the same data in which the selected columns carry their new names.
 */
internal fun <T, C> RenameClause<T, C>.renameImpl(transform: (ColumnWithPath<C>) -> String): DataFrame<T> {
    // Keyed by plain lists so that the lookup relies on structural list equality only.
    val selectedByPath = df.getColumnsWithPaths(columns).associateBy { it.path.toList() }
    val tree = df.getColumnsWithPaths { all().rec() }.collectTree()
    val nodes = tree.allChildrenNotNull()

    // Resolve all new names BEFORE mutating the tree: once a parent node has been renamed,
    // pathFromRoot() of its children no longer matches the original column paths.
    val renames = nodes.mapNotNull { node ->
        selectedByPath[node.pathFromRoot().toList()]?.let { column -> node to transform(column) }
    }

    // perform rename in nodes
    renames.forEach { (node, newName) -> node.name = newName }

    // build up a new DataFrame using the modified names
    var newDf = DataFrame.empty(df.rowsCount()).cast<T>()
    nodes.forEach { node ->
        val parentPath = node.pathFromRoot().dropLast(1)
        val col = node.data.rename(node.name)

        when (col.kind) {
            ColumnKind.Value, ColumnKind.Frame ->
                newDf = newDf.insert(col).under(parentPath)

            // Groups are not inserted themselves; they come into existence when a
            // descendant is inserted under their path.
            // NOTE(review): a group without any Value/Frame descendant is therefore never
            // inserted and presumably disappears from the result — confirm this is intended.
            ColumnKind.Group -> Unit
        }
    }

    return newDf
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ package org.jetbrains.kotlinx.dataframe.impl.columns.tree
import org.jetbrains.kotlinx.dataframe.columns.ColumnPath

internal class TreeNode<T>(
override val name: String,
override var name: String,
override val depth: Int,
override var data: T,
override val parent: TreeNode<T>? = null,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,76 @@ import org.jetbrains.kotlinx.dataframe.impl.columns.asAnyFrameColumn
import org.junit.Test

/** Tests for `rename { ... }.into(...)` on flat, grouped and doubly grouped data frames. */
class RenameTests {

    @Test
    fun `simple rename`() {
        val expected = dataFrameOf("a_renamed", "b_renamed", "c_renamed")(
            1, 2, 3,
            4, 5, 6,
        )

        // renaming through a transform and through explicit names must agree
        simpleDf.rename { all() }.into { it.name + "_renamed" } shouldBe expected
        simpleDf.rename { all() }.into("a_renamed", "b_renamed", "c_renamed") shouldBe expected
    }

    @Test
    fun `partial grouped rename`() {
        val flatExpected = dataFrameOf("a_renamed", "b", "c")(
            1, 2, 3,
            4, 5, 6,
        )
        val expected = flatExpected.group { "a_renamed" and "b" }.into("group_renamed")

        // the group and one of its children are selected together
        val actual = groupedDf
            .rename { "group" and "group"["a"] }
            .into { it.name + "_renamed" }

        actual shouldBe expected
    }

    @Test
    fun `grouped rename`() {
        val flatExpected = dataFrameOf("a_renamed", "b_renamed", "c_renamed")(
            1, 2, 3,
            4, 5, 6,
        )
        val expected = flatExpected.group { "a_renamed" and "b_renamed" }.into("group_renamed")

        val actual = groupedDf
            .rename { all().recursively() }
            .into { it.name + "_renamed" }

        actual shouldBe expected
    }

    @Test
    fun `double grouped rename in 3 steps`() {
        val flatExpected = dataFrameOf("a_renamed", "b_renamed", "c_renamed")(
            1, 2, 3,
            4, 5, 6,
        )
        val expected = flatExpected
            .group { "a_renamed" and "b_renamed" }.into("group_renamed")
            .group { "group_renamed"["a_renamed"] }.into { "group_renamed"["aGroup_renamed"] }

        val actual = doubleGroupedDf
            .rename { all().recursively() }
            .into { it.name + "_renamed" }

        actual shouldBe expected
    }

    companion object {
        // flat frame with columns a, b, c
        val simpleDf = dataFrameOf("a", "b", "c")(
            1, 2, 3,
            4, 5, 6,
        )

        // a and b moved into the column group "group"
        val groupedDf = simpleDf.group { "a" and "b" }.into("group")

        // group/a additionally moved into the nested group group/aGroup
        val doubleGroupedDf = groupedDf.group { "group"["a"] }.into { "group"["aGroup"] }
    }
}

class RenameToCamelCaseTests {
companion object {
val nestedDf = dataFrameOf("test_name")(dataFrameOf("another_name")(1))
val nestedColumnGroup = dataFrameOf("test_name")(
dataFrameOf("another_name")(1).first()
)
val doublyNestedColumnGroup = dataFrameOf("test_name")(
dataFrameOf("another_name")(
dataFrameOf("third_name")(1).first()
).first()
)
val deeplyNestedDf = kotlin.run {
val df = dataFrameOf("another_name")(1)
val rowWithDf = dataFrameOf("group_name")(df).first()
Expand All @@ -36,6 +101,20 @@ class RenameTests {
df.getColumnGroup("testName").columnNames() shouldBe listOf("anotherName")
}

@Test
fun `doubly nested row`() {
    // Built inside-out: each `.first()` takes a single row, which becomes the value
    // of the only column of the enclosing frame.
    val thirdLevelRow = dataFrameOf("third_name")(1).first()
    val secondLevelRow = dataFrameOf("another_name")(thirdLevelRow).first()
    val nestedFrame = dataFrameOf("test_name")(secondLevelRow)

    val renamed = nestedFrame.renameToCamelCase()

    // every nesting level must have been converted to camelCase
    renamed.columnNames() shouldBe listOf("testName")
    renamed["testName"].asColumnGroup().columnNames() shouldBe listOf("anotherName")
    renamed["testName"]["anotherName"].asColumnGroup().columnNames() shouldBe listOf("thirdName")
}

@Test
fun `deeply nested df`() {
val df = deeplyNestedDf.renameToCamelCase()
Expand Down
48 changes: 25 additions & 23 deletions core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/rename.kt
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@ import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.columns.ColumnAccessor
import org.jetbrains.kotlinx.dataframe.columns.ColumnReference
import org.jetbrains.kotlinx.dataframe.columns.ColumnWithPath
import org.jetbrains.kotlinx.dataframe.columns.FrameColumn
import org.jetbrains.kotlinx.dataframe.columns.toColumnSet
import org.jetbrains.kotlinx.dataframe.impl.DELIMITED_STRING_REGEX
import org.jetbrains.kotlinx.dataframe.impl.DELIMITERS_REGEX
import org.jetbrains.kotlinx.dataframe.impl.api.renameImpl
import org.jetbrains.kotlinx.dataframe.impl.columnName
import org.jetbrains.kotlinx.dataframe.impl.toCamelCaseByDelimiters
import org.jetbrains.kotlinx.dataframe.util.ITERABLE_COLUMNS_DEPRECATION_MESSAGE
Expand Down Expand Up @@ -42,47 +44,47 @@ public fun <T, C> DataFrame<T>.rename(cols: Iterable<ColumnReference<C>>): Renam

/**
 * Intermediate state of a rename operation.
 *
 * Holds the [DataFrame] whose columns are being renamed ([df]) together with the selector
 * of the [columns] that should receive new names. The rename is completed by calling one of
 * the `into` overloads or `toCamelCase` on this clause.
 */
public data class RenameClause<T, C>(val df: DataFrame<T>, val columns: ColumnsSelector<T, C>)

/**
* ## Rename to camelCase
*
* This function renames all columns to `camelCase` by replacing all [delimiters][DELIMITERS_REGEX]
* and converting the first char to lowercase.
* Even [DataFrames][DataFrame] inside [FrameColumns][FrameColumn] are traversed recursively.
*/
public fun <T> DataFrame<T>.renameToCamelCase(): DataFrame<T> = this
// recursively rename all column groups to camel case
// recursively rename all columns written with delimiters or starting with a capital to camel case
.rename {
groups { it.name() matches DELIMITED_STRING_REGEX }.recursively()
}.toCamelCase()

// recursively rename all other columns to camel case
.rename {
cols { !it.isColumnGroup() && it.name() matches DELIMITED_STRING_REGEX }.recursively()
cols { it.name() matches DELIMITED_STRING_REGEX || it.name[0].isUpperCase() }.recursively()
}.toCamelCase()

// take all frame columns recursively and call renameToCamelCase() on all dataframes inside
.update {
colsOf<AnyFrame>().recursively()
}.with { it.renameToCamelCase() }

// convert all first chars of all columns to the lowercase
.rename {
cols { !it.isColumnGroup() }.recursively()
}.into {
it.name.replaceFirstChar { it.lowercaseChar() }
}

/**
 * Renames the selected columns using the names of the given column references.
 *
 * Only the name of each reference in [newColumns] is used; the call is forwarded to the
 * `String`-based `into` overload.
 */
public fun <T, C> RenameClause<T, C>.into(vararg newColumns: ColumnReference<*>): DataFrame<T> {
    val referenceNames = Array(newColumns.size) { index -> newColumns[index].name() }
    return into(*referenceNames)
}

public fun <T, C> RenameClause<T, C>.into(vararg newNames: String): DataFrame<T> =
df.move(columns).intoIndexed { col, index ->
col.path.dropLast(1) + newNames[index]
}
renameImpl(newNames)

/**
 * Renames the selected columns using the names of the given properties.
 *
 * Only the `name` of each property in [newNames] is used; the call is forwarded to the
 * `String`-based `into` overload.
 */
public fun <T, C> RenameClause<T, C>.into(vararg newNames: KProperty<*>): DataFrame<T> {
    val propertyNames = Array(newNames.size) { index -> newNames[index].name }
    return into(*propertyNames)
}

public fun <T, C> RenameClause<T, C>.into(transform: (ColumnWithPath<C>) -> String): DataFrame<T> =
df.move(columns).into {
it.path.dropLast(1) + transform(it)
}

public fun <T, C> RenameClause<T, C>.toCamelCase(): DataFrame<T> =
into { it.name().toCamelCaseByDelimiters(DELIMITERS_REGEX) }
renameImpl(transform)

/**
 * ## Rename to camelCase
 *
 * Gives every selected column a `camelCase` name: the current name is converted with
 * `toCamelCaseByDelimiters` using [DELIMITERS_REGEX], after which its first character
 * is lowercased.
 */
public fun <T, C> RenameClause<T, C>.toCamelCase(): DataFrame<T> =
    into { column ->
        val camelCased = column.name().toCamelCaseByDelimiters(DELIMITERS_REGEX)
        camelCased.replaceFirstChar { first -> first.lowercaseChar() }
    }

// endregion

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ public fun <T, C> ReplaceClause<T, C>.with(newColumns: List<AnyCol>): DataFrame<
}
}

/* TODO: Issue #418: breaks if running on ColumnGroup and its child */
public fun <T, C> ReplaceClause<T, C>.with(transform: ColumnsContainer<T>.(DataColumn<C>) -> AnyBaseCol): DataFrame<T> {
val removeResult = df.removeImpl(columns = columns)
val toInsert = removeResult.removedColumns.map {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
package org.jetbrains.kotlinx.dataframe.impl.api

import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.api.RenameClause
import org.jetbrains.kotlinx.dataframe.api.cast
import org.jetbrains.kotlinx.dataframe.api.getColumnsWithPaths
import org.jetbrains.kotlinx.dataframe.api.insert
import org.jetbrains.kotlinx.dataframe.api.under
import org.jetbrains.kotlinx.dataframe.columns.ColumnKind
import org.jetbrains.kotlinx.dataframe.columns.ColumnWithPath
import org.jetbrains.kotlinx.dataframe.impl.columns.tree.allChildrenNotNull
import org.jetbrains.kotlinx.dataframe.impl.columns.tree.collectTree
import org.jetbrains.kotlinx.dataframe.kind

/**
 * Renames the selected columns to [newNames], where the name at index `i` is given to the
 * `i`-th column in *selection* order.
 *
 * Each name is bound to the original path of its column before renaming starts, so the
 * result does not depend on the order in which the transform-based `renameImpl` overload
 * visits the columns. A running counter would hand names out in visiting order instead,
 * which mis-assigns them whenever the selection order differs from the column order of
 * the frame (e.g. `rename { "b" and "a" }.into("x", "y")`).
 *
 * Surplus names are ignored.
 *
 * @param newNames new column names, positionally matching the selected columns.
 * @return a [DataFrame] in which the selected columns carry their new names.
 * @throws IllegalArgumentException if fewer names than selected columns are supplied.
 */
internal fun <T, C> RenameClause<T, C>.renameImpl(newNames: Array<out String>): DataFrame<T> {
    val selectedPaths = df.getColumnsWithPaths(columns).map { it.path }
    require(newNames.size >= selectedPaths.size) {
        "Expected at least ${selectedPaths.size} new column names, but got ${newNames.size}"
    }

    // original path -> new name, fixed before any column is renamed
    val newNameByPath = selectedPaths.zip(newNames).toMap()

    return renameImpl { column -> newNameByPath[column.path] ?: column.name }
}

internal fun <T, C> RenameClause<T, C>.renameImpl(transform: (ColumnWithPath<C>) -> String): DataFrame<T> {
val selectedColumns = df.getColumnsWithPaths(columns)
val tree = df.getColumnsWithPaths { all().rec() }.collectTree()

// perform rename in nodes
tree.allChildrenNotNull().forEach { node ->
val column = selectedColumns.find { it.data == node.data } ?: return@forEach
Copy link
Collaborator

@koperagen koperagen Jun 29, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's `data` here? From the `collectTree` definition it seems it's a `DataColumn`, and in that case `equals` will iterate over the elements of the column in the worst case. That is quite a heavy operation, although it probably shouldn't happen in this algorithm. Can you double-check and maybe provide a comment with justification?

Copy link
Collaborator

@koperagen koperagen Jun 29, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it possible that there are two identical columns under different column groups and the second one will never be renamed?

Edit: looks like no, because you iterate over all tree nodes.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

data is a column with no path anymore (collectTree() removes it and ColumnWithPath.data yields the original column). So to get that path back I need to find it in selectedColumns (I'll change this to a Map actually). I would like to use node.pathFromRoot() but unfortunately, the tree might already have had some node renamed, so this would generate wrong results.

val newName = transform(column)
node.name = newName
}

// build up a new DataFrame using the modified names
var newDf = DataFrame.empty(df.rowsCount()).cast<T>()
tree.allChildrenNotNull().forEach { node ->
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this tree structure used in other places to create dataframes? Is there existing code to re-create a df from it? The approach with `insert`/`under` looks suspicious because it's a "mutation" of an immutable data structure in a loop.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There doesn't appear to be such a thing. Usually dataframes are built from the ground up with columns recursively. I'll try to create a sort-of mapping function for TreeNode with which I can create a new dataframe with columns I create on the fly

val path = node.pathFromRoot().dropLast(1)
val col = node.data.rename(node.name)

when (col.kind) {
ColumnKind.Value, ColumnKind.Frame ->
newDf = newDf.insert(col).under(path)

ColumnKind.Group -> Unit
}
}

return newDf
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ package org.jetbrains.kotlinx.dataframe.impl.columns.tree
import org.jetbrains.kotlinx.dataframe.columns.ColumnPath

internal class TreeNode<T>(
override val name: String,
override var name: String,
override val depth: Int,
override var data: T,
override val parent: TreeNode<T>? = null,
Expand Down
Loading