Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# dplyr (development version)

* `n()` is now a little faster when there are many groups (#6727).

* `if_else()` and `case_when()` again accept logical conditions that have
attributes (#6678).

Expand Down
2 changes: 1 addition & 1 deletion R/conditions.R
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ or_1 <- function(x) {
}

has_active_group_context <- function(mask) {
mask$get_current_group() != 0L
mask$get_current_group_id_mutable() != 0L
}

# Common ------------------------------------------------------------------
Expand Down
7 changes: 2 additions & 5 deletions R/context.R
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ NULL
#' @rdname context
#' @export
n <- function() {
length(peek_mask()$current_rows())
peek_mask()$get_current_group_size()
}

#' @rdname context
Expand All @@ -60,10 +60,7 @@ cur_group <- function() {
#' @rdname context
#' @export
cur_group_id <- function() {
# [] to get a copy because the current group is dealt with internally
# if we don't get a copy, code like this won't give correct result:
# summarise(id = cur_group_id())
peek_mask()$get_current_group()[]
peek_mask()$get_current_group_id()
}

#' @rdname context
Expand Down
77 changes: 64 additions & 13 deletions R/data-mask.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,25 @@ DataMask <- R6Class("DataMask",
private$grouped <- by$type == "grouped"
private$rowwise <- by$type == "rowwise"

private$chops <- .Call(dplyr_lazy_vec_chop_impl, data, rows, private$grouped, private$rowwise)
private$env_mask_bindings <- .Call(dplyr_make_mask_bindings, private$chops, data)
private$env_current_group_info <- new_environment(data = list(
`dplyr:::current_group_id` = 0L,
`dplyr:::current_group_size` = 0L
))

private$chops <- .Call(
dplyr_lazy_vec_chop_impl,
data,
rows,
private$env_current_group_info,
private$grouped,
private$rowwise
)

private$env_mask_bindings <- .Call(
dplyr_make_mask_bindings,
private$chops,
data
)

private$keys <- group_keys0(by$data)
private$by_names <- by$names
Expand Down Expand Up @@ -88,16 +105,15 @@ DataMask <- R6Class("DataMask",
})
}

size <- length(self$current_rows())
dplyr_new_tibble(cols, size = size)
dplyr_new_tibble(cols, size = self$get_current_group_size_mutable())
},

current_cols = function(vars) {
env_get_list(private$env_mask_bindings, vars)
},

current_rows = function() {
private$rows[[self$get_current_group()]]
private$rows[[self$get_current_group_id_mutable()]]
},

current_key = function() {
Expand All @@ -109,7 +125,7 @@ DataMask <- R6Class("DataMask",
# to do `vec_slice(<0-row-df>, 1L)`, which is an error.
keys
} else {
vec_slice(keys, self$get_current_group())
vec_slice(keys, self$get_current_group_id_mutable())
}
},

Expand All @@ -121,12 +137,44 @@ DataMask <- R6Class("DataMask",
setdiff(self$current_vars(), private$by_names)
},

get_current_group = function() {
parent.env(private$chops)$.current_group
# This pair of functions provides access to `dplyr:::current_group_id`.
# - `dplyr:::current_group_id` is modified by reference at the C level.
# - If you access it ephemerally, the mutable version can be used.
# - If you access it persistently, like in `cur_group_id()`, it must be
# duplicated on the way out.
# - For maximal performance, we inline the mutable function definition into
# the non-mutable version.
get_current_group_id = function() {
duplicate(private[["env_current_group_info"]][["dplyr:::current_group_id"]])
},
get_current_group_id_mutable = function() {
private[["env_current_group_info"]][["dplyr:::current_group_id"]]
},

# This pair of functions provides access to `dplyr:::current_group_size`.
# - `dplyr:::current_group_size` is modified by reference at the C level.
# - If you access it ephemerally, the mutable version can be used.
# - If you access it persistently, like in `n()`, it must be duplicated on
# the way out.
# - For maximal performance, we inline the mutable function definition into
# the non-mutable version.
get_current_group_size = function() {
duplicate(private[["env_current_group_info"]][["dplyr:::current_group_size"]])
},
get_current_group_size_mutable = function() {
private[["env_current_group_info"]][["dplyr:::current_group_size"]]
},

set_current_group = function(group) {
parent.env(private$chops)$.current_group[] <- group
# Only to be used right before throwing an error.
# We `duplicate()` group to be extremely conservative, because there is an
# extremely small chance we could modify this by reference and cause
# issues with the `group` variable in the caller, but this has never been
# seen. `length()` always returns a fresh variable so we don't duplicate
# in that case.
env_current_group_info <- private[["env_current_group_info"]]
env_current_group_info[["dplyr:::current_group_id"]] <- duplicate(group)
env_current_group_info[["dplyr:::current_group_size"]] <- length(private$rows[[group]])
},

get_used = function() {
Expand Down Expand Up @@ -203,12 +251,15 @@ DataMask <- R6Class("DataMask",
private = list(
# environment that contains lazy vec_chop()s for each input column
# and list of result chunks as they get added.
#
# The parent environment of chops has:
# - .indices: the list of indices
# - .current_group: scalar integer that identifies the current group
chops = NULL,

# Environment which contains the:
# - Current group id
# - Current group size
# Both of which are updated by reference at the C level.
# This environment is the parent environment of `chops`.
env_current_group_info = NULL,

# Environment with active bindings for each column.
# Expressions are evaluated in a fresh data mask created from this
# environment. Each group gets its own newly created data mask to avoid
Expand Down
30 changes: 16 additions & 14 deletions src/chop.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,31 +66,33 @@ void dplyr_lazy_vec_chop_ungrouped(SEXP chops_env, SEXP data) {
UNPROTECT(1);
}

SEXP dplyr_lazy_vec_chop(SEXP data, SEXP rows, SEXP ffi_grouped, SEXP ffi_rowwise) {
SEXP dplyr_lazy_vec_chop(SEXP data,
SEXP rows,
SEXP env_current_group_info,
SEXP ffi_grouped,
SEXP ffi_rowwise) {
bool grouped = static_cast<bool>(LOGICAL_ELT(ffi_grouped, 0));
bool rowwise = static_cast<bool>(LOGICAL_ELT(ffi_rowwise, 0));

// a first environment to hide `.indices` and `.current_group`
// this is for example used by funs::
SEXP indices_env = PROTECT(new_environment(2, R_EmptyEnv));
Rf_defineVar(dplyr::symbols::dot_indices, rows, indices_env);
Rf_defineVar(dplyr::symbols::dot_current_group, Rf_ScalarInteger(0), indices_env);
// An environment to hold the chops of the columns.
// Parent environment contains information about current group id
// and current group size, for use in mask binding evaluation.
SEXP env_chops = PROTECT(new_environment(XLENGTH(data), env_current_group_info));

// then an environment to hold the chops of the columns
SEXP chops_env = PROTECT(new_environment(XLENGTH(data), indices_env));
if (grouped) {
dplyr_lazy_vec_chop_grouped(chops_env, rows, data, false);
dplyr_lazy_vec_chop_grouped(env_chops, rows, data, false);
} else if (rowwise) {
dplyr_lazy_vec_chop_grouped(chops_env, rows, data, true);
dplyr_lazy_vec_chop_grouped(env_chops, rows, data, true);
} else {
dplyr_lazy_vec_chop_ungrouped(chops_env, data);
dplyr_lazy_vec_chop_ungrouped(env_chops, data);
}
UNPROTECT(2);
return chops_env;

UNPROTECT(1);
return env_chops;
}

void add_mask_binding(SEXP name, SEXP env_mask_bindings, SEXP env_chops) {
SEXP body = PROTECT(Rf_lang3(dplyr::functions::dot_subset2, name, dplyr::symbols::dot_current_group));
SEXP body = PROTECT(Rf_lang3(dplyr::functions::dot_subset2, name, dplyr::symbols::current_group_id));
SEXP fun = PROTECT(Rf_lang3(dplyr::functions::function, R_NilValue, body));
SEXP binding = PROTECT(Rf_eval(fun, env_chops));
R_MakeActiveBinding(name, binding, env_mask_bindings);
Expand Down
39 changes: 23 additions & 16 deletions src/dplyr.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,21 +37,22 @@ struct symbols {
static SEXP groups;
static SEXP levels;
static SEXP ptype;
static SEXP dot_current_group;
static SEXP current_group_id;
static SEXP current_group_size;
static SEXP current_expression;
static SEXP rows;
static SEXP caller;
static SEXP current_data;
static SEXP dot_drop;
static SEXP dplyr_internal_error;
static SEXP dplyr_internal_signal;
static SEXP dot_indices;
static SEXP chops;
static SEXP vec_is_list;
static SEXP new_env;
static SEXP dot_data;
static SEXP used;
static SEXP across;
static SEXP env_current_group_info;
static SEXP env_mask_bindings;
};

Expand Down Expand Up @@ -112,27 +113,32 @@ SEXP dplyr_group_keys(SEXP group_data);
SEXP dplyr_mask_binding_remove(SEXP env_private, SEXP s_name);
SEXP dplyr_mask_binding_add(SEXP env_private, SEXP s_name, SEXP ptype, SEXP chunks);

SEXP dplyr_lazy_vec_chop(SEXP data, SEXP rows, SEXP ffi_grouped, SEXP ffi_rowwise);
SEXP dplyr_lazy_vec_chop(SEXP data, SEXP rows, SEXP env_current_group_info, SEXP ffi_grouped, SEXP ffi_rowwise);
SEXP dplyr_make_mask_bindings(SEXP chops, SEXP data);
SEXP env_resolved(SEXP env, SEXP names);
void add_mask_binding(SEXP name, SEXP env_mask_bindings, SEXP env_chops);

SEXP dplyr_extract_chunks(SEXP df_list, SEXP df_ptype);

#define DPLYR_MASK_INIT() \
SEXP rows = PROTECT(Rf_findVarInFrame(env_private, dplyr::symbols::rows)); \
R_xlen_t ngroups = XLENGTH(rows); \
SEXP caller = PROTECT(Rf_findVarInFrame(env_private, dplyr::symbols::caller)); \
SEXP env_mask_bindings = PROTECT(Rf_findVarInFrame(env_private, dplyr::symbols::env_mask_bindings)); \
SEXP pronoun = PROTECT(rlang::as_data_pronoun(env_mask_bindings)); \
SEXP chops_env = PROTECT(Rf_findVarInFrame(env_private, dplyr::symbols::chops)); \
SEXP current_group = PROTECT(Rf_findVarInFrame(ENCLOS(chops_env), dplyr::symbols::dot_current_group)); \
int* p_current_group = INTEGER(current_group); \
*p_current_group = 0
#define DPLYR_MASK_INIT() \
SEXP rows = PROTECT(Rf_findVarInFrame(env_private, dplyr::symbols::rows)); \
const SEXP* v_rows = VECTOR_PTR_RO(rows); \
R_xlen_t ngroups = XLENGTH(rows); \
SEXP caller = PROTECT(Rf_findVarInFrame(env_private, dplyr::symbols::caller)); \
SEXP env_mask_bindings = PROTECT(Rf_findVarInFrame(env_private, dplyr::symbols::env_mask_bindings)); \
SEXP pronoun = PROTECT(rlang::as_data_pronoun(env_mask_bindings)); \
SEXP env_current_group_info = PROTECT(Rf_findVarInFrame(env_private, dplyr::symbols::env_current_group_info)); \
SEXP current_group_id = PROTECT(Rf_findVarInFrame(env_current_group_info, dplyr::symbols::current_group_id)); \
int* p_current_group_id = INTEGER(current_group_id); \
*p_current_group_id = 0; \
SEXP current_group_size = PROTECT(Rf_findVarInFrame(env_current_group_info, dplyr::symbols::current_group_size)); \
int* p_current_group_size = INTEGER(current_group_size); \
*p_current_group_size = 0

#define DPLYR_MASK_FINALISE() \
UNPROTECT(6); \
*p_current_group = 0
UNPROTECT(7); \
*p_current_group_id = 0; \
*p_current_group_size = 0

// At each iteration, we create a fresh data mask so that lexical side effects,
// such as using `<-` in a `mutate()`, don't persist between groups
Expand All @@ -144,7 +150,8 @@ SEXP dplyr_extract_chunks(SEXP df_list, SEXP df_ptype);
UNPROTECT(1)

#define DPLYR_MASK_SET_GROUP(INDEX) \
*p_current_group = INDEX + 1
*p_current_group_id = INDEX + 1; \
*p_current_group_size = Rf_xlength(v_rows[INDEX])

#define DPLYR_MASK_EVAL(quo) \
rlang::eval_tidy(quo, mask, caller)
Expand Down
7 changes: 4 additions & 3 deletions src/init.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,21 +42,22 @@ SEXP get_names_summarise_recycle_chunks(){
SEXP symbols::ptype = Rf_install("ptype");
SEXP symbols::levels = Rf_install("levels");
SEXP symbols::groups = Rf_install("groups");
SEXP symbols::dot_current_group = Rf_install(".current_group");
SEXP symbols::current_group_id = Rf_install("dplyr:::current_group_id");
SEXP symbols::current_group_size = Rf_install("dplyr:::current_group_size");
SEXP symbols::current_expression = Rf_install("current_expression");
SEXP symbols::rows = Rf_install("rows");
SEXP symbols::caller = Rf_install("caller");
SEXP symbols::current_data = Rf_install("current_data");
SEXP symbols::dot_drop = Rf_install(".drop");
SEXP symbols::dplyr_internal_error = Rf_install("dplyr_internal_error");
SEXP symbols::dplyr_internal_signal = Rf_install("dplyr_internal_signal");
SEXP symbols::dot_indices = Rf_install(".indices");
SEXP symbols::chops = Rf_install("chops");
SEXP symbols::vec_is_list = Rf_install("vec_is_list");
SEXP symbols::new_env = Rf_install("new.env");
SEXP symbols::dot_data = Rf_install(".data");
SEXP symbols::used = Rf_install("used");
SEXP symbols::across = Rf_install("across");
SEXP symbols::env_current_group_info = Rf_install("env_current_group_info");
SEXP symbols::env_mask_bindings = Rf_install("env_mask_bindings");

SEXP vectors::classes_vctrs_list_of = get_classes_vctrs_list_of();
Expand Down Expand Up @@ -114,7 +115,7 @@ static const R_CallMethodDef CallEntries[] = {
{"dplyr_mask_binding_remove", (DL_FUNC)& dplyr_mask_binding_remove, 2},
{"dplyr_mask_binding_add", (DL_FUNC)& dplyr_mask_binding_add, 4},

{"dplyr_lazy_vec_chop_impl", (DL_FUNC)& dplyr_lazy_vec_chop, 4},
{"dplyr_lazy_vec_chop_impl", (DL_FUNC)& dplyr_lazy_vec_chop, 5},
{"dplyr_make_mask_bindings", (DL_FUNC)& dplyr_make_mask_bindings, 2},
{"env_resolved", (DL_FUNC)& env_resolved, 2},

Expand Down