Merge pull request #86 from stemangiola/speedup-aggregate_cells

stemangiola · web-flow · commit f38d19038e29 · 2023-08-25T21:33:23.000+10:00
Speedup aggregate cells
diff --git a/NAMESPACE b/NAMESPACE
@@ -49,6 +49,7 @@ importFrom(Matrix,rowSums)
 importFrom(S4Vectors,"metadata<-")
 importFrom(S4Vectors,DataFrame)
 importFrom(S4Vectors,metadata)
+importFrom(S4Vectors,split)
 importFrom(SingleCellExperiment,cbind)
 importFrom(SingleCellExperiment,reducedDims)
 importFrom(SummarizedExperiment,"assays<-")
@@ -69,6 +70,7 @@ importFrom(dplyr,filter)
 importFrom(dplyr,full_join)
 importFrom(dplyr,group_by)
 importFrom(dplyr,group_by_drop_default)
+importFrom(dplyr,group_split)
 importFrom(dplyr,inner_join)
 importFrom(dplyr,left_join)
 importFrom(dplyr,mutate)
@@ -123,6 +125,7 @@ importFrom(rlang,quo_squash)
 importFrom(stats,setNames)
 importFrom(stringr,regex)
 importFrom(stringr,str_detect)
+importFrom(stringr,str_remove)
 importFrom(stringr,str_replace_all)
 importFrom(tibble,as_tibble)
 importFrom(tibble,enframe)
diff --git a/R/methods.R b/R/methods.R
@@ -39,7 +39,7 @@ setMethod("join_features", "SingleCellExperiment", function(.data,
     # CRAN Note
     .cell <- NULL
     .feature <- NULL
-    
+
     # Shape is long
     if (shape == "long") {
         .data %>%
@@ -50,11 +50,11 @@ setMethod("join_features", "SingleCellExperiment", function(.data,
                 features=features,
                 all=all,
                 exclude_zeros=exclude_zeros)) %>%
-        select(!!c_(.data)$symbol, .feature, 
+        select(!!c_(.data)$symbol, .feature,
             contains(".abundance"), everything())
     # Shape if wide
     } else {
-        .data  %>% 
+        .data  %>%
             left_join(
                 by=c_(.data)$name,
                 get_abundance_sc_wide(
@@ -84,7 +84,7 @@ tidy <- function(object) {
 #' @importFrom lifecycle deprecate_warn
 #' @export
 tidy.SingleCellExperiment <- function(object) {
-    
+
     # DEPRECATE
     deprecate_warn(
         when="1.1.1",
@@ -98,8 +98,8 @@ tidy.SingleCellExperiment <- function(object) {
 #' @rdname aggregate_cells
 #' @inherit ttservice::aggregate_cells
 #' @aliases aggregate_cells,SingleCellExperiment-method
-#' 
-#' @examples 
+#'
+#' @examples
 #' data(pbmc_small)
 #' pbmc_small_pseudo_bulk <- pbmc_small |>
 #'   aggregate_cells(c(groups, ident), assays="counts")
@@ -110,48 +110,88 @@ tidy.SingleCellExperiment <- function(object) {
 #' @importFrom Matrix rowSums
 #' @importFrom ttservice aggregate_cells
 #' @importFrom SummarizedExperiment assays assays<- assayNames
+#' @importFrom S4Vectors split
+#' @importFrom stringr str_remove
+#' @importFrom dplyr group_split
+#'
+#'
 #' @export
 setMethod("aggregate_cells", "SingleCellExperiment", function(.data,
-    .sample=NULL, slot="data", assays=NULL, 
+    .sample=NULL, slot="data", assays=NULL,
     aggregation_function=Matrix::rowSums,
     ...) {
-    
+
     # Fix NOTEs
     feature <- NULL
     .sample <- enquo(.sample)
-    
+
     # Subset only wanted assays
     if (!is.null(assays)) {
         assays(.data) <- assays(.data)[assays]
     }
-    
-    .data %>%
-        nest(data=-!!.sample) %>%
-        mutate(.aggregated_cells=as.integer(map(data, ~ ncol(.x)))) %>% 
-        mutate(
-            data=map(data, ~ {
-                # Loop over assays
-                map2(as.list(assays(.x)), assayNames(.x), ~ {
-                    # Get counts
-                    .x %>%
-                        aggregation_function(na.rm=TRUE) %>%
-                        enframe(
-                            name ="feature",
-                            value=sprintf("%s", .y)) %>%
-                        mutate(feature=as.character(feature))
-                }) %>% 
-                Reduce(function(...) full_join(..., by="feature"), .)
-            })
-        ) %>%
+
+
+    grouping_factor =
+      .data |>
+      colData() |>
+      as_tibble() |>
+      select(!!.sample) |>
+      suppressMessages() |>
+      unite("my_id_to_split_by___", !!.sample, sep = "___") |>
+      pull(my_id_to_split_by___) |>
+      as.factor()
+
+    list_count_cells = table(grouping_factor) |> as.list()
+
+    # New method
+    list_assays =
+      .data |>
+      assays() |>
+      as.list() |>
+      map(~ .x |> splitColData(grouping_factor)) |>
+      unlist(recursive=FALSE)
+
+    list_assays =
+      list_assays |>
+      map2(names(list_assays), ~ {
+        # Get counts
+        .x %>%
+          aggregation_function(na.rm=TRUE) %>%
+          enframe(
+            name =".feature",
+            value="x") %>% # sprintf("%s", .y)) %>%
+
+          # In case we don't have rownames
+          mutate(.feature=as.character(.feature))
+      }) |>
+      enframe(name = ".sample") |>
+
+      # Clean groups
+      mutate(assay_name = assayNames(!!.data) |> rep(each=length(levels(grouping_factor)))) |>
+      mutate(.sample = .sample |> str_remove(assay_name) |> str_remove("\\.")) |>
+      group_split(.sample) |>
+      map(~ .x |>  unnest(value) |> pivot_wider(names_from = assay_name, values_from = x) ) |>
+
+      # Add cell count
+      map2(
+        list_count_cells,
+        ~ .x |> mutate(.aggregated_cells = .y)
+      )
+
+
+    do.call(rbind, list_assays) |>
+
         left_join(
-            .data %>% 
-                as_tibble() %>% 
-                subset(!!.sample), 
-            by=quo_names(.sample)) %>%
-        unnest(data) %>%
-        drop_class("tidySingleCellExperiment_nested") %>%
+          .data |>
+            colData() |>
+            as_tibble() |>
+            subset(!!.sample) |>
+            unite("my_id_to_split_by___", !!.sample, remove=FALSE, sep = "___"),
+            by= join_by(".sample" == "my_id_to_split_by___")
+        ) |>
+
         as_SummarizedExperiment(
-            .sample=!!.sample, 
-            .transcript=feature, 
+            .sample=.sample,
+            .transcript=.feature,
             .abundance=!!as.symbol(names(.data@assays)))
 })
diff --git a/R/utilities.R b/R/utilities.R
@@ -365,7 +365,6 @@ get_special_column_name_cell <- function(name) {
     list(name=name, symbol=as.symbol(name))
 }
 
-cell__ <- get_special_column_name_symbol(".cell")
 
 #' @importFrom S4Vectors metadata
 c_ <- function(x) {
@@ -442,12 +441,17 @@ get_specific_annotation_columns <- function(.data, .col) {
     # x-annotation df
     n_x <- .data |> distinct_at(vars(!!.col)) |> nrow()
     
+    # Exclude columns that have more values than my .col
+    columns_unique_length = .data |> select(-!!.col) |> lapply(function(x) unique(x) |> length())
+    columns_unique_length = columns_unique_length[columns_unique_length<=n_x]
+    
+    .sample = .data |> select(!!.col) |> unite(".sample", !!.col) |> pull(.sample)
+    
     # element wise columns
-    .data |>
-        select(-!!.col) |>
-        colnames() |>
+    columns_unique_length |>
+      names() |> 
         map(~ {
-            n_.x <- .data |> distinct_at(vars(!!.col, .x)) |> nrow()
+            n_.x <- .data |> pull(all_of(.x)) |> paste(.sample)  |> unique() |> length()
             if (n_.x == n_x) .x else NULL
         }) %>%
         # Drop NULL
@@ -482,5 +486,23 @@ subset <- function(.data, .column)	{
         distinct()
 }
 
+
+splitColData <- function(x, f) {
+  # This is by @jma1991 
+  # at https://github.com/drisso/SingleCellExperiment/issues/55
+  
+  i <- split(seq_along(f), f)
+  
+  v <- vector(mode = "list", length = length(i))
+  
+  names(v) <- names(i)
+  
+  for (n in names(i)) { v[[n]] <- x[, i[[n]]] }
+  
+  return(v)
+  
+}
+
+cell__ <- get_special_column_name_symbol(".cell")
 feature__ <- get_special_column_name_symbol(".feature")
 sample__ <- get_special_column_name_symbol(".sample")