r-lib · MichaelChirico · Nov 19, 2023 · Nov 17, 2023 · Nov 17, 2023 · Nov 18, 2023
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -138,6 +138,7 @@ Collate:
     'namespace_linter.R'
     'nested_ifelse_linter.R'
     'nonportable_path_linter.R'
+    'nrow_subset_linter.R'
     'numeric_leading_zero_linter.R'
     'object_length_linter.R'
     'object_name_linter.R'

diff --git a/NAMESPACE b/NAMESPACE
@@ -100,6 +100,7 @@ export(namespace_linter)
 export(nested_ifelse_linter)
 export(no_tab_linter)
 export(nonportable_path_linter)
+export(nrow_subset_linter)
 export(numeric_leading_zero_linter)
 export(object_length_linter)
 export(object_name_linter)

diff --git a/NEWS.md b/NEWS.md
@@ -15,6 +15,7 @@
 * `comparison_negation_linter()` for discouraging negated comparisons when a direct negation is preferable, e.g. `!(x == y)` could be `x != y` (part of #884, @MichaelChirico).
 * `terminal_close_linter()` for discouraging using `close()` to end functions (part of #884, @MichaelChirico). Such usages are not robust to errors, where `close()` will not be run as intended. Put `close()` in an `on.exit()` hook, or use {withr} to manage connections with proper cleanup.
 * `print_linter()` for discouraging usage of `print()` on string literals like `print("Reached here")` or `print(paste("Found", nrow(DF), "rows."))` (#1894, @MichaelChirico).
+* `nrow_subset_linter()` for discouraging usage like `nrow(subset(x, conditions))` in favor of something like `with(x, sum(conditions))` which doesn't require a full subset of `x` (part of #884, @MichaelChirico).
 
 ### Lint accuracy fixes: removing false positives
 

diff --git a/R/nrow_subset_linter.R b/R/nrow_subset_linter.R
@@ -0,0 +1,43 @@
+#' Block usage of `nrow(subset(x, .))`
+#'
+#' Using `nrow(subset(x, condition))` to count the instances where `condition`
+#'   applies is inefficient: it requires doing a full subset of `x` just to
+#'   count the number of rows in the resulting subset.
+#' There are a number of equivalent expressions that don't require the full
+#'   subset, e.g. `with(x, sum(condition))` (or, more generically,
+#'   `with(x, sum(condition, na.rm = TRUE))`).
+#' The same can be said of other versions of this like `nrow(DT[(condition)])`
+#'   for a `data.table` or `DT %>% filter(condition) %>% nrow()`, though only
+#'   the `nrow(subset(x, condition))` form is currently detected.
+#'
+#' @examples
+#' # will produce lints
+#' lint(
+#'   text = "nrow(subset(x, is_treatment))",
+#'   linters = nrow_subset_linter()
+#' )
+#'
+#' # okay
+#' lint(
+#'   text = "with(x, sum(is_treatment, na.rm = TRUE))",
+#'   linters = nrow_subset_linter()
+#' )
+#'
+#' @evalRd rd_tags("nrow_subset_linter")
+#' @seealso [linters] for a complete list of linters available in lintr.
+#' @export
+nrow_subset_linter <- make_linter_from_xpath(
+  xpath = "
+  //SYMBOL_FUNCTION_CALL[text() = 'subset']
+    /parent::expr
+    /parent::expr
+    /parent::expr[expr/SYMBOL_FUNCTION_CALL[text() = 'nrow']]
+  ",
+  lint_message = paste(
+    "Use arithmetic to count the number of rows satisfying a condition,",
+    "rather than fully subsetting the table and counting the resulting rows.",
+    "For example, replace nrow(subset(x, is_treatment))",
+    "with sum(x$is_treatment). NB: use na.rm = TRUE if `is_treatment` has",
+    "missing values."
+  )
+)
diff --git a/inst/lintr/linters.csv b/inst/lintr/linters.csv
@@ -57,6 +57,7 @@ namespace_linter,correctness robustness configurable executing
 nested_ifelse_linter,efficiency readability
 no_tab_linter,style consistency deprecated
 nonportable_path_linter,robustness best_practices configurable
+nrow_subset_linter,efficiency consistency readability best_practices
 numeric_leading_zero_linter,style consistency readability
 object_length_linter,style readability default configurable executing
 object_name_linter,style consistency default configurable executing

diff --git a/man/best_practices_linters.Rd b/man/best_practices_linters.Rd
diff --git a/man/consistency_linters.Rd b/man/consistency_linters.Rd
diff --git a/man/efficiency_linters.Rd b/man/efficiency_linters.Rd
diff --git a/man/linters.Rd b/man/linters.Rd
diff --git a/man/nrow_subset_linter.Rd b/man/nrow_subset_linter.Rd
diff --git a/man/readability_linters.Rd b/man/readability_linters.Rd
diff --git a/tests/testthat/test-nrow_subset_linter.R b/tests/testthat/test-nrow_subset_linter.R
@@ -0,0 +1,56 @@
+# TODO(michaelchirico): activate this false-positive test once the cases below are implemented.
+# test_that("nrow_subset_linter skips allowed usages", {
+#   # nrow can be avoided here (by chaining the expression and using .N),
+#   #   but the benefit of doing so is not the same as in the other cases.
+#   lintr::expect_lint(
+#     "nrow(DT[x == y, 1, by = grp])",
+#     NULL,
+#     nrow_subset_linter()
+#   )
+# })
+
+test_that("nrow_subset_linter blocks subset() cases", {
+  expect_lint(
+    "nrow(subset(x, y == z))",
+    rex::rex("Use arithmetic to count the number of rows satisfying a condition"),
+    nrow_subset_linter()
+  )
+
+  # TODO(michaelchirico): implement this; the piped form is not linted yet.
+  # lintr::expect_lint(
+  #   "x %>% subset(y == z) %>% nrow()",
+  #   "Use arithmetic to count the number of rows satisfying a condition",
+  #   nrow_subset_linter()
+  # )
+})
+
+# TODO(michaelchirico): implement these.
+# test_that("nrow_subset_linter blocks [ cases", {
+#   # data.frame subsetting (NB: replacement doesn't use na.rm = TRUE)
+#   lintr::expect_lint(
+#     "nrow(x[x$y == x$z, ])",
+#     "Use arithmetic to count the number of rows satisfying a condition",
+#     nrow_subset_linter()
+#   )
+
+#   # data.table subsetting (NB: replacement needs na.rm = TRUE)
+#   lintr::expect_lint(
+#     "nrow(x[y == z, ])",
+#     "Use arithmetic to count the number of rows satisfying a condition",
+#     nrow_subset_linter()
+#   )
+# })
+
+# test_that("nrow_subset_linter blocks dplyr::filter() cases", {
+#   lintr::expect_lint(
+#     "x %>% filter(y == z) %>% nrow()",
+#     "Use arithmetic to count the number of rows satisfying a condition",
+#     nrow_subset_linter()
+#   )
+
+#   lintr::expect_lint(
+#     "nrow(dplyr::filter(x, y == z))",
+#     "Use arithmetic to count the number of rows satisfying a condition",
+#     nrow_subset_linter()
+#   )
+# })