From ad431dc56964e47004b1520193a26b7ce7bcbd1f Mon Sep 17 00:00:00 2001 From: Hadley Wickham Date: Fri, 21 Jan 2022 12:14:32 -0600 Subject: [PATCH] Optionally extract a group with str_extract() Fixes #420 --- NEWS.md | 3 +++ R/extract.r | 13 +++++++++++-- man/str_extract.Rd | 9 ++++++++- tests/testthat/test-extract.r | 5 +++++ 4 files changed, 27 insertions(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index 7afaf228..faf2e270 100644 --- a/NEWS.md +++ b/NEWS.md @@ -39,6 +39,9 @@ * New `str_equal()` compares two character vectors using unicode rules, and optionally ignores case (#381). + +* `str_extract()` can now optionally extract a capturing group instead of + the complete match (#420). * New `str_split_1()` is tailored for the special case of splitting up a single string (#409). diff --git a/R/extract.r b/R/extract.r index 02e92965..18892a86 100644 --- a/R/extract.r +++ b/R/extract.r @@ -4,6 +4,8 @@ #' #' @inheritParams str_detect #' @return A character vector. +#' @param group If supplied, instead of returning the complete match, will +#' return the matched text from the specified capturing group. #' @seealso [str_match()] to extract matched groups; #' [stringi::stri_extract()] for the underlying implementation. #' @param simplify If `FALSE`, the default, returns a list of character @@ -16,6 +18,10 @@ #' str_extract(shopping_list, "[a-z]{1,4}") #' str_extract(shopping_list, "\\b[a-z]{1,4}\\b") #' +#' str_extract(shopping_list, "([a-z]+) of ([a-z]+)") +#' str_extract(shopping_list, "([a-z]+) of ([a-z]+)", group = 1) +#' str_extract(shopping_list, "([a-z]+) of ([a-z]+)", group = 2) +#' #' # Extract all matches #' str_extract_all(shopping_list, "[a-z]+") #' str_extract_all(shopping_list, "\\b[a-z]+\\b") @@ -27,9 +33,12 @@ #' #' # Extract all words #' str_extract_all("This is, suprisingly, a sentence.", boundary("word")) -str_extract <- function(string, pattern) { - check_lengths(string, pattern) +str_extract <- function(string, pattern, group = NULL) { + if (!is.null(group)) { + return(str_match(string, pattern)[, group + 1]) + } + check_lengths(string, pattern) switch(type(pattern), empty = stri_extract_first_boundaries(string, pattern, opts_brkiter = opts(pattern)), bound = stri_extract_first_boundaries(string, pattern, opts_brkiter = opts(pattern)), diff --git a/man/str_extract.Rd b/man/str_extract.Rd index 9861f269..32a7cce0 100644 --- a/man/str_extract.Rd +++ b/man/str_extract.Rd @@ -5,7 +5,7 @@ \alias{str_extract_all} \title{Extract matching patterns from a string} \usage{ -str_extract(string, pattern) +str_extract(string, pattern, group = NULL) str_extract_all(string, pattern, simplify = FALSE) } @@ -27,6 +27,9 @@ Match character, word, line and sentence boundaries with \code{\link[=boundary]{boundary()}}. An empty pattern, "", is equivalent to \code{boundary("character")}.} +\item{group}{If supplied, instead of returning the complete match, will +return the matched text from the specified capturing group.} + \item{simplify}{If \code{FALSE}, the default, returns a list of character vectors. If \code{TRUE} returns a character matrix.} } @@ -43,6 +46,10 @@ str_extract(shopping_list, "[a-z]+") str_extract(shopping_list, "[a-z]{1,4}") str_extract(shopping_list, "\\\\b[a-z]{1,4}\\\\b") +str_extract(shopping_list, "([a-z]+) of ([a-z]+)") +str_extract(shopping_list, "([a-z]+) of ([a-z]+)", group = 1) +str_extract(shopping_list, "([a-z]+) of ([a-z]+)", group = 2) + # Extract all matches str_extract_all(shopping_list, "[a-z]+") str_extract_all(shopping_list, "\\\\b[a-z]+\\\\b") diff --git a/tests/testthat/test-extract.r b/tests/testthat/test-extract.r index fec3e0a4..e6386e89 100644 --- a/tests/testthat/test-extract.r +++ b/tests/testthat/test-extract.r @@ -36,3 +36,8 @@ test_that("str_extract extracts first match if found, NA otherwise", { expect_length(word_1_to_4, length(shopping_list)) expect_equal(word_1_to_4[1], NA_character_) }) + +test_that("can extract a group", { + expect_equal(str_extract("abc", "(.).(.)", group = 1), "a") + expect_equal(str_extract("abc", "(.).(.)", group = 2), "c") +})