Skip to content

Commit 56dcbf9

Browse files
committed
Merge branch 'master' into truehash
2 parents 15753e2 + 4d0b4a5 commit 56dcbf9

File tree

255 files changed

+35285
-9986
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

255 files changed

+35285
-9986
lines changed

.Rbuildignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
^\.github$
2020
^\.vscode$
2121
^\.zed$
22+
^\.lintr$
2223

2324
^\.gitlab-ci\.yml$
2425

.ci/.lintr.R

Lines changed: 45 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,21 @@ linters = c(dt_linters, all_linters(
99
packages = "lintr", # TODO(lintr->3.2.0): Remove this.
1010
# eq_assignment_linter(),
1111
brace_linter(allow_single_line = TRUE),
12+
implicit_integer_linter(allow_colon = TRUE),
1213
# TODO(michaelchirico): Activate these incrementally. These are the
1314
# parameterizations that match our style guide.
1415
# implicit_assignment_linter(allow_lazy = TRUE, allow_scoped = TRUE),
15-
# implicit_integer_linter(allow_colon = TRUE),
1616
# system_time_linter = undesirable_function_linter(c(
1717
# system.time = "Only run timings in benchmark.Rraw"
1818
# )),
19+
undesirable_function_linter(c(
20+
cat = "Use catf to enable translations",
21+
message = "Use messagef to avoid fragmented translations.",
22+
warning = "Use warningf to avoid fragmented translations.",
23+
stop = "Use stopf to avoid fragmented translations.",
24+
rev = "Use frev internally, or setfrev if by-reference is safe.",
25+
NULL
26+
)),
1927
# undesirable_function_linter(modify_defaults(
2028
# default_undesirable_functions,
2129
# ifelse = "Use fifelse instead.",
@@ -26,7 +34,8 @@ linters = c(dt_linters, all_linters(
2634
# setwd = NULL
2735
# )),
2836
undesirable_operator_linter(),
29-
# TODO(lintr#2441): Use upstream implementation.
37+
# TODO(lintr#2765): Use upstream implementation.
38+
# assignment_linter(operator = "="),
3039
assignment_linter = NULL,
3140
absolute_path_linter = NULL, # too many false positives
3241
# TODO(lintr#2442): Use this once x[ , j, by] is supported.
@@ -70,43 +79,39 @@ linters = c(dt_linters, all_linters(
7079
))
7180
rm(dt_linters)
7281

73-
# TODO(lintr#2172): Glob with lintr itself.
74-
exclusions = c(local({
75-
exclusion_for_dir <- function(dir, exclusions) {
76-
files = file.path("..", list.files(dir, pattern = "\\.(R|Rmd|Rraw)$", full.names=TRUE))
77-
stats::setNames(rep(list(exclusions), length(files)), files)
78-
}
79-
c(
80-
exclusion_for_dir("tests", list(
81-
quotes_linter = Inf,
82-
# TODO(michaelchirico): Enforce these and re-activate them one-by-one.
83-
implicit_integer_linter = Inf,
84-
infix_spaces_linter = Inf,
85-
undesirable_function_linter = Inf
86-
)),
87-
exclusion_for_dir(c("vignettes", "vignettes/fr"), list(
88-
quotes_linter = Inf,
89-
sample_int_linter = Inf
90-
# strings_as_factors_linter = Inf
91-
# system_time_linter = Inf
92-
)),
93-
exclusion_for_dir("inst/tests", list(
94-
library_call_linter = Inf,
95-
numeric_leading_zero_linter = Inf,
96-
undesirable_operator_linter = Inf, # For ':::', possibly we could be more careful to only exclude ':::'.
97-
# TODO(michaelchirico): Enforce these and re-activate them one-by-one.
98-
comparison_negation_linter = Inf,
99-
condition_call_linter = Inf,
100-
duplicate_argument_linter = Inf,
101-
equals_na_linter = Inf,
102-
missing_argument_linter = Inf,
103-
paste_linter = Inf,
104-
rep_len_linter = Inf,
105-
sample_int_linter = Inf,
106-
seq_linter = Inf,
107-
unnecessary_lambda_linter = Inf
108-
))
82+
exclusions = list(
83+
`../tests` = list(
84+
quotes_linter = Inf,
85+
# TODO(michaelchirico): Enforce these and re-activate them one-by-one.
86+
implicit_integer_linter = Inf,
87+
infix_spaces_linter = Inf,
88+
undesirable_function_linter = Inf
89+
),
90+
`../vignettes*` = list(
91+
# assignment_linter = Inf,
92+
implicit_integer_linter = Inf,
93+
quotes_linter = Inf,
94+
sample_int_linter = Inf
95+
# strings_as_factors_linter = Inf
96+
# system_time_linter = Inf
97+
),
98+
`../inst/tests` = list(
99+
library_call_linter = Inf,
100+
numeric_leading_zero_linter = Inf,
101+
undesirable_operator_linter = Inf, # For ':::', possibly we could be more careful to only exclude ':::'.
102+
# TODO(michaelchirico): Enforce these and re-activate them one-by-one.
103+
comparison_negation_linter = Inf,
104+
condition_call_linter = Inf,
105+
duplicate_argument_linter = Inf,
106+
equals_na_linter = Inf,
107+
missing_argument_linter = Inf,
108+
paste_linter = Inf,
109+
rep_len_linter = Inf,
110+
sample_int_linter = Inf,
111+
seq_linter = Inf,
112+
unnecessary_lambda_linter = Inf
113+
),
114+
`../inst/tests/froll.Rraw` = list(
115+
dt_test_literal_linter = Inf # TODO(michaelchirico): Fix these once #5898, #5692, #5682, #5576, #5575, #5441 are merged.
109116
)
110-
}),
111-
list(`../inst/tests/froll.Rraw` = list(dt_test_literal_linter = Inf)) # TODO(michaelchirico): Fix these once #5898, #5692, #5682, #5576, #5575, #5441 are merged.
112117
)

.ci/README.md

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# data.table continuous integration and deployment
22

3-
On each Pull Request opened in GitHub we run GitHub Actions test jobs to provide prompt feedback about the status of PR. Our more thorough main CI pipeline runs nightly on GitLab CI. GitLab repository automatically mirrors our GitHub repository and runs pipeline on `master` branch every night. It tests more environments and different configurations. It publishes a variety of artifacts such as our [homepage](https://rdatatable.gitlab.io/data.table/) and [CRAN-like website for dev version](https://rdatatable.gitlab.io/data.table/web/packages/data.table/index.html), including windows binaries for the dev version.
3+
On each Pull Request opened in GitHub we run GitHub Actions test jobs to provide prompt feedback about the status of PR. Our more thorough main CI pipeline runs nightly on GitLab CI. In addition to branches pushed directly, the GitLab repository automatically mirrors our GitHub repository and runs pipeline on the `master` branch every night. It tests more environments and different configurations. It publishes a variety of artifacts such as our [homepage](https://rdatatable.gitlab.io/data.table/) and [CRAN-like website for dev version](https://rdatatable.gitlab.io/data.table/web/packages/data.table/index.html), including windows binaries for the dev version.
44

55
## Environments
66

@@ -12,11 +12,16 @@ Test jobs:
1212
- `test-lin-rel-cran` - `--as-cran` on Linux, strict test for final status of `R CMD check`.
1313
- `test-lin-dev-gcc-strict-cran` - `--as-cran` on Linux, `r-devel` built with `-enable-strict-barrier --disable-long-double`, test for compilation warnings, test for new NOTEs/WARNINGs from `R CMD check`.
1414
- `test-lin-dev-clang-cran` - same as `gcc-strict` job but R built with `clang` and no `--enable-strict-barrier --disable-long-double` flags.
15-
- `test-lin-310-cran` - R 3.1.0 on Linux, stated R dependency version.
15+
- `test-lin-ancient-cran` - Stated R dependency version (currently 3.4.0) on Linux.
16+
- `test-lin-dev-clang-san` - `r-devel` on Linux built with `clang -fsanitize=address,undefined` (including LeakSanitizer), test for sanitizer output in tests and examples.
17+
- `test-lin-dev-gcc-san` - `r-devel` on Linux built with `gcc -fsanitize=address,undefined` (including LeakSanitizer), test for sanitizer output in tests and examples.
1618
- `test-win-rel` - `r-release` on Windows.
1719
- `test-win-dev` - `r-devel` on Windows.
1820
- `test-win-old` - `r-oldrel` on Windows.
19-
- `test-mac-rel` - macOS build not yet available, see [#3326](https://github.com/Rdatatable/data.table/issues/3326) for status
21+
- `test-mac-rel` - `r-release` on macOS.
22+
- `test-mac-old` - `r-oldrel` on macOS.
23+
24+
The CI steps for the tests are [required](https://github.com/Rdatatable/data.table/blob/55eb0f160b169398d51f138131c14a66c86e5dc9/.ci/publish.R#L162-L168) to be named according to the pattern `test-(lin|win|mac)-<R version>[-<suffix>]*`, where `<R version>` is `rel`, `dev`, `old`, `ancient`, or three digits comprising an R version (e.g. `362` corresponding to R-3.6.2).
2025

2126
Tests jobs are allowed to fail, summary and logs of test jobs are later published at _CRAN-like checks_ page, see artifacts below.
2227

@@ -45,6 +50,23 @@ Base R implemented helper script, [originally proposed to base R](https://svn.r-
4550

4651
Base R implemented helper script to orchestrate generation of most artifacts and to arrange them nicely. It is being used only in [_integration_ stage in GitLab CI pipeline](./../.gitlab-ci.yml).
4752

53+
### [`lint.R`](./lint.R)
54+
55+
Base R runner for the manual (non-`lintr`) lint checks to be run from GitHub Actions during the code quality check. The command line arguments are as follows:
56+
1. Path to the directory containing files defining the linters. A linter is a function that accepts one argument (typically the path to the file) and signals an error if it fails the lint check.
57+
2. Path to the directory containing files to check.
58+
3. A regular expression matching the files to check.
59+
60+
One of the files in the linter directory may define the `.preprocess` function, which must accept one file path and return a value that other linter functions will understand. The function may also return `NULL` to indicate that the file must be skipped.
61+
62+
Example command lines:
63+
64+
```sh
65+
Rscript .ci/lint.R .ci/linters/c src '[.][ch]$'
66+
Rscript .ci/lint.R .ci/linters/po po '[.]po$'
67+
Rscript .ci/lint.R .ci/linters/md . '[.]R?md$'
68+
```
69+
4870
## GitLab Open Source Program
4971

5072
We are currently part of the [GitLab for Open Source Program](https://about.gitlab.com/solutions/open-source/). This gives us 50,000 compute minutes per month for our GitLab CI. Our license needs to be renewed yearly (around July) and is currently managed by @ben-schwen.

.ci/atime/tests.R

Lines changed: 56 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
pval.thresh <- 0.001 # to reduce false positives.
2+
13
# Test case adapted from https://github.com/Rdatatable/data.table/issues/6105#issue-2268691745 which is where the issue was reported.
24
# https://github.com/Rdatatable/data.table/pull/6107 fixed performance across 3 ways to specify a column as Date, and we test each individually.
35
extra.args.6107 <- c(
@@ -13,6 +15,7 @@ for (extra.arg in extra.args.6107){
1315
tmp_csv = tempfile()
1416
fwrite(DT, tmp_csv)
1517
},
18+
FasterIO = "60a01fa65191c44d7997de1843e9a1dfe5be9f72", # First commit of the PR (https://github.com/Rdatatable/data.table/pull/6925/commits) that reduced time usage
1619
Slow = "e9087ce9860bac77c51467b19e92cf4b72ca78c7", # Parent of the merge commit (https://github.com/Rdatatable/data.table/commit/a77e8c22e44e904835d7b34b047df2eff069d1f2) of the PR (https://github.com/Rdatatable/data.table/pull/6107) that fixes the issue
1720
Fast = "a77e8c22e44e904835d7b34b047df2eff069d1f2") # Merge commit of the PR (https://github.com/Rdatatable/data.table/pull/6107) that fixes the issue
1821
this.test$expr = str2lang(sprintf("data.table::fread(tmp_csv, %s)", extra.arg))
@@ -117,12 +120,29 @@ test.list <- atime::atime_test_list(
117120
file.path("src", "init.c"),
118121
paste0("R_init_", Package_regex),
119122
paste0("R_init_", gsub("[.]", "_", new.Package_)))
123+
# allow compilation on new R versions where 'Calloc' is not defined
124+
pkg_find_replace(
125+
file.path("src", "*.c"),
126+
"\\b(Calloc|Free|Realloc)\\b",
127+
"R_\\1")
120128
pkg_find_replace(
121129
"NAMESPACE",
122130
sprintf('useDynLib\\("?%s"?', Package_regex),
123131
paste0('useDynLib(', new.Package_))
124132
},
125133

134+
# Constant overhead improvement https://github.com/Rdatatable/data.table/pull/6925
135+
# Test case adapted from https://github.com/Rdatatable/data.table/pull/7022#discussion_r2107900643
136+
"fread disk overhead improved in #6925" = atime::atime_test(
137+
N = 2^seq(0, 20), # smaller N because we are doing multiple fread calls.
138+
setup = {
139+
fwrite(iris[1], iris.csv <- tempfile())
140+
},
141+
expr = replicate(N, data.table::fread(iris.csv)),
142+
Fast = "60a01fa65191c44d7997de1843e9a1dfe5be9f72", # First commit of the PR (https://github.com/Rdatatable/data.table/pull/6925/commits) that reduced time usage
143+
Slow = "e25ea80b793165094cea87d946d2bab5628f70a6" # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/60a01fa65191c44d7997de1843e9a1dfe5be9f72)
144+
),
145+
126146
# Performance regression discussed in https://github.com/Rdatatable/data.table/issues/4311
127147
# Test case adapted from https://github.com/Rdatatable/data.table/pull/4440#issuecomment-632842980 which is the fix PR.
128148
"shallow regression fixed in #4440" = atime::atime_test(
@@ -172,8 +192,9 @@ test.list <- atime::atime_test_list(
172192
# Fixed in https://github.com/Rdatatable/data.table/pull/4558
173193
"DT[by] fixed in #4558" = atime::atime_test(
174194
setup = {
195+
N9 <- as.integer(N * 0.9)
175196
d <- data.table(
176-
id = sample(c(seq.int(N * 0.9), sample(N * 0.9, N * 0.1, TRUE))),
197+
id = sample(c(seq.int(N9), sample(N9, N-N9, TRUE))),
177198
v1 = sample(5L, N, TRUE),
178199
v2 = sample(5L, N, TRUE)
179200
)
@@ -231,7 +252,39 @@ test.list <- atime::atime_test_list(
231252
},
232253
expr = data.table:::melt(DT, measure.vars = measure.vars),
233254
Slow = "fd24a3105953f7785ea7414678ed8e04524e6955", # Parent of the merge commit (https://github.com/Rdatatable/data.table/commit/ed72e398df76a0fcfd134a4ad92356690e4210ea) of the PR (https://github.com/Rdatatable/data.table/pull/5054) that fixes the issue
234-
Fast = "ed72e398df76a0fcfd134a4ad92356690e4210ea"), # Merge commit of the PR (https://github.com/Rdatatable/data.table/pull/5054) that fixes the issue
255+
Fast = "ed72e398df76a0fcfd134a4ad92356690e4210ea"), # Merge commit of the PR (https://github.com/Rdatatable/data.table/pull/5054) that fixes the issue # Test case created directly using the atime code below (not adapted from any other benchmark), based on the issue/fix PR https://github.com/Rdatatable/data.table/pull/5054#issue-930603663 "melt should be more efficient when there are missing input columns."
256+
257+
# Test case created from @tdhock's comment https://github.com/Rdatatable/data.table/pull/6393#issuecomment-2327396833, in turn adapted from @philippechataignon's comment https://github.com/Rdatatable/data.table/pull/6393#issuecomment-2326714012
258+
"fwrite refactored in #6393" = atime::atime_test(
259+
setup = {
260+
set.seed(1)
261+
NC = 10L
262+
L <- data.table(i=1:N)
263+
L[, paste0("V", 1:NC) := replicate(NC, rnorm(N), simplify=FALSE)]
264+
out.csv <- tempfile()
265+
},
266+
expr = data.table::fwrite(L, out.csv, compress="gzip"),
267+
Before = "f339aa64c426a9cd7cf2fcb13d91fc4ed353cd31", # Parent of the first commit https://github.com/Rdatatable/data.table/commit/fcc10d73a20837d0f1ad3278ee9168473afa5ff1 in the PR https://github.com/Rdatatable/data.table/pull/6393/commits with major change to fwrite with gzip.
268+
PR = "3630413ae493a5a61b06c50e80d166924d2ef89a"), # Close-to-last merge commit in the PR.
269+
270+
# Test case created directly using the atime code below (not adapted from any other benchmark), based on the PR, Removes unnecessary data.table call from as.data.table.array https://github.com/Rdatatable/data.table/pull/7010
271+
"as.data.table.array improved in #7010" = atime::atime_test(
272+
setup = {
273+
dims = c(N, 1, 1)
274+
arr = array(seq_len(prod(dims)), dim=dims)
275+
},
276+
expr = data.table:::as.data.table.array(arr, na.rm=FALSE),
277+
Slow = "73d79edf8ff8c55163e90631072192301056e336", # Parent of the first commit in the PR (https://github.com/Rdatatable/data.table/commit/8397dc3c993b61a07a81c786ca68c22bc589befc)
278+
Fast = "8397dc3c993b61a07a81c786ca68c22bc589befc"), # Commit in the PR (https://github.com/Rdatatable/data.table/pull/7019/commits) that removes inefficiency
279+
280+
"isoweek improved in #7144" = atime::atime_test(
281+
setup = {
282+
set.seed(349)
283+
x = sample(Sys.Date() - 0:5000, N, replace=TRUE)
284+
},
285+
expr = data.table::isoweek(x),
286+
Slow = "548410d23dd74b625e8ea9aeb1a5d2e9dddd2927", # Parent of the first commit in the PR (https://github.com/Rdatatable/data.table/commit/548410d23dd74b625e8ea9aeb1a5d2e9dddd2927)
287+
Fast = "c0b32a60466bed0e63420ec105bc75c34590865e"), # Commit in the PR (https://github.com/Rdatatable/data.table/pull/7144/commits) that uses a much faster implementation
235288

236-
tests=extra.test.list)
289+
tests=extra.test.list)
237290
# nolint end: undesirable_operator_linter.

.ci/ci.R

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -155,10 +155,11 @@ function(pkgs,
155155
db <- utils::available.packages(repos.url, type = type)
156156
allpkgs <- c(pkgs, unlist(tools::package_dependencies(unique(pkgs), db, which, recursive = TRUE), use.names = FALSE))
157157
except <- c("R", unlist(tools:::.get_standard_package_names()[except.priority], use.names = FALSE))
158-
## do not re-download existing packages, ignore version
158+
## do not re-download existing packages with the right version
159159
if (length(except.repodir) && file.exists(file.path(contrib.url(except.repodir, type = type, ver = binary.ver), "PACKAGES"))) {
160160
except.curl <- contrib.url(file.path("file:", normalizePath(except.repodir)), type = type, ver = binary.ver)
161-
except <- c(except, rownames(utils::available.packages(except.curl, type = type, fields = "Package")))
161+
except.db <- utils::available.packages(except.curl, type = type, fields = "Package")
162+
except <- c(except, merge(db, except.db, by = c("Package", "Version", "MD5sum"))[,"Package"])
162163
}
163164
newpkgs <- setdiff(allpkgs, except)
164165
if (!all(availpkgs<-newpkgs %in% rownames(db))) {
@@ -174,6 +175,13 @@ function(pkgs,
174175
"source" = "tar.gz",
175176
"mac.binary" = "tgz",
176177
"win.binary" = "zip")
178+
## clean up stale package files for which new versions will be downloaded
179+
if (file.exists(file.path(destdir, "PACKAGES"))) {
180+
repo.db <- utils::available.packages(file.path("file:", normalizePath(destdir)), type = type)
181+
oldver <- repo.db[repo.db[, "Package"] %in% newpkgs, c("Package", "Version"), drop=FALSE]
182+
oldfiles <- file.path(destdir, sprintf("%s_%s.%s", oldver[,"Package"], oldver[,"Version"], pkgsext))
183+
unlink(oldfiles[file.exists(oldfiles)])
184+
}
177185
pkgsver <- db[db[, "Package"] %in% newpkgs, c("Package", "Version"), drop=FALSE]
178186
dlfiles <- file.path(destdir, sprintf("%s_%s.%s", pkgsver[,"Package"], pkgsver[,"Version"], pkgsext))
179187
unlink(dlfiles[file.exists(dlfiles)])

.ci/lint.R

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
#!/usr/bin/Rscript
2+
# Runner for the manual lint checks in .ci/linters
3+
args = commandArgs(TRUE)
4+
if (identical(args, '--help')) {
5+
writeLines(c(
6+
'Usage: Rscript .ci/lint.R .ci/linters/<KIND> <WHERE> <WHAT> [PREPROCESS]',
7+
'KIND must name the directory containing the *.R files defining the linter functions.',
8+
'WHERE must name the directory containing the files to lint, e.g. "po", or "src".',
9+
"WHAT must contain the regular expression matching the files to lint, e.g., '[.]po$', or '[.][ch]$'.",
10+
))
11+
q('no')
12+
}
13+
stopifnot(`Invalid arguments, see .ci/lint.R --help` = length(args) == 3)
14+
15+
linter_env = list2env(list(.preprocess = identity))
16+
for (f in list.files(args[[1]], full.names=TRUE)) sys.source(f, linter_env)
17+
if (!length(ls(linter_env))) stop(
18+
"No linters found after sourcing files in ", dQuote(args[[1]])
19+
)
20+
21+
sources = list.files(args[[2]], pattern = args[[3]], full.names = TRUE, recursive = TRUE)
22+
if (!length(sources)) stop(
23+
"No files to lint found in directory ", dQuote(args[[2]]), " for mask ", dQuote(args[[3]])
24+
)
25+
sources = Filter(Negate(is.null), lapply(setNames(nm = sources), linter_env$.preprocess))
26+
27+
okay = TRUE
28+
for (src in names(sources))
29+
for (linter in ls(linter_env)) tryCatch(
30+
linter_env[[linter]](sources[[src]]),
31+
error = function(e) {
32+
message('Source file ', dQuote(src), ' failed lint check ', dQuote(linter), ': ', conditionMessage(e))
33+
okay <<- FALSE
34+
}
35+
)
36+
stopifnot(`Please fix the issues above.` = okay)

.ci/linters/c/00preprocess.R

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
.preprocess = function (f) list(
2+
c_obj = f, lines = readLines(f),
3+
preprocessed = system2(
4+
"gcc", shQuote(c("-fpreprocessed", "-E", f)),
5+
stdout = TRUE, stderr = FALSE
6+
)
7+
)

0 commit comments

Comments
 (0)