Skip to content

Commit ed1684c

Browse files
craig[bot], sumeerbhola, and spilchen
committed
155413: tests: use admission.io.overload in admission-control/elastic-io r=tbg a=sumeerbhola The previously used sub-level metric was flawed, in that the IO overload score could stay low even at higher sub-level counts if L0 had very few bytes (which is a deliberate choice in admission control). So admission control would not throttle elastic work as aggressively as the test expected it to. Running with this change, we don't exceed a score of 0.15, while the previously used sub-level count metric spikes higher. For example: ``` 2025/10/14 22:13:15 admission_control_elastic_io.go:105: admission_io_overload(store=1): 0.100000 2025/10/14 22:13:25 admission_control_elastic_io.go:105: admission_io_overload(store=1): 0.100000 I251014 22:13:27.665296 868 util/admission/io_load_listener.go:780 ⋮ [T1,Vsystem,n1,s1] 2918 IO overload: compaction score 0.150 (131 ssts, 9 sub-levels), L0 growth 551 MiB (write 551 MiB (ignored 0 B) ingest 0 B (ignored 0 B)): requests 15985 (0 bypassed) with 505 MiB acc-write (0 B bypassed) + 0 B acc-ingest (0 B bypassed) + 551 MiB adjusted-LSM-writes + 4.2 GiB adjusted-disk-writes + write-model 1.09x+1 B (smoothed 1.08x+1 B) + l0-ingest-model 0.00x+0 B (smoothed 0.75x+1 B) + ingest-model 0.00x+0 B (smoothed 1.00x+1 B) + write-amp-model 7.87x+1 B (smoothed 8.01x+1 B) + at-admission-tokens 126 B, compacted 550 MiB [≈545 MiB], flushed 799 MiB [≈838 MiB] (mult 0.77); admitting 649 MiB (rate 43 MiB/s) (elastic 519 MiB rate 35 MiB/s) due to memtable flush (multiplier 0.775) (used total: 543 MiB elastic 541 MiB); write stalls 0; diskBandwidthLimiter (unlimited) (tokenUtilization 0.00, tokensUsed (elastic 0 B, snapshot 0 B, regular 0 B) tokens (write 0 B (prev 0 B), read 0 B (prev 0 B)), writeBW 0 B/s, readBW 0 B/s, provisioned 0 B/s) 2025/10/14 22:13:35 admission_control_elastic_io.go:105: admission_io_overload(store=1): 0.050000 I251014 22:13:42.666326 868 util/admission/io_load_listener.go:780 ⋮ [T1,Vsystem,n1,s1] 2926 IO overload: compaction score 
0.050 (70 ssts, 5 sub-levels), L0 growth 498 MiB (write 498 MiB (ignored 0 B) ingest 0 B (ignored 0 B)): requests 15228 (0 bypassed) with 480 MiB acc-write (0 B bypassed) + 0 B acc-ingest (0 B bypassed) + 498 MiB adjusted-LSM-writes + 4.2 GiB adjusted-disk-writes + write-model 1.04x+1 B (smoothed 1.06x+1 B) + l0-ingest-model 0.00x+0 B (smoothed 0.75x+1 B) + ingest-model 0.00x+0 B (smoothed 1.00x+1 B) + write-amp-model 8.57x+1 B (smoothed 8.29x+1 B) + at-admission-tokens 153 B, compacted 498 MiB [≈522 MiB], flushed 883 MiB [≈860 MiB] (mult 0.77); admitting 667 MiB (rate 44 MiB/s) (elastic 533 MiB rate 36 MiB/s) due to memtable flush (multiplier 0.775) (used total: 519 MiB elastic 517 MiB); write stalls 0; diskBandwidthLimiter (unlimited) (tokenUtilization 0.00, tokensUsed (elastic 0 B, snapshot 0 B, regular 0 B) tokens (write 0 B (prev 0 B), read 0 B (prev 0 B)), writeBW 0 B/s, readBW 0 B/s, provisioned 0 B/s) ``` Fixes #148786 Fixes #156168 Fixes #156215 Epic: none Release note: None 155958: roachtestutil: use DETACHED option for INSPECT jobs r=spilchen a=spilchen Previously, CheckInspectDatabase used a statement timeout hack to background INSPECT jobs - it set a 5-second timeout and relied on the timeout error to leave jobs running. Now that INSPECT supports the DETACHED option, we use `INSPECT DATABASE <name> WITH OPTIONS DETACHED` to properly run jobs in the background. This provides a cleaner way to background the job. Informs #155676 Epic: CRDB-55075 Release note: none Co-authored-by: sumeerbhola <[email protected]> Co-authored-by: Matt Spilchen <[email protected]>
3 parents e7de41d + 577bb44 + e0f7fb4 commit ed1684c

File tree

2 files changed

+23
-51
lines changed

2 files changed

+23
-51
lines changed

pkg/cmd/roachtest/roachtestutil/validation_check.go

Lines changed: 4 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -206,17 +206,6 @@ func discoverUserDatabases(ctx context.Context, db *gosql.DB) ([]string, error)
206206
return databases, rows.Err()
207207
}
208208

209-
// isStatementTimeoutError returns true if the error is a statement timeout error.
210-
// Statement timeout errors are expected when launching INSPECT jobs since we only
211-
// want to start the job, not wait for it to complete.
212-
func isStatementTimeoutError(err error) bool {
213-
var pqErr *pq.Error
214-
if errors.As(err, &pqErr) {
215-
return pgcode.MakeCode(string(pqErr.Code)) == pgcode.QueryCanceled
216-
}
217-
return false
218-
}
219-
220209
// isFeatureNotSupportedError returns true if the error is a feature not supported error.
221210
// This can occur when the cluster version is not yet upgraded to support INSPECT.
222211
func isFeatureNotSupportedError(err error) bool {
@@ -229,8 +218,7 @@ func isFeatureNotSupportedError(err error) bool {
229218

230219
// launchInspectJobs launches INSPECT DATABASE commands in parallel for all
231220
// provided databases using task manager for concurrency control. Each INSPECT
232-
// command enables the inspect command for that connection. Statement timeout
233-
// errors are ignored as they indicate the job was successfully started.
221+
// command is launched with the DETACHED option to run in the background.
234222
// Feature not supported errors are returned to the caller, indicating the
235223
// cluster version does not support INSPECT.
236224
func launchInspectJobs(
@@ -242,7 +230,6 @@ func launchInspectJobs(
242230
return errors.Wrap(err, "failed to disable INSPECT admission control")
243231
}
244232

245-
statementTimeout := 5 * time.Second
246233
tm := task.NewManager(ctx, l)
247234
g := tm.NewErrorGroup()
248235

@@ -253,30 +240,13 @@ func launchInspectJobs(
253240

254241
statements := []string{
255242
"SET enable_inspect_command = true",
256-
fmt.Sprintf("SET statement_timeout = '%s'", statementTimeout.String()),
257-
fmt.Sprintf("INSPECT DATABASE %s", lexbase.EscapeSQLIdent(dbName)),
243+
fmt.Sprintf("INSPECT DATABASE %s WITH OPTIONS DETACHED", lexbase.EscapeSQLIdent(dbName)),
258244
}
259245

260-
var stmtErr error
261246
for _, stmt := range statements {
262247
if _, err := db.ExecContext(ctx, stmt); err != nil {
263-
stmtErr = err
264-
break
265-
}
266-
}
267-
268-
// Always reset statement timeout back to default.
269-
if _, err := db.ExecContext(ctx, "RESET statement_timeout"); err != nil {
270-
l.Printf("Warning: failed to reset statement timeout: %v", err)
271-
}
272-
273-
// Check for errors from the statements loop.
274-
if stmtErr != nil {
275-
// Statement timeout is expected - it means the job started but didn't complete
276-
// within the timeout. The job is still running in the background.
277-
if !isStatementTimeoutError(stmtErr) {
278-
l.Printf("INSPECT DATABASE %s failed to start: %v", dbName, stmtErr)
279-
return errors.Wrapf(stmtErr, "failed to start INSPECT DATABASE %s", dbName)
248+
l.Printf("INSPECT DATABASE %s failed to start: %v", dbName, err)
249+
return errors.Wrapf(err, "failed to start INSPECT DATABASE %s", dbName)
280250
}
281251
}
282252

pkg/cmd/roachtest/tests/admission_control_elastic_io.go

Lines changed: 19 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ func registerElasticIO(r registry.Registry) {
8585
return nil
8686
})
8787
m.Go(func(ctx context.Context) error {
88-
const subLevelMetric = "storage_l0_sublevels"
88+
const ioOverloadMetric = "admission_io_overload"
8989
getMetricVal := func(metricName string) (float64, error) {
9090
point, err := statCollector.CollectPoint(ctx, t.L(), timeutil.Now(), metricName)
9191
if err != nil {
@@ -110,16 +110,16 @@ func registerElasticIO(r registry.Registry) {
110110
}
111111
now := timeutil.Now()
112112
endTime := now.Add(duration)
113-
// We typically see fluctuations from 1 to 5 sub-levels because the
114-
// elastic IO token logic gives 1.25*compaction-bandwidth tokens at 1
115-
// sub-level and 0.75*compaction-bandwidth at 5 sub-levels, with 5
116-
// sub-levels being very rare. We leave some breathing room and pick a
117-
// threshold of greater than 7 to fail the test. If elastic tokens are
118-
// not working, the threshold of 7 will be easily breached, since
119-
// regular tokens allow sub-levels to exceed 10.
120-
const subLevelThreshold = 7
121-
const sampleCountForL0Sublevel = 12
122-
var l0SublevelCount []float64
113+
// We typically see fluctuations from 0.05 to 0.25 IO overload score
114+
// because the elastic IO token logic gives 1.25*compaction-bandwidth
115+
// tokens at 0.05 score and 0.75*compaction-bandwidth at 0.25 score,
116+
// with 0.25 score being very rare. We leave some breathing room and
117+
// pick a threshold of greater than 0.35 to fail the test. If elastic
118+
// tokens are not working, the threshold of 0.35 will be easily
119+
// breached, since regular tokens allow the score to exceed 0.5.
120+
const ioOverloadThreshold = 0.35
121+
const sampleCountForIOOverload = 12
122+
var ioOverloadScore []float64
123123
// Sleep initially for stability to be achieved, before measuring.
124124
time.Sleep(5 * time.Minute)
125125
for {
@@ -129,17 +129,19 @@ func registerElasticIO(r registry.Registry) {
129129
default:
130130
}
131131
time.Sleep(10 * time.Second)
132-
val, err := getMetricVal(subLevelMetric)
132+
val, err := getMetricVal(ioOverloadMetric)
133133
if err != nil {
134134
continue
135135
}
136-
l0SublevelCount = append(l0SublevelCount, val)
136+
ioOverloadScore = append(ioOverloadScore, val)
137137
// We want to use the mean of the last 2m of data to avoid short-lived
138138
// spikes causing failures.
139-
if len(l0SublevelCount) >= sampleCountForL0Sublevel {
140-
latestSampleMeanL0Sublevels := roachtestutil.GetMeanOverLastN(sampleCountForL0Sublevel, l0SublevelCount)
141-
if latestSampleMeanL0Sublevels > subLevelThreshold {
142-
t.Fatalf("sub-level mean %f over last %d iterations exceeded threshold", latestSampleMeanL0Sublevels, sampleCountForL0Sublevel)
139+
if len(ioOverloadScore) >= sampleCountForIOOverload {
140+
latestSampleMeanIOOverloadScore :=
141+
roachtestutil.GetMeanOverLastN(sampleCountForIOOverload, ioOverloadScore)
142+
if latestSampleMeanIOOverloadScore > ioOverloadThreshold {
143+
t.Fatalf("io-overload score mean %f over last %d iterations exceeded threshold",
144+
latestSampleMeanIOOverloadScore, sampleCountForIOOverload)
143145
}
144146
}
145147
if timeutil.Now().After(endTime) {

0 commit comments

Comments (0)