Skip to content

Commit d4e4682

Browse files
committed
server: return draining progress if the server controller is draining
This commit fixes a bug where if the server controller is being drained, the drain command would return a response without populating the IsDraining. The CLI interprets this as if the draining is completed. server: return draining progress if the server controller is draining This commit fixes a bug where if the server controller is being drained, the drain command would return a response without populating the IsDraining. The CLI interprets this as if the draining is completed. Output example when running: ``` ./cockroach node drain 3 --insecure --url {pgurl:3} --logtostderr=INFO I251008 15:54:42.613200 15 2@rpc/peer.go:613 [rnode=?,raddr=10.142.1.228:26257,class=system,rpc] 1 connection is now healthy node is draining... remaining: 3 I251008 15:54:42.622586 1 cli/rpc_node_shutdown.go:184 [-] 2 drain details: tenant servers: 3 node is draining... remaining: 3 I251008 15:54:42.824526 1 cli/rpc_node_shutdown.go:184 [-] 3 drain details: tenant servers: 3 node is draining... remaining: 1 I251008 15:54:43.026405 1 cli/rpc_node_shutdown.go:184 [-] 4 drain details: tenant servers: 1 node is draining... remaining: 1 I251008 15:54:43.228596 1 cli/rpc_node_shutdown.go:184 [-] 5 drain details: tenant servers: 1 node is draining... remaining: 243 I251008 15:54:44.580413 1 cli/rpc_node_shutdown.go:184 [-] 6 drain details: liveness record: 1, range lease iterations: 175, descriptor leases: 67 node is draining... remaining: 0 (complete) drain ok ``` Release note: None Epic: None Release note: None Epic: None
1 parent cb76195 commit d4e4682

File tree

3 files changed

+51
-11
lines changed

3 files changed

+51
-11
lines changed

pkg/server/drain.go

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -389,10 +389,13 @@ func (s *drainServer) drainInner(
389389
return s.drainNode(ctx, reporter, verbose)
390390
}
391391

392-
// isDraining returns true if either SQL client connections are being drained
393-
// or if one of the stores on the node is not accepting replicas.
392+
// isDraining returns true if the SQL client connections are being drained,
393+
// if one of the stores on the node is not accepting replicas, or if the
394+
// server controller is being drained.
394395
func (s *drainServer) isDraining() bool {
395-
return s.sqlServer.pgServer.IsDraining() || (s.kvServer.node != nil && s.kvServer.node.IsDraining())
396+
return s.sqlServer.pgServer.IsDraining() ||
397+
(s.kvServer.node != nil && s.kvServer.node.IsDraining()) ||
398+
(s.serverCtl != nil && s.serverCtl.IsDraining())
396399
}
397400

398401
// drainClients starts draining the SQL layer.
@@ -410,7 +413,10 @@ func (s *drainServer) drainClientsInternal(
410413
var cancel context.CancelFunc
411414
ctx, cancel = context.WithCancel(ctx)
412415
defer cancel()
413-
shouldDelayDraining := !s.isDraining()
416+
417+
// If this is the first time we are draining the clients, we delay draining
418+
// to allow health probes to notice that the node is not ready.
419+
shouldDelayDraining := !s.sqlServer.pgServer.IsDraining()
414420

415421
// Set the gRPC mode of the node to "draining" and mark the node as "not ready".
416422
// Probes to /health?ready=1 will now notice the change in the node's readiness.

pkg/server/drain_test.go

Lines changed: 37 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ package server_test
77

88
import (
99
"context"
10+
gosql "database/sql"
1011
"io"
1112
"testing"
1213
"time"
@@ -34,15 +35,35 @@ import (
3435
func TestDrain(t *testing.T) {
3536
defer leaktest.AfterTest(t)()
3637
defer log.Scope(t).Close(t)
37-
doTestDrain(t)
38+
testutils.RunTrueAndFalse(t, "secondaryTenants", func(t *testing.T, secondaryTenants bool) {
39+
doTestDrain(t, secondaryTenants)
40+
})
3841
}
3942

4043
// doTestDrain runs the drain test.
41-
func doTestDrain(tt *testing.T) {
44+
func doTestDrain(tt *testing.T, secondaryTenants bool) {
4245
var drainSleepCallCount = 0
4346
t := newTestDrainContext(tt, &drainSleepCallCount)
4447
defer t.Close()
4548

49+
var tenantConn *gosql.DB
50+
if secondaryTenants {
51+
_, err := t.tc.ServerConn(0).Exec("CREATE TENANT hello")
52+
require.NoError(tt, err)
53+
_, err = t.tc.ServerConn(0).Exec("ALTER TENANT hello START SERVICE SHARED")
54+
require.NoError(tt, err)
55+
56+
// Wait for the tenant to be ready by establishing a test connection.
57+
testutils.SucceedsSoon(tt, func() error {
58+
tenantConn, err = t.tc.Server(0).SystemLayer().SQLConnE(
59+
serverutils.DBName("cluster:hello/defaultdb"))
60+
if err != nil {
61+
return err
62+
}
63+
return tenantConn.Ping()
64+
})
65+
}
66+
4667
// Issue a probe. We're not draining yet, so the probe should
4768
// reflect that.
4869
resp := t.sendProbe()
@@ -54,7 +75,12 @@ func doTestDrain(tt *testing.T) {
5475
resp = t.sendDrainNoShutdown()
5576
t.assertDraining(resp, true)
5677
t.assertRemaining(resp, true)
57-
t.assertEqual(1, drainSleepCallCount)
78+
if !secondaryTenants {
79+
// If we have secondary tenants, we might not have waited before draining
80+
// the clients at this point because we might have returned early if we are
81+
// still draining the serverController.
82+
t.assertEqual(1, drainSleepCallCount)
83+
}
5884

5985
// Issue another probe. This checks that the server is still running
6086
// (i.e. Shutdown: false was effective), the draining status is
@@ -64,12 +90,14 @@ func doTestDrain(tt *testing.T) {
6490
t.assertDraining(resp, true)
6591
// probe-only has no remaining.
6692
t.assertRemaining(resp, false)
67-
t.assertEqual(1, drainSleepCallCount)
68-
93+
if !secondaryTenants {
94+
t.assertEqual(1, drainSleepCallCount)
95+
}
6996
// Repeat drain commands until we verify that there are zero remaining leases
7097
// (i.e. complete). Also validate that the server did not sleep again.
71-
testutils.SucceedsSoon(t, func() error {
98+
testutils.SucceedsWithin(t, func() error {
7299
resp = t.sendDrainNoShutdown()
100+
t.Logf("drain response: %+v", resp)
73101
if !resp.IsDraining {
74102
return errors.Newf("expected draining")
75103
}
@@ -78,7 +106,8 @@ func doTestDrain(tt *testing.T) {
78106
resp.DrainRemainingDescription)
79107
}
80108
return nil
81-
})
109+
}, 10*time.Minute)
110+
82111
t.assertEqual(1, drainSleepCallCount)
83112

84113
// Now issue a drain request without drain but with shutdown.
@@ -190,6 +219,7 @@ func newTestDrainContext(t *testing.T, drainSleepCallCount *int) *testDrainConte
190219
// care about TLS settings for the RPC client connection.
191220
ServerArgs: base.TestServerArgs{
192221
DefaultDRPCOption: base.TestDRPCDisabled,
222+
DefaultTestTenant: base.TestControlsTenantsExplicitly,
193223
Knobs: base.TestingKnobs{
194224
Server: &server.TestingKnobs{
195225
DrainSleepFn: func(time.Duration) {

pkg/server/server_controller.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,10 @@ func (s *serverController) SetDraining() {
172172
}
173173
}
174174

175+
func (s *serverController) IsDraining() bool {
176+
return s.draining.Load()
177+
}
178+
175179
// start monitors changes to the service mode and updates
176180
// the running servers accordingly.
177181
func (c *serverController) start(ctx context.Context, ie isql.Executor) error {

0 commit comments

Comments
 (0)