Skip to content

Cut-over should wait for heartbeat lag to be low enough to succeed #921

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
May 7, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/hooks.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ The following variables are available on all hooks:
- `GH_OST_ESTIMATED_ROWS` - estimated total rows in table
- `GH_OST_COPIED_ROWS` - number of rows copied by `gh-ost`
- `GH_OST_INSPECTED_LAG` - lag in seconds (floating point) of inspected server
- `GH_OST_HEARTBEAT_LAG` - lag in seconds (floating point) of heartbeat
- `GH_OST_PROGRESS` - progress pct ([0..100], floating point) of migration
- `GH_OST_MIGRATED_HOST`
- `GH_OST_INSPECTED_HOST`
Expand Down
21 changes: 21 additions & 0 deletions go/base/context.go
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,8 @@ type MigrationContext struct {
RenameTablesEndTime time.Time
pointOfInterestTime time.Time
pointOfInterestTimeMutex *sync.Mutex
lastHeartbeatOnChangelogTime time.Time
lastHeartbeatOnChangelogMutex *sync.Mutex
CurrentLag int64
currentProgress uint64
ThrottleHTTPStatusCode int64
Expand Down Expand Up @@ -271,6 +273,7 @@ func NewMigrationContext() *MigrationContext {
throttleControlReplicaKeys: mysql.NewInstanceKeyMap(),
configMutex: &sync.Mutex{},
pointOfInterestTimeMutex: &sync.Mutex{},
lastHeartbeatOnChangelogMutex: &sync.Mutex{},
ColumnRenameMap: make(map[string]string),
PanicAbort: make(chan error),
Log: NewDefaultLogger(),
Expand Down Expand Up @@ -454,6 +457,10 @@ func (this *MigrationContext) MarkRowCopyEndTime() {
this.RowCopyEndTime = time.Now()
}

func (this *MigrationContext) TimeSinceLastHeartbeatOnChangelog() time.Duration {
return time.Since(this.GetLastHeartbeatOnChangelogTime())
}

func (this *MigrationContext) GetCurrentLagDuration() time.Duration {
return time.Duration(atomic.LoadInt64(&this.CurrentLag))
}
Expand Down Expand Up @@ -493,6 +500,20 @@ func (this *MigrationContext) TimeSincePointOfInterest() time.Duration {
return time.Since(this.pointOfInterestTime)
}

func (this *MigrationContext) SetLastHeartbeatOnChangelogTime(t time.Time) {
this.lastHeartbeatOnChangelogMutex.Lock()
defer this.lastHeartbeatOnChangelogMutex.Unlock()

this.lastHeartbeatOnChangelogTime = t
}

func (this *MigrationContext) GetLastHeartbeatOnChangelogTime() time.Time {
this.lastHeartbeatOnChangelogMutex.Lock()
defer this.lastHeartbeatOnChangelogMutex.Unlock()

return this.lastHeartbeatOnChangelogTime
}

func (this *MigrationContext) SetHeartbeatIntervalMilliseconds(heartbeatIntervalMilliseconds int64) {
if heartbeatIntervalMilliseconds < 100 {
heartbeatIntervalMilliseconds = 100
Expand Down
1 change: 1 addition & 0 deletions go/logic/hooks.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ func (this *HooksExecutor) applyEnvironmentVariables(extraVariables ...string) [
env = append(env, fmt.Sprintf("GH_OST_INSPECTED_HOST=%s", this.migrationContext.GetInspectorHostname()))
env = append(env, fmt.Sprintf("GH_OST_EXECUTING_HOST=%s", this.migrationContext.Hostname))
env = append(env, fmt.Sprintf("GH_OST_INSPECTED_LAG=%f", this.migrationContext.GetCurrentLagDuration().Seconds()))
env = append(env, fmt.Sprintf("GH_OST_HEARTBEAT_LAG=%f", this.migrationContext.TimeSinceLastHeartbeatOnChangelog().Seconds()))
env = append(env, fmt.Sprintf("GH_OST_PROGRESS=%f", this.migrationContext.GetProgressPct()))
env = append(env, fmt.Sprintf("GH_OST_HOOKS_HINT=%s", this.migrationContext.HooksHintMessage))
env = append(env, fmt.Sprintf("GH_OST_HOOKS_HINT_OWNER=%s", this.migrationContext.HooksHintOwner))
Expand Down
38 changes: 33 additions & 5 deletions go/logic/migrator.go
Original file line number Diff line number Diff line change
Expand Up @@ -207,12 +207,20 @@ func (this *Migrator) canStopStreaming() bool {
return atomic.LoadInt64(&this.migrationContext.CutOverCompleteFlag) != 0
}

// onChangelogStateEvent is called when a binlog event operation on the changelog table is intercepted.
func (this *Migrator) onChangelogStateEvent(dmlEvent *binlog.BinlogDMLEvent) (err error) {
// onChangelogEvent is called when a binlog event operation on the changelog table is intercepted.
func (this *Migrator) onChangelogEvent(dmlEvent *binlog.BinlogDMLEvent) (err error) {
// Hey, I created the changelog table, I know the type of columns it has!
if hint := dmlEvent.NewColumnValues.StringColumn(2); hint != "state" {
switch hint := dmlEvent.NewColumnValues.StringColumn(2); hint {
case "state":
return this.onChangelogStateEvent(dmlEvent)
case "heartbeat":
return this.onChangelogHeartbeatEvent(dmlEvent)
default:
return nil
}
}

func (this *Migrator) onChangelogStateEvent(dmlEvent *binlog.BinlogDMLEvent) (err error) {
changelogStateString := dmlEvent.NewColumnValues.StringColumn(3)
changelogState := ReadChangelogState(changelogStateString)
this.migrationContext.Log.Infof("Intercepted changelog state %s", changelogState)
Expand Down Expand Up @@ -245,6 +253,18 @@ func (this *Migrator) onChangelogStateEvent(dmlEvent *binlog.BinlogDMLEvent) (er
return nil
}

func (this *Migrator) onChangelogHeartbeatEvent(dmlEvent *binlog.BinlogDMLEvent) (err error) {
changelogHeartbeatString := dmlEvent.NewColumnValues.StringColumn(3)

heartbeatTime, err := time.Parse(time.RFC3339Nano, changelogHeartbeatString)
if err != nil {
return this.migrationContext.Log.Errore(err)
} else {
this.migrationContext.SetLastHeartbeatOnChangelogTime(heartbeatTime)
return nil
}
}

// listenOnPanicAbort aborts on abort request
func (this *Migrator) listenOnPanicAbort() {
err := <-this.migrationContext.PanicAbort
Expand Down Expand Up @@ -476,6 +496,13 @@ func (this *Migrator) cutOver() (err error) {
this.migrationContext.Log.Debugf("checking for cut-over postpone")
this.sleepWhileTrue(
func() (bool, error) {
heartbeatLag := this.migrationContext.TimeSinceLastHeartbeatOnChangelog()
maxLagMillisecondsThrottle := time.Duration(atomic.LoadInt64(&this.migrationContext.MaxLagMillisecondsThrottleThreshold)) * time.Millisecond
cutOverLockTimeout := time.Duration(this.migrationContext.CutOverLockTimeoutSeconds) * time.Second
if heartbeatLag > maxLagMillisecondsThrottle || heartbeatLag > cutOverLockTimeout {
this.migrationContext.Log.Debugf("current HeartbeatLag (%.2fs) is too high, it needs to be less than both --max-lag-millis (%.2fs) and --cut-over-lock-timeout-seconds (%.2fs) to continue", heartbeatLag.Seconds(), maxLagMillisecondsThrottle.Seconds(), cutOverLockTimeout.Seconds())
return true, nil
}
if this.migrationContext.PostponeCutOverFlagFile == "" {
return false, nil
}
Expand Down Expand Up @@ -962,13 +989,14 @@ func (this *Migrator) printStatus(rule PrintStatusRule, writers ...io.Writer) {

currentBinlogCoordinates := *this.eventsStreamer.GetCurrentBinlogCoordinates()

status := fmt.Sprintf("Copy: %d/%d %.1f%%; Applied: %d; Backlog: %d/%d; Time: %+v(total), %+v(copy); streamer: %+v; Lag: %.2fs, State: %s; ETA: %s",
status := fmt.Sprintf("Copy: %d/%d %.1f%%; Applied: %d; Backlog: %d/%d; Time: %+v(total), %+v(copy); streamer: %+v; Lag: %.2fs, HeartbeatLag: %.2fs, State: %s; ETA: %s",
totalRowsCopied, rowsEstimate, progressPct,
atomic.LoadInt64(&this.migrationContext.TotalDMLEventsApplied),
len(this.applyEventsQueue), cap(this.applyEventsQueue),
base.PrettifyDurationOutput(elapsedTime), base.PrettifyDurationOutput(this.migrationContext.ElapsedRowCopyTime()),
currentBinlogCoordinates,
this.migrationContext.GetCurrentLagDuration().Seconds(),
this.migrationContext.TimeSinceLastHeartbeatOnChangelog().Seconds(),
state,
eta,
)
Expand All @@ -995,7 +1023,7 @@ func (this *Migrator) initiateStreaming() error {
this.migrationContext.DatabaseName,
this.migrationContext.GetChangelogTableName(),
func(dmlEvent *binlog.BinlogDMLEvent) error {
return this.onChangelogStateEvent(dmlEvent)
return this.onChangelogEvent(dmlEvent)
},
)

Expand Down