-
Notifications
You must be signed in to change notification settings - Fork 2.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
SwitchTraffic: use separate context while canceling a migration #17340
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1135,30 +1135,46 @@ func (ts *trafficSwitcher) switchDeniedTables(ctx context.Context) error { | |
return nil | ||
} | ||
|
||
// cancelMigration attempts to revert all changes made during the migration so that we can get back to the | ||
// state when traffic switching (or reversing) was initiated. | ||
func (ts *trafficSwitcher) cancelMigration(ctx context.Context, sm *StreamMigrator) { | ||
var err error | ||
|
||
if ctx.Err() != nil { | ||
// We are creating a new context for cancelMigration, but we still record any error, | ||
// for any forensics in case of failures, to help determine whether the migration is being | ||
// cancelled due to a client timeout or some other reason. | ||
ts.Logger().Infof("In Cancel migration: original context invalid: %s", ctx.Err()) | ||
} | ||
|
||
// We create a new context while canceling the migration, so that we are independent of the original | ||
// context being cancelled prior to or during the cancel operation. | ||
cmTimeout := 60 * time.Second | ||
cmCtx, cmCancel := context.WithTimeout(context.Background(), cmTimeout) | ||
defer cmCancel() | ||
|
||
if ts.MigrationType() == binlogdatapb.MigrationType_TABLES { | ||
err = ts.switchDeniedTables(ctx) | ||
err = ts.switchDeniedTables(cmCtx) | ||
} else { | ||
err = ts.changeShardsAccess(ctx, ts.SourceKeyspaceName(), ts.SourceShards(), allowWrites) | ||
err = ts.changeShardsAccess(cmCtx, ts.SourceKeyspaceName(), ts.SourceShards(), allowWrites) | ||
} | ||
if err != nil { | ||
ts.Logger().Errorf("Cancel migration failed: %v", err) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. IMO we should take this opportunity to improve the log message:
I also think that we should accumulate these and return them to the caller. But we could defer that change. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Updated the log message. Let's make the changes to pass them to the caller separately since we are not returning the error at the moment. |
||
} | ||
|
||
sm.CancelStreamMigrations(ctx) | ||
sm.CancelStreamMigrations(cmCtx) | ||
|
||
err = ts.ForAllTargets(func(target *MigrationTarget) error { | ||
query := fmt.Sprintf("update _vt.vreplication set state='Running', message='' where db_name=%s and workflow=%s", | ||
encodeString(target.GetPrimary().DbName()), encodeString(ts.WorkflowName())) | ||
_, err := ts.TabletManagerClient().VReplicationExec(ctx, target.GetPrimary().Tablet, query) | ||
_, err := ts.TabletManagerClient().VReplicationExec(cmCtx, target.GetPrimary().Tablet, query) | ||
return err | ||
}) | ||
if err != nil { | ||
ts.Logger().Errorf("Cancel migration failed: could not restart vreplication: %v", err) | ||
} | ||
|
||
err = ts.deleteReverseVReplication(ctx) | ||
err = ts.deleteReverseVReplication(cmCtx) | ||
if err != nil { | ||
ts.Logger().Errorf("Cancel migration failed: could not delete reverse vreplication streams: %v", err) | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It probably does not matter logically, but I'm often wondering whether semantically
context.WithoutCancel
would be more accurate in this situation? It probably makes no difference as we're not using values stored on the context.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is possible that we get here when the
ctx
is already cancelled. In the specific case which led us to this issue, the client had a short timeout and the original context's deadline had already expired.