Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve retry mechanism for transactions when WriteConflict error #27

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 16 additions & 15 deletions sstxn/sstxn.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
package sstxn

import (
"context"
"errors"
"fmt"

Expand Down Expand Up @@ -54,6 +55,8 @@ func (nilLogger) Criticalf(message string, args ...interface{}) {}

var _ Logger = nilLogger{}

var ErrTimeout = fmt.Errorf("transaction failed after retrying for 120 seconds")

// A Runner applies operations as part of a transaction onto any number
// of collections within a database. See the Run method for details.
type Runner struct {
Expand Down Expand Up @@ -105,7 +108,7 @@ func NewRunner(db *mgo.Database, logger Logger) *Runner {
//
// Any number of transactions may be run concurrently, with one
// runner or many.
func (r *Runner) Run(ops []txn.Op, id bson.ObjectId, info interface{}) (err error) {
func (r *Runner) Run(ctx context.Context, ops []txn.Op, id bson.ObjectId, info interface{}) (err error) {
const efmt = "error in transaction op %d: %s"
for i := range ops {
op := &ops[i]
Expand Down Expand Up @@ -133,21 +136,19 @@ func (r *Runner) Run(ops []txn.Op, id bson.ObjectId, info interface{}) (err erro
id = bson.NewObjectId()
}

// Sometimes the mongo server will return an error code 112 (write conflict).
// This is a signal the transaction needs to be retried.
// We'll retry 3 times but not forever.
for i := 0; i < 3; i++ {
err = r.runTxn(ops, id)
if err == errWriteConflict {
r.logger.Tracef("attempt %d retrying txn ops", i)
continue
for {
err := r.runTxn(ops, id)
if err != errWriteConflict {
return err
}
break
}
if err == errWriteConflict {
err = txn.ErrAborted
select {
case <-ctx.Done():
r.logger.Debugf("transaction cancelled by caller or timeout reached, ops '%+v'", ops)
return ctx.Err()
default:
}
r.logger.Tracef("retrying txn ops '%+v'", ops)
}
return err
}

func (r *Runner) runTxn(ops []txn.Op, id bson.ObjectId) error {
Expand Down Expand Up @@ -474,7 +475,7 @@ func (r *Runner) updateLog(ops []txn.Op, revnos []int64, txnId bson.ObjectId) er
//
// Saved documents are in the format:
//
// {"_id": <txn id>, <collection>: {"d": [<doc id>, ...], "r": [<doc revno>, ...]}}
// {"_id": <txn id>, <collection>: {"d": [<doc id>, ...], "r": [<doc revno>, ...]}}
//
// The document revision is the value of the txn-revno field after
// the change has been applied. Negative values indicate the document
Expand Down
Loading