fix StopAndCancel after ongoing Stop

This regression was introduced in #272; I noticed it because our demo app currently fails to cancel active jobs gracefully before exiting. This commit fixes the behavior by always cancelling the work context upon `StopAndCancel()`, regardless of whether another shutdown is already in progress.
riverqueue · May 30, 2024 · 23a3960 · 23a3960
1 parent 9fab070
commit 23a3960
Show file tree

Hide file tree

Showing 3 changed files with 73 additions and 27 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Fixed
+
+- Fix `StopAndCancel` to not hang if called in parallel to an ongoing `Stop` call. [PR #376](https://github.com/riverqueue/river/pull/376).
+
 ## [0.6.1] - 2024-05-21
 
 ### Fixed

diff --git a/client.go b/client.go
@@ -796,14 +796,14 @@ func (c *Client[TTx]) Stop(ctx context.Context) error {
 // no need to call this method if the context passed to Run is cancelled
 // instead.
 func (c *Client[TTx]) StopAndCancel(ctx context.Context) error {
+	c.baseService.Logger.InfoContext(ctx, c.baseService.Name+": Hard stop started; cancelling all work")
+	c.workCancel(rivercommon.ErrShutdown)
+
 	shouldStop, stopped, finalizeStop := c.baseStartStop.StopInit()
 	if !shouldStop {
 		return nil
 	}
 
-	c.baseService.Logger.InfoContext(ctx, c.baseService.Name+": Hard stop started; cancelling all work")
-	c.workCancel(rivercommon.ErrShutdown)
-
 	select {
 	case <-ctx.Done(): // stop context cancelled
 		finalizeStop(false) // not stopped; allow Stop to be called again

diff --git a/client_test.go b/client_test.go
@@ -644,39 +644,81 @@ func Test_Client(t *testing.T) {
 	t.Run("StopAndCancel", func(t *testing.T) {
 		t.Parallel()
 
-		client, _ := setup(t)
-		jobStartedChan := make(chan int64)
-		jobDoneChan := make(chan struct{})
-
-		type JobArgs struct {
-			JobArgsReflectKind[JobArgs]
+		type testBundle struct {
+			jobDoneChan    chan struct{}
+			jobStartedChan chan int64
 		}
 
-		AddWorker(client.config.Workers, WorkFunc(func(ctx context.Context, job *Job[JobArgs]) error {
-			jobStartedChan <- job.ID
-			<-ctx.Done()
-			require.ErrorIs(t, context.Cause(ctx), rivercommon.ErrShutdown)
-			close(jobDoneChan)
-			return nil
-		}))
+		setupStopAndCancel := func(t *testing.T) (*Client[pgx.Tx], *testBundle) {
+			client, _ := setup(t)
+			jobStartedChan := make(chan int64)
+			jobDoneChan := make(chan struct{})
 
-		startClient(ctx, t, client)
+			type JobArgs struct {
+				JobArgsReflectKind[JobArgs]
+			}
 
-		insertRes, err := client.Insert(ctx, &JobArgs{}, nil)
-		require.NoError(t, err)
+			AddWorker(client.config.Workers, WorkFunc(func(ctx context.Context, job *Job[JobArgs]) error {
+				jobStartedChan <- job.ID
+				<-ctx.Done()
+				require.ErrorIs(t, context.Cause(ctx), rivercommon.ErrShutdown)
+				close(jobDoneChan)
+				return nil
+			}))
 
-		startedJobID := riverinternaltest.WaitOrTimeout(t, jobStartedChan)
-		require.Equal(t, insertRes.Job.ID, startedJobID)
+			startClient(ctx, t, client)
 
-		select {
-		case <-client.Stopped():
-			t.Fatal("expected client to not be stopped yet")
-		default:
+			insertRes, err := client.Insert(ctx, &JobArgs{}, nil)
+			require.NoError(t, err)
+
+			startedJobID := riverinternaltest.WaitOrTimeout(t, jobStartedChan)
+			require.Equal(t, insertRes.Job.ID, startedJobID)
+
+			select {
+			case <-client.Stopped():
+				t.Fatal("expected client to not be stopped yet")
+			default:
+			}
+
+			return client, &testBundle{
+				jobDoneChan:    jobDoneChan,
+				jobStartedChan: jobStartedChan,
+			}
 		}
 
-		require.NoError(t, client.StopAndCancel(ctx))
+		t.Run("OnItsOwn", func(t *testing.T) {
+			t.Parallel()
+
+			client, _ := setupStopAndCancel(t)
+
+			require.NoError(t, client.StopAndCancel(ctx))
+			riverinternaltest.WaitOrTimeout(t, client.Stopped())
+		})
+
+		t.Run("AfterStop", func(t *testing.T) {
+			t.Parallel()
+
+			client, bundle := setupStopAndCancel(t)
 
-		riverinternaltest.WaitOrTimeout(t, client.Stopped())
+			go func() {
+				require.NoError(t, client.Stop(ctx))
+			}()
+
+			select {
+			case <-client.Stopped():
+				t.Fatal("expected client to not be stopped yet")
+			case <-time.After(500 * time.Millisecond):
+			}
+
+			require.NoError(t, client.StopAndCancel(ctx))
+			riverinternaltest.WaitOrTimeout(t, client.Stopped())
+
+			select {
+			case <-bundle.jobDoneChan:
+			default:
+				t.Fatal("expected job to be have exited")
+			}
+		})
 	})
 }