From aeb3b97ee0e759f364c926b40433f8c46274f209 Mon Sep 17 00:00:00 2001
From: Brandur <brandur@brandur.org>
Date: Wed, 3 Jul 2024 22:23:21 -0700
Subject: [PATCH] Add stack trace to error handler's `HandlePanicFunc`

This one in response to #418 in which although we persist a stack trace
to a job row's errors property, we don't reveal it in a panic handler,
which is quite inconvenient for purposes of logging or other telemetry
(e.g. sending to Sentry).

Here, `HandlePanic`'s signature changes to (`trace` is added):

    HandlePanic(ctx context.Context, job *rivertype.JobRow, panicVal any, trace string) *ErrorHandlerResult

A couple notes on choices:

* The naming of `trace` is reused from `AttemptError`.

* The value type is `string`. This is a little non-obvious because Go
  exposes it as a `[]byte`, something I've never quite understood as to
  why, but I did `string` because for one it's more convenient to use,
  but more importantly, it's the same type on `AttemptError`.

This is a breaking change, but it seems like being able to get a stack
trace during panic is important enough that it's worth it, and with any
luck there's few enough people using this feature that it won't break
that many people. The fix is quite easy regardless and will easily be
caught by the compiler.

Fixes #418.
---
 CHANGELOG.md                  | 14 ++++++++++++++
 client_test.go                |  2 +-
 error_handler.go              |  2 +-
 example_error_handler_test.go |  2 +-
 job_executor.go               |  8 ++++----
 job_executor_test.go          | 18 +++++++++++-------
 6 files changed, 32 insertions(+), 14 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index cf96ccb2..e9224429 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,20 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - `Config.TestOnly` has been added. It disables various features in the River client like staggered maintenance service start that are useful in production, but may be somewhat harmful in tests because they make start/stop slower. [PR #414](https://github.com/riverqueue/river/pull/414).
 
+### Changed
+
+⚠️ Version 0.9.0 has a small breaking change in `ErrorHandler`. As before, we try never to make breaking changes, but this one was deemed quite important because `ErrorHandler` was fundamentally lacking important functionality.
+
+- **Breaking change:** Add stack trace to `ErrorHandler.HandlePanicFunc`. Fixing code only requires adding a new `trace string` argument to `HandlePanicFunc`. [PR #423](https://github.com/riverqueue/river/pull/423).
+
+    ``` go
+    # before
+    HandlePanic(ctx context.Context, job *rivertype.JobRow, panicVal any) *ErrorHandlerResult
+
+    # after
+    HandlePanic(ctx context.Context, job *rivertype.JobRow, panicVal any, trace string) *ErrorHandlerResult
+    ```
+
 ### Fixed
 
 - Pausing or resuming a queue that was already paused or not paused respectively no longer returns `rivertype.ErrNotFound`. The same goes for pausing or resuming using the all queues string (`*`) when no queues are in the database (previously that also returned `rivertype.ErrNotFound`). [PR #408](https://github.com/riverqueue/river/pull/408).
diff --git a/client_test.go b/client_test.go
index e89a3890..5f2852c2 100644
--- a/client_test.go
+++ b/client_test.go
@@ -2300,7 +2300,7 @@ func Test_Client_ErrorHandler(t *testing.T) {
 
 		var panicHandlerCalled bool
 		config.ErrorHandler = &testErrorHandler{
-			HandlePanicFunc: func(ctx context.Context, job *rivertype.JobRow, panicVal any) *ErrorHandlerResult {
+			HandlePanicFunc: func(ctx context.Context, job *rivertype.JobRow, panicVal any, trace string) *ErrorHandlerResult {
 				require.Equal(t, "panic val", panicVal)
 				panicHandlerCalled = true
 				return &ErrorHandlerResult{}
diff --git a/error_handler.go b/error_handler.go
index 9b6b7050..a80f0233 100644
--- a/error_handler.go
+++ b/error_handler.go
@@ -20,7 +20,7 @@ type ErrorHandler interface {
 	//
 	// Context is descended from the one used to start the River client that
 	// worked the job.
-	HandlePanic(ctx context.Context, job *rivertype.JobRow, panicVal any) *ErrorHandlerResult
+	HandlePanic(ctx context.Context, job *rivertype.JobRow, panicVal any, trace string) *ErrorHandlerResult
 }
 
 type ErrorHandlerResult struct {
diff --git a/example_error_handler_test.go b/example_error_handler_test.go
index 90f8dcf3..d268c150 100644
--- a/example_error_handler_test.go
+++ b/example_error_handler_test.go
@@ -22,7 +22,7 @@ func (*CustomErrorHandler) HandleError(ctx context.Context, job *rivertype.JobRo
 	return nil
 }
 
-func (*CustomErrorHandler) HandlePanic(ctx context.Context, job *rivertype.JobRow, panicVal any) *river.ErrorHandlerResult {
+func (*CustomErrorHandler) HandlePanic(ctx context.Context, job *rivertype.JobRow, panicVal any, trace string) *river.ErrorHandlerResult {
 	fmt.Printf("Job panicked with: %v\n", panicVal)
 
 	// Either function can also set the job to be immediately cancelled.
diff --git a/job_executor.go b/job_executor.go
index 177caab9..400f6e46 100644
--- a/job_executor.go
+++ b/job_executor.go
@@ -100,7 +100,7 @@ var ErrJobCancelledRemotely = JobCancel(errors.New("job cancelled remotely"))
 type jobExecutorResult struct {
 	Err        error
 	NextRetry  time.Time
-	PanicTrace []byte
+	PanicTrace string
 	PanicVal   any
 }
 
@@ -175,7 +175,7 @@ func (e *jobExecutor) execute(ctx context.Context) (res *jobExecutorResult) {
 			)
 
 			res = &jobExecutorResult{
-				PanicTrace: debug.Stack(),
+				PanicTrace: string(debug.Stack()),
 				PanicVal:   recovery,
 			}
 		}
@@ -234,7 +234,7 @@ func (e *jobExecutor) invokeErrorHandler(ctx context.Context, res *jobExecutorRe
 
 	case res.PanicVal != nil:
 		errorHandlerRes = invokeAndHandlePanic("HandlePanic", func() *ErrorHandlerResult {
-			return e.ErrorHandler.HandlePanic(ctx, e.JobRow, res.PanicVal)
+			return e.ErrorHandler.HandlePanic(ctx, e.JobRow, res.PanicVal, res.PanicTrace)
 		})
 	}
 
@@ -316,7 +316,7 @@ func (e *jobExecutor) reportError(ctx context.Context, res *jobExecutorResult) {
 		At:      e.start,
 		Attempt: e.JobRow.Attempt,
 		Error:   res.ErrorStr(),
-		Trace:   string(res.PanicTrace),
+		Trace:   res.PanicTrace,
 	}
 
 	errData, err := json.Marshal(attemptErr)
diff --git a/job_executor_test.go b/job_executor_test.go
index e1ed00b3..101e7e9e 100644
--- a/job_executor_test.go
+++ b/job_executor_test.go
@@ -89,14 +89,16 @@ type testErrorHandler struct {
 	HandleErrorFunc   func(ctx context.Context, job *rivertype.JobRow, err error) *ErrorHandlerResult
 
 	HandlePanicCalled bool
-	HandlePanicFunc   func(ctx context.Context, job *rivertype.JobRow, panicVal any) *ErrorHandlerResult
+	HandlePanicFunc   func(ctx context.Context, job *rivertype.JobRow, panicVal any, trace string) *ErrorHandlerResult
 }
 
 // Test handler with no-ops for both error handling functions.
 func newTestErrorHandler() *testErrorHandler {
 	return &testErrorHandler{
 		HandleErrorFunc: func(ctx context.Context, job *rivertype.JobRow, err error) *ErrorHandlerResult { return nil },
-		HandlePanicFunc: func(ctx context.Context, job *rivertype.JobRow, panicVal any) *ErrorHandlerResult { return nil },
+		HandlePanicFunc: func(ctx context.Context, job *rivertype.JobRow, panicVal any, trace string) *ErrorHandlerResult {
+			return nil
+		},
 	}
 }
 
@@ -105,9 +107,9 @@ func (h *testErrorHandler) HandleError(ctx context.Context, job *rivertype.JobRo
 	return h.HandleErrorFunc(ctx, job, err)
 }
 
-func (h *testErrorHandler) HandlePanic(ctx context.Context, job *rivertype.JobRow, panicVal any) *ErrorHandlerResult {
+func (h *testErrorHandler) HandlePanic(ctx context.Context, job *rivertype.JobRow, panicVal any, trace string) *ErrorHandlerResult {
 	h.HandlePanicCalled = true
-	return h.HandlePanicFunc(ctx, job, panicVal)
+	return h.HandlePanicFunc(ctx, job, panicVal, trace)
 }
 
 func TestJobExecutor_Execute(t *testing.T) {
@@ -565,8 +567,10 @@ func TestJobExecutor_Execute(t *testing.T) {
 		executor, bundle := setup(t)
 
 		executor.WorkUnit = newWorkUnitFactoryWithCustomRetry(func() error { panic("panic val") }, nil).MakeUnit(bundle.jobRow)
-		bundle.errorHandler.HandlePanicFunc = func(ctx context.Context, job *rivertype.JobRow, panicVal any) *ErrorHandlerResult {
+		bundle.errorHandler.HandlePanicFunc = func(ctx context.Context, job *rivertype.JobRow, panicVal any, trace string) *ErrorHandlerResult {
 			require.Equal(t, "panic val", panicVal)
+			require.Contains(t, trace, "runtime/debug.Stack()\n")
+
 			return nil
 		}
 
@@ -586,7 +590,7 @@ func TestJobExecutor_Execute(t *testing.T) {
 		executor, bundle := setup(t)
 
 		executor.WorkUnit = newWorkUnitFactoryWithCustomRetry(func() error { panic("panic val") }, nil).MakeUnit(bundle.jobRow)
-		bundle.errorHandler.HandlePanicFunc = func(ctx context.Context, job *rivertype.JobRow, panicVal any) *ErrorHandlerResult {
+		bundle.errorHandler.HandlePanicFunc = func(ctx context.Context, job *rivertype.JobRow, panicVal any, trace string) *ErrorHandlerResult {
 			return &ErrorHandlerResult{SetCancelled: true}
 		}
 
@@ -606,7 +610,7 @@ func TestJobExecutor_Execute(t *testing.T) {
 		executor, bundle := setup(t)
 
 		executor.WorkUnit = newWorkUnitFactoryWithCustomRetry(func() error { panic("panic val") }, nil).MakeUnit(bundle.jobRow)
-		bundle.errorHandler.HandlePanicFunc = func(ctx context.Context, job *rivertype.JobRow, panicVal any) *ErrorHandlerResult {
+		bundle.errorHandler.HandlePanicFunc = func(ctx context.Context, job *rivertype.JobRow, panicVal any, trace string) *ErrorHandlerResult {
 			panic("panic handler panicked!")
 		}