Skip to content

Commit 33b2471

Browse files
committed
fix: randomise the initial grace period to avoid collisions
The previous algorithm used binary exponential backoff with a ±10% jitter to calculate the grace period. Because there can be multiple Lambda environments, we need to mitigate collisions: we cannot use 0 as the first delay, because functions that fail close to each other in time would all retry at the same moment and collide. The small jitter applied to the lower delays would then propagate the problem to subsequent attempts. This change adds an initial delay of n seconds to the first reconnection attempt, where n is drawn at random from a bounded interval (0 to 5 seconds) to reduce collisions while keeping usability and user experience in mind.
1 parent 33092f4 commit 33b2471

File tree

3 files changed

+16
-6
lines changed

3 files changed

+16
-6
lines changed

apm-lambda-extension/apmproxy/apmserver.go

+7
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,13 @@ func (c *Client) SetApmServerTransportState(ctx context.Context, status Status)
197197

198198
// ComputeGracePeriod https://github.com/elastic/apm/blob/main/specs/agents/transport.md#transport-errors
199199
func (c *Client) ComputeGracePeriod() time.Duration {
200+
// If reconnectionCount is 0, returns a random number in an interval.
201+
// The grace period for the first reconnection count was 0 but that
202+
// leads to collisions with multiple environments.
203+
if c.ReconnectionCount == 0 {
204+
gracePeriod := rand.Float64() * 5
205+
return time.Duration(gracePeriod * float64(time.Second))
206+
}
200207
gracePeriodWithoutJitter := math.Pow(math.Min(float64(c.ReconnectionCount), 6), 2)
201208
jitter := rand.Float64()/5 - 0.1
202209
return time.Duration((gracePeriodWithoutJitter + jitter*gracePeriodWithoutJitter) * float64(time.Second))

apm-lambda-extension/apmproxy/apmserver_test.go

+6-6
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ func TestGracePeriod(t *testing.T) {
129129

130130
apmClient.ReconnectionCount = 0
131131
val0 := apmClient.ComputeGracePeriod().Seconds()
132-
assert.Equal(t, val0, float64(0))
132+
assert.LessOrEqual(t, val0, 5.0)
133133

134134
apmClient.ReconnectionCount = 1
135135
val1 := apmClient.ComputeGracePeriod().Seconds()
@@ -192,7 +192,7 @@ func TestSetPendingTransport(t *testing.T) {
192192
apmClient.SetApmServerTransportState(context.Background(), apmproxy.Failing)
193193
require.Eventually(t, func() bool {
194194
return apmClient.Status != apmproxy.Failing
195-
}, 1*time.Second, 50*time.Millisecond)
195+
}, 5*time.Second, 50*time.Millisecond)
196196
assert.True(t, apmClient.Status == apmproxy.Pending)
197197
assert.Equal(t, apmClient.ReconnectionCount, 0)
198198
}
@@ -313,7 +313,7 @@ func TestEnterBackoffFromFailing(t *testing.T) {
313313
apmClient.SetApmServerTransportState(context.Background(), apmproxy.Failing)
314314
require.Eventually(t, func() bool {
315315
return apmClient.Status != apmproxy.Failing
316-
}, 1*time.Second, 50*time.Millisecond)
316+
}, 5*time.Second, 50*time.Millisecond)
317317
assert.Equal(t, apmClient.Status, apmproxy.Pending)
318318

319319
assert.Error(t, apmClient.PostToApmServer(context.Background(), agentData))
@@ -366,7 +366,7 @@ func TestAPMServerRecovery(t *testing.T) {
366366
apmClient.SetApmServerTransportState(context.Background(), apmproxy.Failing)
367367
require.Eventually(t, func() bool {
368368
return apmClient.Status != apmproxy.Failing
369-
}, 1*time.Second, 50*time.Millisecond)
369+
}, 5*time.Second, 50*time.Millisecond)
370370
assert.Equal(t, apmClient.Status, apmproxy.Pending)
371371

372372
assert.NoError(t, apmClient.PostToApmServer(context.Background(), agentData))
@@ -410,7 +410,7 @@ func TestAPMServerAuthFails(t *testing.T) {
410410
apmClient.SetApmServerTransportState(context.Background(), apmproxy.Failing)
411411
require.Eventually(t, func() bool {
412412
return apmClient.Status != apmproxy.Failing
413-
}, 1*time.Second, 50*time.Millisecond)
413+
}, 5*time.Second, 50*time.Millisecond)
414414
assert.Equal(t, apmClient.Status, apmproxy.Pending)
415415
assert.NoError(t, apmClient.PostToApmServer(context.Background(), agentData))
416416
assert.NotEqual(t, apmClient.Status, apmproxy.Healthy)
@@ -458,7 +458,7 @@ func TestContinuedAPMServerFailure(t *testing.T) {
458458
apmClient.SetApmServerTransportState(context.Background(), apmproxy.Failing)
459459
require.Eventually(t, func() bool {
460460
return apmClient.Status != apmproxy.Failing
461-
}, 1*time.Second, 50*time.Millisecond)
461+
}, 5*time.Second, 50*time.Millisecond)
462462
assert.Equal(t, apmClient.Status, apmproxy.Pending)
463463
assert.Error(t, apmClient.PostToApmServer(context.Background(), agentData))
464464
assert.Equal(t, apmClient.Status, apmproxy.Failing)

apm-lambda-extension/apmproxy/client.go

+3
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ package apmproxy
2020
import (
2121
"bytes"
2222
"errors"
23+
"math/rand"
2324
"net/http"
2425
"strings"
2526
"sync"
@@ -81,5 +82,7 @@ func NewClient(opts ...Option) (*Client, error) {
8182
c.serverURL = c.serverURL + "/"
8283
}
8384

85+
rand.Seed(time.Now().UnixNano())
86+
8487
return &c, nil
8588
}

0 commit comments

Comments
 (0)