Skip to content

Commit 63d7186

Browse files
authored
fix: randomise the initial grace period to avoid collisions (#240)
* fix: randomise the initial grace period to avoid collisions. The previous algorithm used binary exponential backoff with ±10% jitter to calculate the grace period. Because multiple Lambda environments can be running at the same time, we need to mitigate collisions: we cannot use 0 as the first delay, because functions that fail close together in time would all retry at the same moment. The small jitter applied to the lower delays would then carry the collision forward into subsequent attempts. This change adds an initial delay of n seconds to the first reconnection attempt, where n is drawn at random from a bounded interval chosen to spread out retries while keeping usability and user experience in mind. * changelog: add changelog entry
1 parent 9b67c62 commit 63d7186

File tree

4 files changed

+18
-7
lines changed

4 files changed

+18
-7
lines changed

apm-lambda-extension/CHANGELOG.asciidoc

+2-1
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ https://github.com/elastic/apm-aws-lambda/compare/v1.0.2...main[View commits]
3232
- Handle main loop errors correctly {pull}252[252]
3333
- Avoid sending corrupted compressed data to APM Server {pull}257[257]
3434
- Avoid creating http transports on each info request {pull}260[260]
35+
- Randomise the initial grace period to avoid collisions {pull}240[240]
3536
3637
3738
[[release-notes-1.0.2]]
@@ -59,4 +60,4 @@ https://github.com/elastic/apm-aws-lambda/compare/v1.0.0...v1.0.1[View commits]
5960
6061
https://github.com/elastic/apm-aws-lambda/commits/46e65781912ca0448642e1574c1f8162ffa8dec0[View commits]
6162
62-
First stable release of the Elastic AWS Lambda Extension.
63+
First stable release of the Elastic AWS Lambda Extension.

apm-lambda-extension/apmproxy/apmserver.go

+7
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,13 @@ func (c *Client) SetApmServerTransportState(ctx context.Context, status Status)
197197

198198
// ComputeGracePeriod https://github.com/elastic/apm/blob/main/specs/agents/transport.md#transport-errors
199199
func (c *Client) ComputeGracePeriod() time.Duration {
200+
// If ReconnectionCount is 0, return a random grace period in the interval [0s, 5s).
201+
// The grace period for the first reconnection count was 0 but that
202+
// leads to collisions with multiple environments.
203+
if c.ReconnectionCount == 0 {
204+
gracePeriod := rand.Float64() * 5
205+
return time.Duration(gracePeriod * float64(time.Second))
206+
}
200207
gracePeriodWithoutJitter := math.Pow(math.Min(float64(c.ReconnectionCount), 6), 2)
201208
jitter := rand.Float64()/5 - 0.1
202209
return time.Duration((gracePeriodWithoutJitter + jitter*gracePeriodWithoutJitter) * float64(time.Second))

apm-lambda-extension/apmproxy/apmserver_test.go

+6-6
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ func TestGracePeriod(t *testing.T) {
129129

130130
apmClient.ReconnectionCount = 0
131131
val0 := apmClient.ComputeGracePeriod().Seconds()
132-
assert.Equal(t, val0, float64(0))
132+
assert.LessOrEqual(t, val0, 5.0)
133133

134134
apmClient.ReconnectionCount = 1
135135
val1 := apmClient.ComputeGracePeriod().Seconds()
@@ -192,7 +192,7 @@ func TestSetPendingTransport(t *testing.T) {
192192
apmClient.SetApmServerTransportState(context.Background(), apmproxy.Failing)
193193
require.Eventually(t, func() bool {
194194
return apmClient.Status != apmproxy.Failing
195-
}, 1*time.Second, 50*time.Millisecond)
195+
}, 5*time.Second, 50*time.Millisecond)
196196
assert.True(t, apmClient.Status == apmproxy.Pending)
197197
assert.Equal(t, apmClient.ReconnectionCount, 0)
198198
}
@@ -313,7 +313,7 @@ func TestEnterBackoffFromFailing(t *testing.T) {
313313
apmClient.SetApmServerTransportState(context.Background(), apmproxy.Failing)
314314
require.Eventually(t, func() bool {
315315
return apmClient.Status != apmproxy.Failing
316-
}, 1*time.Second, 50*time.Millisecond)
316+
}, 5*time.Second, 50*time.Millisecond)
317317
assert.Equal(t, apmClient.Status, apmproxy.Pending)
318318

319319
assert.Error(t, apmClient.PostToApmServer(context.Background(), agentData))
@@ -366,7 +366,7 @@ func TestAPMServerRecovery(t *testing.T) {
366366
apmClient.SetApmServerTransportState(context.Background(), apmproxy.Failing)
367367
require.Eventually(t, func() bool {
368368
return apmClient.Status != apmproxy.Failing
369-
}, 1*time.Second, 50*time.Millisecond)
369+
}, 5*time.Second, 50*time.Millisecond)
370370
assert.Equal(t, apmClient.Status, apmproxy.Pending)
371371

372372
assert.NoError(t, apmClient.PostToApmServer(context.Background(), agentData))
@@ -410,7 +410,7 @@ func TestAPMServerAuthFails(t *testing.T) {
410410
apmClient.SetApmServerTransportState(context.Background(), apmproxy.Failing)
411411
require.Eventually(t, func() bool {
412412
return apmClient.Status != apmproxy.Failing
413-
}, 1*time.Second, 50*time.Millisecond)
413+
}, 5*time.Second, 50*time.Millisecond)
414414
assert.Equal(t, apmClient.Status, apmproxy.Pending)
415415
assert.NoError(t, apmClient.PostToApmServer(context.Background(), agentData))
416416
assert.NotEqual(t, apmClient.Status, apmproxy.Healthy)
@@ -458,7 +458,7 @@ func TestContinuedAPMServerFailure(t *testing.T) {
458458
apmClient.SetApmServerTransportState(context.Background(), apmproxy.Failing)
459459
require.Eventually(t, func() bool {
460460
return apmClient.Status != apmproxy.Failing
461-
}, 1*time.Second, 50*time.Millisecond)
461+
}, 5*time.Second, 50*time.Millisecond)
462462
assert.Equal(t, apmClient.Status, apmproxy.Pending)
463463
assert.Error(t, apmClient.PostToApmServer(context.Background(), agentData))
464464
assert.Equal(t, apmClient.Status, apmproxy.Failing)

apm-lambda-extension/apmproxy/client.go

+3
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ package apmproxy
2020
import (
2121
"bytes"
2222
"errors"
23+
"math/rand"
2324
"net/http"
2425
"strings"
2526
"sync"
@@ -81,5 +82,7 @@ func NewClient(opts ...Option) (*Client, error) {
8182
c.serverURL = c.serverURL + "/"
8283
}
8384

85+
rand.Seed(time.Now().UnixNano())
86+
8487
return &c, nil
8588
}

0 commit comments

Comments
 (0)