From 33b247151f9117fa2898a5a45318489592ada337 Mon Sep 17 00:00:00 2001 From: kruskal <99559985+kruskall@users.noreply.github.com> Date: Sun, 17 Jul 2022 20:01:52 +0200 Subject: [PATCH 1/2] fix: randomise the initial grace period to avoid collisions The previous algorithm was using binary exponential-backoff with a +- 10% jitter to calculate the grace period. Because there can be multiple lambda environments we need to mitigate collisions: We cannot use 0 as the first delay because functions failing closer to each other will collide. The issue would then be propagated by the small jitter for lower delays. This change adds an initial delay of n seconds to the first reconnection attempt. n is randomly generated in a closed interval to account for collisions while keeping in mind usability and user experience. --- apm-lambda-extension/apmproxy/apmserver.go | 7 +++++++ apm-lambda-extension/apmproxy/apmserver_test.go | 12 ++++++------ apm-lambda-extension/apmproxy/client.go | 3 +++ 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/apm-lambda-extension/apmproxy/apmserver.go b/apm-lambda-extension/apmproxy/apmserver.go index 6b394a05..946877bd 100644 --- a/apm-lambda-extension/apmproxy/apmserver.go +++ b/apm-lambda-extension/apmproxy/apmserver.go @@ -197,6 +197,13 @@ func (c *Client) SetApmServerTransportState(ctx context.Context, status Status) // ComputeGracePeriod https://github.com/elastic/apm/blob/main/specs/agents/transport.md#transport-errors func (c *Client) ComputeGracePeriod() time.Duration { + // If reconnectionCount is 0, returns a random number in an interval. + // The grace period for the first reconnection count was 0 but that + // leads to collisions with multiple environments. + if c.ReconnectionCount == 0 { + gracePeriod := rand.Float64() * 5 + return time.Duration(gracePeriod * float64(time.Second)) + } gracePeriodWithoutJitter := math.Pow(math.Min(float64(c.ReconnectionCount), 6), 2) jitter := rand.Float64()/5 - 0.1 return time.Duration((gracePeriodWithoutJitter + jitter*gracePeriodWithoutJitter) * float64(time.Second)) diff --git a/apm-lambda-extension/apmproxy/apmserver_test.go b/apm-lambda-extension/apmproxy/apmserver_test.go index 727c1c0e..32594d51 100644 --- a/apm-lambda-extension/apmproxy/apmserver_test.go +++ b/apm-lambda-extension/apmproxy/apmserver_test.go @@ -129,7 +129,7 @@ func TestGracePeriod(t *testing.T) { apmClient.ReconnectionCount = 0 val0 := apmClient.ComputeGracePeriod().Seconds() - assert.Equal(t, val0, float64(0)) + assert.LessOrEqual(t, val0, 5.0) apmClient.ReconnectionCount = 1 val1 := apmClient.ComputeGracePeriod().Seconds() @@ -192,7 +192,7 @@ func TestSetPendingTransport(t *testing.T) { apmClient.SetApmServerTransportState(context.Background(), apmproxy.Failing) require.Eventually(t, func() bool { return apmClient.Status != apmproxy.Failing - }, 1*time.Second, 50*time.Millisecond) + }, 5*time.Second, 50*time.Millisecond) assert.True(t, apmClient.Status == apmproxy.Pending) assert.Equal(t, apmClient.ReconnectionCount, 0) } @@ -313,7 +313,7 @@ func TestEnterBackoffFromFailing(t *testing.T) { apmClient.SetApmServerTransportState(context.Background(), apmproxy.Failing) require.Eventually(t, func() bool { return apmClient.Status != apmproxy.Failing - }, 1*time.Second, 50*time.Millisecond) + }, 5*time.Second, 50*time.Millisecond) assert.Equal(t, apmClient.Status, apmproxy.Pending) assert.Error(t, apmClient.PostToApmServer(context.Background(), agentData)) @@ -366,7 +366,7 @@ func TestAPMServerRecovery(t *testing.T) { apmClient.SetApmServerTransportState(context.Background(), apmproxy.Failing) require.Eventually(t, func() bool { return apmClient.Status != apmproxy.Failing - }, 1*time.Second, 50*time.Millisecond) + }, 5*time.Second, 50*time.Millisecond) assert.Equal(t, apmClient.Status, apmproxy.Pending) assert.NoError(t, apmClient.PostToApmServer(context.Background(), agentData)) @@ -410,7 +410,7 @@ func TestAPMServerAuthFails(t *testing.T) { apmClient.SetApmServerTransportState(context.Background(), apmproxy.Failing) require.Eventually(t, func() bool { return apmClient.Status != apmproxy.Failing - }, 1*time.Second, 50*time.Millisecond) + }, 5*time.Second, 50*time.Millisecond) assert.Equal(t, apmClient.Status, apmproxy.Pending) assert.NoError(t, apmClient.PostToApmServer(context.Background(), agentData)) assert.NotEqual(t, apmClient.Status, apmproxy.Healthy) @@ -458,7 +458,7 @@ func TestContinuedAPMServerFailure(t *testing.T) { apmClient.SetApmServerTransportState(context.Background(), apmproxy.Failing) require.Eventually(t, func() bool { return apmClient.Status != apmproxy.Failing - }, 1*time.Second, 50*time.Millisecond) + }, 5*time.Second, 50*time.Millisecond) assert.Equal(t, apmClient.Status, apmproxy.Pending) assert.Error(t, apmClient.PostToApmServer(context.Background(), agentData)) assert.Equal(t, apmClient.Status, apmproxy.Failing) diff --git a/apm-lambda-extension/apmproxy/client.go b/apm-lambda-extension/apmproxy/client.go index d9309354..d6830e31 100644 --- a/apm-lambda-extension/apmproxy/client.go +++ b/apm-lambda-extension/apmproxy/client.go @@ -20,6 +20,7 @@ package apmproxy import ( "bytes" "errors" + "math/rand" "net/http" "strings" "sync" @@ -81,5 +82,7 @@ func NewClient(opts ...Option) (*Client, error) { c.serverURL = c.serverURL + "/" } + rand.Seed(time.Now().UnixNano()) + return &c, nil } From a34a8eeeaa11044ab537087031059d78e892d38e Mon Sep 17 00:00:00 2001 From: kruskal <99559985+kruskall@users.noreply.github.com> Date: Wed, 3 Aug 2022 21:38:51 +0200 Subject: [PATCH 2/2] changelog: add changelog entry --- apm-lambda-extension/CHANGELOG.asciidoc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/apm-lambda-extension/CHANGELOG.asciidoc b/apm-lambda-extension/CHANGELOG.asciidoc index 75cd9277..6b802f80 100644 --- a/apm-lambda-extension/CHANGELOG.asciidoc +++ b/apm-lambda-extension/CHANGELOG.asciidoc @@ -32,6 +32,7 @@ https://github.com/elastic/apm-aws-lambda/compare/v1.0.2...main[View commits] - Handle main loop errors correctly {pull}252[252] - Avoid sending corrupted compressed data to APM Server {pull}257[257] - Avoid creating http transports on each info request {pull}260[260] +- Randomise the initial grace period to avoid collisions {pull}240[240] [[release-notes-1.0.2]] @@ -59,4 +60,4 @@ https://github.com/elastic/apm-aws-lambda/compare/v1.0.0...v1.0.1[View commits] https://github.com/elastic/apm-aws-lambda/commits/46e65781912ca0448642e1574c1f8162ffa8dec0[View commits] -First stable release of the Elastic AWS Lambda Extension. \ No newline at end of file +First stable release of the Elastic AWS Lambda Extension.