diff --git a/apm-lambda-extension/CHANGELOG.asciidoc b/apm-lambda-extension/CHANGELOG.asciidoc index 75cd9277..6b802f80 100644 --- a/apm-lambda-extension/CHANGELOG.asciidoc +++ b/apm-lambda-extension/CHANGELOG.asciidoc @@ -32,6 +32,7 @@ https://github.com/elastic/apm-aws-lambda/compare/v1.0.2...main[View commits] - Handle main loop errors correctly {pull}252[252] - Avoid sending corrupted compressed data to APM Server {pull}257[257] - Avoid creating http transports on each info request {pull}260[260] +- Randomise the initial grace period to avoid collisions {pull}240[240] [[release-notes-1.0.2]] @@ -59,4 +60,4 @@ https://github.com/elastic/apm-aws-lambda/compare/v1.0.0...v1.0.1[View commits] https://github.com/elastic/apm-aws-lambda/commits/46e65781912ca0448642e1574c1f8162ffa8dec0[View commits] -First stable release of the Elastic AWS Lambda Extension. \ No newline at end of file +First stable release of the Elastic AWS Lambda Extension. diff --git a/apm-lambda-extension/apmproxy/apmserver.go b/apm-lambda-extension/apmproxy/apmserver.go index 6b394a05..946877bd 100644 --- a/apm-lambda-extension/apmproxy/apmserver.go +++ b/apm-lambda-extension/apmproxy/apmserver.go @@ -197,6 +197,13 @@ func (c *Client) SetApmServerTransportState(ctx context.Context, status Status) // ComputeGracePeriod https://github.com/elastic/apm/blob/main/specs/agents/transport.md#transport-errors func (c *Client) ComputeGracePeriod() time.Duration { + // If reconnectionCount is 0, returns a random number in an interval. + // The grace period for the first reconnection count was 0 but that + // leads to collisions with multiple environments. + if c.ReconnectionCount == 0 { + gracePeriod := rand.Float64() * 5 + return time.Duration(gracePeriod * float64(time.Second)) + } gracePeriodWithoutJitter := math.Pow(math.Min(float64(c.ReconnectionCount), 6), 2) jitter := rand.Float64()/5 - 0.1 return time.Duration((gracePeriodWithoutJitter + jitter*gracePeriodWithoutJitter) * float64(time.Second)) diff --git a/apm-lambda-extension/apmproxy/apmserver_test.go b/apm-lambda-extension/apmproxy/apmserver_test.go index 727c1c0e..32594d51 100644 --- a/apm-lambda-extension/apmproxy/apmserver_test.go +++ b/apm-lambda-extension/apmproxy/apmserver_test.go @@ -129,7 +129,7 @@ func TestGracePeriod(t *testing.T) { apmClient.ReconnectionCount = 0 val0 := apmClient.ComputeGracePeriod().Seconds() - assert.Equal(t, val0, float64(0)) + assert.LessOrEqual(t, val0, 5.0) apmClient.ReconnectionCount = 1 val1 := apmClient.ComputeGracePeriod().Seconds() @@ -192,7 +192,7 @@ func TestSetPendingTransport(t *testing.T) { apmClient.SetApmServerTransportState(context.Background(), apmproxy.Failing) require.Eventually(t, func() bool { return apmClient.Status != apmproxy.Failing - }, 1*time.Second, 50*time.Millisecond) + }, 5*time.Second, 50*time.Millisecond) assert.True(t, apmClient.Status == apmproxy.Pending) assert.Equal(t, apmClient.ReconnectionCount, 0) } @@ -313,7 +313,7 @@ func TestEnterBackoffFromFailing(t *testing.T) { apmClient.SetApmServerTransportState(context.Background(), apmproxy.Failing) require.Eventually(t, func() bool { return apmClient.Status != apmproxy.Failing - }, 1*time.Second, 50*time.Millisecond) + }, 5*time.Second, 50*time.Millisecond) assert.Equal(t, apmClient.Status, apmproxy.Pending) assert.Error(t, apmClient.PostToApmServer(context.Background(), agentData)) @@ -366,7 +366,7 @@ func TestAPMServerRecovery(t *testing.T) { apmClient.SetApmServerTransportState(context.Background(), apmproxy.Failing) require.Eventually(t, func() bool { return apmClient.Status != apmproxy.Failing - }, 1*time.Second, 50*time.Millisecond) + }, 5*time.Second, 50*time.Millisecond) assert.Equal(t, apmClient.Status, apmproxy.Pending) assert.NoError(t, apmClient.PostToApmServer(context.Background(), agentData)) @@ -410,7 +410,7 @@ func TestAPMServerAuthFails(t *testing.T) { apmClient.SetApmServerTransportState(context.Background(), apmproxy.Failing) require.Eventually(t, func() bool { return apmClient.Status != apmproxy.Failing - }, 1*time.Second, 50*time.Millisecond) + }, 5*time.Second, 50*time.Millisecond) assert.Equal(t, apmClient.Status, apmproxy.Pending) assert.NoError(t, apmClient.PostToApmServer(context.Background(), agentData)) assert.NotEqual(t, apmClient.Status, apmproxy.Healthy) @@ -458,7 +458,7 @@ func TestContinuedAPMServerFailure(t *testing.T) { apmClient.SetApmServerTransportState(context.Background(), apmproxy.Failing) require.Eventually(t, func() bool { return apmClient.Status != apmproxy.Failing - }, 1*time.Second, 50*time.Millisecond) + }, 5*time.Second, 50*time.Millisecond) assert.Equal(t, apmClient.Status, apmproxy.Pending) assert.Error(t, apmClient.PostToApmServer(context.Background(), agentData)) assert.Equal(t, apmClient.Status, apmproxy.Failing) diff --git a/apm-lambda-extension/apmproxy/client.go b/apm-lambda-extension/apmproxy/client.go index d9309354..d6830e31 100644 --- a/apm-lambda-extension/apmproxy/client.go +++ b/apm-lambda-extension/apmproxy/client.go @@ -20,6 +20,7 @@ package apmproxy import ( "bytes" "errors" + "math/rand" "net/http" "strings" "sync" @@ -81,5 +82,7 @@ func NewClient(opts ...Option) (*Client, error) { c.serverURL = c.serverURL + "/" } + rand.Seed(time.Now().UnixNano()) + return &c, nil }