Skip to content

Commit c7314be

Browse files
committed
clang: improve general performance with vectorization/unrolling
Clang has a tendency to *heavily* unroll loops all over the place (see llvm/llvm-project#42332). Disable loop unrolling wherever it goes too nuts, enable vectorization where it doesn't do so automatically, etc. Signed-off-by: Steven Noonan <steven@uplinklabs.net>
1 parent 5a6e062 commit c7314be

File tree

1 file changed

+54
-30
lines changed

1 file changed

+54
-30
lines changed

miniaudio.h

Lines changed: 54 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -42864,7 +42864,7 @@ MA_API void ma_copy_and_apply_volume_factor_s32(ma_int32* pSamplesOut, const ma_
4286442864
}
4286542865
}
4286642866

42867-
MA_API void ma_copy_and_apply_volume_factor_f32(float* pSamplesOut, const float* pSamplesIn, ma_uint64 sampleCount, float factor)
42867+
MA_API void ma_copy_and_apply_volume_factor_f32(float* MA_RESTRICT pSamplesOut, const float* MA_RESTRICT pSamplesIn, ma_uint64 sampleCount, float factor)
4286842868
{
4286942869
ma_uint64 iSample;
4287042870

@@ -43159,10 +43159,12 @@ MA_API ma_result ma_mix_pcm_frames_f32(float* pDst, const float* pSrc, ma_uint64
4315943159
sampleCount = frameCount * channels;
4316043160

4316143161
if (volume == 1) {
43162+
#pragma clang loop vectorize(enable)
4316243163
for (iSample = 0; iSample < sampleCount; iSample += 1) {
4316343164
pDst[iSample] += pSrc[iSample];
4316443165
}
4316543166
} else {
43167+
#pragma clang loop vectorize(enable)
4316643168
for (iSample = 0; iSample < sampleCount; iSample += 1) {
4316743169
pDst[iSample] += ma_apply_volume_unclipped_f32(pSrc[iSample], volume);
4316843170
}
@@ -45463,7 +45465,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_f32__direct_form_2_transposed(
4546345465
const float a1 = pBQ->a1.f32;
4546445466
const float a2 = pBQ->a2.f32;
4546545467

45466-
MA_ASSUME(channels > 0);
45468+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45469+
#pragma clang loop unroll(disable)
4546745470
for (c = 0; c < channels; c += 1) {
4546845471
float r1 = pBQ->pR1[c].f32;
4546945472
float r2 = pBQ->pR2[c].f32;
@@ -45495,7 +45498,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_s16__direct_form_2_transposed(
4549545498
const ma_int32 a1 = pBQ->a1.s32;
4549645499
const ma_int32 a2 = pBQ->a2.s32;
4549745500

45498-
MA_ASSUME(channels > 0);
45501+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45502+
#pragma clang loop unroll(disable)
4549945503
for (c = 0; c < channels; c += 1) {
4550045504
ma_int32 r1 = pBQ->pR1[c].s32;
4550145505
ma_int32 r2 = pBQ->pR2[c].s32;
@@ -45769,22 +45773,23 @@ MA_API ma_result ma_lpf1_clear_cache(ma_lpf1* pLPF)
4576945773
return MA_SUCCESS;
4577045774
}
4577145775

45772-
static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1* pLPF, float* pY, const float* pX)
45776+
static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1 *pLPF, float *pY, const float *pX)
4577345777
{
4577445778
ma_uint32 c;
4577545779
const ma_uint32 channels = pLPF->channels;
4577645780
const float a = pLPF->a.f32;
4577745781
const float b = 1 - a;
4577845782

45779-
MA_ASSUME(channels > 0);
45783+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45784+
#pragma clang loop unroll(disable)
4578045785
for (c = 0; c < channels; c += 1) {
4578145786
float r1 = pLPF->pR1[c].f32;
45782-
float x = pX[c];
45787+
float x = pX[c];
4578345788
float y;
4578445789

45785-
y = b*x + a*r1;
45790+
y = b * x + a * r1;
4578645791

45787-
pY[c] = y;
45792+
pY[c] = y;
4578845793
pLPF->pR1[c].f32 = y;
4578945794
}
4579045795
}
@@ -45796,7 +45801,8 @@ static MA_INLINE void ma_lpf1_process_pcm_frame_s16(ma_lpf1* pLPF, ma_int16* pY,
4579645801
const ma_int32 a = pLPF->a.s32;
4579745802
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
4579845803

45799-
MA_ASSUME(channels > 0);
45804+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45805+
#pragma clang loop unroll(disable)
4580045806
for (c = 0; c < channels; c += 1) {
4580145807
ma_int32 r1 = pLPF->pR1[c].s32;
4580245808
ma_int32 x = pX[c];
@@ -46649,7 +46655,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_f32(ma_hpf1* pHPF, float* pY, co
4664946655
const float a = 1 - pHPF->a.f32;
4665046656
const float b = 1 - a;
4665146657

46652-
MA_ASSUME(channels > 0);
46658+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
4665346659
for (c = 0; c < channels; c += 1) {
4665446660
float r1 = pHPF->pR1[c].f32;
4665546661
float x = pX[c];
@@ -46669,7 +46675,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_s16(ma_hpf1* pHPF, ma_int16* pY,
4666946675
const ma_int32 a = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - pHPF->a.s32);
4667046676
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
4667146677

46672-
MA_ASSUME(channels > 0);
46678+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
4667346679
for (c = 0; c < channels; c += 1) {
4667446680
ma_int32 r1 = pHPF->pR1[c].s32;
4667546681
ma_int32 x = pX[c];
@@ -48777,6 +48783,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4877748783
ma_uint64 iFrame;
4877848784
ma_uint32 iChannel;
4877948785
ma_uint64 interpolatedFrameCount;
48786+
const ma_uint32 channels = pGainer->config.channels;
4878048787

4878148788
MA_ASSERT(pGainer != NULL);
4878248789

@@ -48816,12 +48823,12 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4881648823
float a = (float)pGainer->t / pGainer->config.smoothTimeInFrames;
4881748824
float d = 1.0f / pGainer->config.smoothTimeInFrames;
4881848825

48819-
if (pGainer->config.channels <= 32) {
48826+
if (channels <= 32) {
4882048827
float pRunningGain[32];
4882148828
float pRunningGainDelta[32]; /* Could this be heap-allocated as part of the ma_gainer object? */
4882248829

4882348830
/* Initialize the running gain. */
48824-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48831+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
4882548832
float t = (pGainer->pNewGains[iChannel] - pGainer->pOldGains[iChannel]) * pGainer->masterVolume;
4882648833
pRunningGainDelta[iChannel] = t * d;
4882748834
pRunningGain[iChannel] = (pGainer->pOldGains[iChannel] * pGainer->masterVolume) + (t * a);
@@ -48830,7 +48837,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4883048837
iFrame = 0;
4883148838

4883248839
/* Optimized paths for common channel counts. This is mostly just experimenting with some SIMD ideas. It's not necessarily final. */
48833-
if (pGainer->config.channels == 2) {
48840+
if (channels == 2) {
4883448841
#if defined(MA_SUPPORT_SSE2)
4883548842
if (ma_has_sse2()) {
4883648843
ma_uint64 unrolledLoopCount = interpolatedFrameCount >> 1;
@@ -48878,6 +48885,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4887848885

4887948886
iFrame = unrolledLoopCount << 1;
4888048887
#else
48888+
#pragma clang loop vectorize(enable)
4888148889
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
4888248890
for (iChannel = 0; iChannel < 2; iChannel += 1) {
4888348891
pFramesOutF32[iFrame*2 + iChannel] = pFramesInF32[iFrame*2 + iChannel] * pRunningGain[iChannel];
@@ -48889,7 +48897,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4888948897
}
4889048898
#endif
4889148899
}
48892-
} else if (pGainer->config.channels == 6) {
48900+
} else if (channels == 6) {
4889348901
#if defined(MA_SUPPORT_SSE2)
4889448902
if (ma_has_sse2()) {
4889548903
/*
@@ -48922,6 +48930,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4892248930
} else
4892348931
#endif
4892448932
{
48933+
#pragma clang loop vectorize(enable)
4892548934
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
4892648935
for (iChannel = 0; iChannel < 6; iChannel += 1) {
4892748936
pFramesOutF32[iFrame*6 + iChannel] = pFramesInF32[iFrame*6 + iChannel] * pRunningGain[iChannel];
@@ -48933,7 +48942,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4893348942
}
4893448943
}
4893548944
}
48936-
} else if (pGainer->config.channels == 8) {
48945+
} else if (channels == 8) {
4893748946
/* For 8 channels we can just go over frame by frame and do all eight channels as 2 separate 4x SIMD operations. */
4893848947
#if defined(MA_SUPPORT_SSE2)
4893948948
if (ma_has_sse2()) {
@@ -48953,6 +48962,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4895348962
#endif
4895448963
{
4895548964
/* This is crafted so that it auto-vectorizes when compiled with Clang. */
48965+
#pragma clang loop vectorize(enable)
4895648966
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
4895748967
for (iChannel = 0; iChannel < 8; iChannel += 1) {
4895848968
pFramesOutF32[iFrame*8 + iChannel] = pFramesInF32[iFrame*8 + iChannel] * pRunningGain[iChannel];
@@ -48966,17 +48976,21 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4896648976
}
4896748977
}
4896848978

48979+
#pragma clang loop unroll(disable)
4896948980
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48970-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48971-
pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * pRunningGain[iChannel];
48981+
#pragma clang loop vectorize(enable)
48982+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
48983+
pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * pRunningGain[iChannel];
4897248984
pRunningGain[iChannel] += pRunningGainDelta[iChannel];
4897348985
}
4897448986
}
4897548987
} else {
4897648988
/* Slower path for extreme channel counts where we can't fit enough on the stack. We could also move this to the heap as part of the ma_gainer object which might even be better since it'll only be updated when the gains actually change. */
48989+
#pragma clang loop unroll(disable)
4897748990
for (iFrame = 0; iFrame < interpolatedFrameCount; iFrame += 1) {
48978-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48979-
pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
48991+
#pragma clang loop vectorize(enable)
48992+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
48993+
pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
4898048994
}
4898148995

4898248996
a += d;
@@ -48995,18 +49009,21 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4899549009

4899649010
/* All we need to do here is apply the new gains using an optimized path. */
4899749011
if (pFramesOut != NULL && pFramesIn != NULL) {
48998-
if (pGainer->config.channels <= 32) {
49012+
if (channels <= 32) {
4899949013
float gains[32];
49000-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
49014+
#pragma clang loop unroll(disable)
49015+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
4900149016
gains[iChannel] = pGainer->pNewGains[iChannel] * pGainer->masterVolume;
4900249017
}
4900349018

49004-
ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, pGainer->config.channels, gains);
49019+
ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, channels, gains);
4900549020
} else {
4900649021
/* Slow path. Too many channels to fit on the stack. Need to apply a master volume as a separate path. */
49022+
#pragma clang loop unroll(disable)
4900749023
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
49008-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
49009-
((float*)pFramesOut)[iFrame*pGainer->config.channels + iChannel] = ((const float*)pFramesIn)[iFrame*pGainer->config.channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
49024+
#pragma clang loop vectorize(enable)
49025+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
49026+
((float*)pFramesOut)[iFrame*channels + iChannel] = ((const float*)pFramesIn)[iFrame*channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
4901049027
}
4901149028
}
4901249029
}
@@ -51376,7 +51393,7 @@ static void ma_linear_resampler_interpolate_frame_s16(ma_linear_resampler* pResa
5137651393

5137751394
a = (pResampler->inTimeFrac << shift) / pResampler->config.sampleRateOut;
5137851395

51379-
MA_ASSUME(channels > 0);
51396+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
5138051397
for (c = 0; c < channels; c += 1) {
5138151398
ma_int16 s = ma_linear_resampler_mix_s16(pResampler->x0.s16[c], pResampler->x1.s16[c], a, shift);
5138251399
pFrameOut[c] = s;
@@ -51395,7 +51412,7 @@ static void ma_linear_resampler_interpolate_frame_f32(ma_linear_resampler* pResa
5139551412

5139651413
a = (float)pResampler->inTimeFrac / pResampler->config.sampleRateOut;
5139751414

51398-
MA_ASSUME(channels > 0);
51415+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
5139951416
for (c = 0; c < channels; c += 1) {
5140051417
float s = ma_mix_f32_fast(pResampler->x0.f32[c], pResampler->x1.f32[c], a);
5140151418
pFrameOut[c] = s;
@@ -52630,6 +52647,7 @@ static void ma_channel_map_apply_shuffle_table_u8(ma_uint8* pFramesOut, ma_uint3
5263052647
ma_uint64 iFrame;
5263152648
ma_uint32 iChannelOut;
5263252649

52650+
#pragma clang loop unroll(disable)
5263352651
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
5263452652
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
5263552653
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -52650,6 +52668,7 @@ static void ma_channel_map_apply_shuffle_table_s16(ma_int16* pFramesOut, ma_uint
5265052668
ma_uint64 iFrame;
5265152669
ma_uint32 iChannelOut;
5265252670

52671+
#pragma clang loop unroll(disable)
5265352672
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
5265452673
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
5265552674
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -52692,6 +52711,7 @@ static void ma_channel_map_apply_shuffle_table_s32(ma_int32* pFramesOut, ma_uint
5269252711
ma_uint64 iFrame;
5269352712
ma_uint32 iChannelOut;
5269452713

52714+
#pragma clang loop unroll(disable)
5269552715
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
5269652716
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
5269752717
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -52712,6 +52732,7 @@ static void ma_channel_map_apply_shuffle_table_f32(float* pFramesOut, ma_uint32
5271252732
ma_uint64 iFrame;
5271352733
ma_uint32 iChannelOut;
5271452734

52735+
#pragma clang loop unroll(disable)
5271552736
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
5271652737
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
5271752738
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -52946,6 +52967,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
5294652967
} else
5294752968
#endif
5294852969
{
52970+
#pragma clang loop vectorize(enable)
5294952971
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
5295052972
for (iChannelOut = 0; iChannelOut < 2; iChannelOut += 1) {
5295152973
pFramesOut[iFrame*2 + iChannelOut] = pFramesIn[iFrame];
@@ -52973,6 +52995,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
5297352995
} else
5297452996
#endif
5297552997
{
52998+
#pragma clang loop vectorize(enable)
5297652999
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
5297753000
for (iChannelOut = 0; iChannelOut < 6; iChannelOut += 1) {
5297853001
pFramesOut[iFrame*6 + iChannelOut] = pFramesIn[iFrame];
@@ -52990,6 +53013,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
5299053013
} else
5299153014
#endif
5299253015
{
53016+
#pragma clang loop vectorize(enable)
5299353017
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
5299453018
for (iChannelOut = 0; iChannelOut < 8; iChannelOut += 1) {
5299553019
pFramesOut[iFrame*8 + iChannelOut] = pFramesIn[iFrame];
@@ -66313,7 +66337,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__white(ma_noise* pNoise, voi
6631366337
ma_uint64 iFrame;
6631466338
ma_uint32 iChannel;
6631566339
const ma_uint32 channels = pNoise->config.channels;
66316-
MA_ASSUME(channels > 0);
66340+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
6631766341

6631866342
if (pNoise->config.format == ma_format_f32) {
6631966343
float* pFramesOutF32 = (float*)pFramesOut;
@@ -66432,7 +66456,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__pink(ma_noise* pNoise, void
6643266456
ma_uint64 iFrame;
6643366457
ma_uint32 iChannel;
6643466458
const ma_uint32 channels = pNoise->config.channels;
66435-
MA_ASSUME(channels > 0);
66459+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
6643666460

6643766461
if (pNoise->config.format == ma_format_f32) {
6643866462
float* pFramesOutF32 = (float*)pFramesOut;
@@ -66514,7 +66538,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__brownian(ma_noise* pNoise,
6651466538
ma_uint64 iFrame;
6651566539
ma_uint32 iChannel;
6651666540
const ma_uint32 channels = pNoise->config.channels;
66517-
MA_ASSUME(channels > 0);
66541+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
6651866542

6651966543
if (pNoise->config.format == ma_format_f32) {
6652066544
float* pFramesOutF32 = (float*)pFramesOut;

0 commit comments

Comments
 (0)