Skip to content

Commit fba3504

Browse files
committed
clang: improve general performance with vectorization/unrolling
Clang has a tendency to *heavily* unroll loops all over the place (see llvm/llvm-project#42332). Disable loop unrolling wherever it goes too nuts, enable vectorization where Clang doesn't do so automatically, etc. Signed-off-by: Steven Noonan <steven@uplinklabs.net>
1 parent 04fb5e9 commit fba3504

File tree

1 file changed

+55
-32
lines changed

1 file changed

+55
-32
lines changed

miniaudio.h

Lines changed: 55 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -42835,7 +42835,7 @@ MA_API void ma_copy_and_apply_volume_factor_s32(ma_int32* pSamplesOut, const ma_
4283542835
}
4283642836
}
4283742837

42838-
MA_API void ma_copy_and_apply_volume_factor_f32(float* pSamplesOut, const float* pSamplesIn, ma_uint64 sampleCount, float factor)
42838+
MA_API void ma_copy_and_apply_volume_factor_f32(float* MA_RESTRICT pSamplesOut, const float* MA_RESTRICT pSamplesIn, ma_uint64 sampleCount, float factor)
4283942839
{
4284042840
ma_uint64 iSample;
4284142841

@@ -43130,10 +43130,12 @@ MA_API ma_result ma_mix_pcm_frames_f32(float* pDst, const float* pSrc, ma_uint64
4313043130
sampleCount = frameCount * channels;
4313143131

4313243132
if (volume == 1) {
43133+
#pragma clang loop vectorize(enable)
4313343134
for (iSample = 0; iSample < sampleCount; iSample += 1) {
4313443135
pDst[iSample] += pSrc[iSample];
4313543136
}
4313643137
} else {
43138+
#pragma clang loop vectorize(enable)
4313743139
for (iSample = 0; iSample < sampleCount; iSample += 1) {
4313843140
pDst[iSample] += ma_apply_volume_unclipped_f32(pSrc[iSample], volume);
4313943141
}
@@ -45434,7 +45436,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_f32__direct_form_2_transposed(
4543445436
const float a1 = pBQ->a1.f32;
4543545437
const float a2 = pBQ->a2.f32;
4543645438

45437-
MA_ASSUME(channels > 0);
45439+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45440+
#pragma clang loop vectorize(assume_safety)
4543845441
for (c = 0; c < channels; c += 1) {
4543945442
float r1 = pBQ->pR1[c].f32;
4544045443
float r2 = pBQ->pR2[c].f32;
@@ -45466,7 +45469,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_s16__direct_form_2_transposed(
4546645469
const ma_int32 a1 = pBQ->a1.s32;
4546745470
const ma_int32 a2 = pBQ->a2.s32;
4546845471

45469-
MA_ASSUME(channels > 0);
45472+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45473+
#pragma clang loop vectorize(assume_safety)
4547045474
for (c = 0; c < channels; c += 1) {
4547145475
ma_int32 r1 = pBQ->pR1[c].s32;
4547245476
ma_int32 r2 = pBQ->pR2[c].s32;
@@ -45740,22 +45744,23 @@ MA_API ma_result ma_lpf1_clear_cache(ma_lpf1* pLPF)
4574045744
return MA_SUCCESS;
4574145745
}
4574245746

45743-
static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1* pLPF, float* pY, const float* pX)
45747+
static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1 *pLPF, float *pY, const float *pX)
4574445748
{
4574545749
ma_uint32 c;
4574645750
const ma_uint32 channels = pLPF->channels;
4574745751
const float a = pLPF->a.f32;
4574845752
const float b = 1 - a;
4574945753

45750-
MA_ASSUME(channels > 0);
45754+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45755+
#pragma clang loop vectorize(assume_safety)
4575145756
for (c = 0; c < channels; c += 1) {
4575245757
float r1 = pLPF->pR1[c].f32;
45753-
float x = pX[c];
45758+
float x = pX[c];
4575445759
float y;
4575545760

45756-
y = b*x + a*r1;
45761+
y = b * x + a * r1;
4575745762

45758-
pY[c] = y;
45763+
pY[c] = y;
4575945764
pLPF->pR1[c].f32 = y;
4576045765
}
4576145766
}
@@ -45767,7 +45772,8 @@ static MA_INLINE void ma_lpf1_process_pcm_frame_s16(ma_lpf1* pLPF, ma_int16* pY,
4576745772
const ma_int32 a = pLPF->a.s32;
4576845773
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
4576945774

45770-
MA_ASSUME(channels > 0);
45775+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45776+
#pragma clang loop vectorize(assume_safety)
4577145777
for (c = 0; c < channels; c += 1) {
4577245778
ma_int32 r1 = pLPF->pR1[c].s32;
4577345779
ma_int32 x = pX[c];
@@ -46620,7 +46626,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_f32(ma_hpf1* pHPF, float* pY, co
4662046626
const float a = 1 - pHPF->a.f32;
4662146627
const float b = 1 - a;
4662246628

46623-
MA_ASSUME(channels > 0);
46629+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
4662446630
for (c = 0; c < channels; c += 1) {
4662546631
float r1 = pHPF->pR1[c].f32;
4662646632
float x = pX[c];
@@ -46640,7 +46646,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_s16(ma_hpf1* pHPF, ma_int16* pY,
4664046646
const ma_int32 a = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - pHPF->a.s32);
4664146647
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
4664246648

46643-
MA_ASSUME(channels > 0);
46649+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
4664446650
for (c = 0; c < channels; c += 1) {
4664546651
ma_int32 r1 = pHPF->pR1[c].s32;
4664646652
ma_int32 x = pX[c];
@@ -48748,6 +48754,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4874848754
ma_uint64 iFrame;
4874948755
ma_uint32 iChannel;
4875048756
ma_uint64 interpolatedFrameCount;
48757+
const ma_uint32 channels = pGainer->config.channels;
4875148758

4875248759
MA_ASSERT(pGainer != NULL);
4875348760

@@ -48787,12 +48794,12 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4878748794
float a = (float)pGainer->t / pGainer->config.smoothTimeInFrames;
4878848795
float d = 1.0f / pGainer->config.smoothTimeInFrames;
4878948796

48790-
if (pGainer->config.channels <= 32) {
48797+
if (channels <= 32) {
4879148798
float pRunningGain[32];
4879248799
float pRunningGainDelta[32]; /* Could this be heap-allocated as part of the ma_gainer object? */
4879348800

4879448801
/* Initialize the running gain. */
48795-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48802+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
4879648803
float t = (pGainer->pNewGains[iChannel] - pGainer->pOldGains[iChannel]) * pGainer->masterVolume;
4879748804
pRunningGainDelta[iChannel] = t * d;
4879848805
pRunningGain[iChannel] = (pGainer->pOldGains[iChannel] * pGainer->masterVolume) + (t * a);
@@ -48801,7 +48808,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4880148808
iFrame = 0;
4880248809

4880348810
/* Optimized paths for common channel counts. This is mostly just experimenting with some SIMD ideas. It's not necessarily final. */
48804-
if (pGainer->config.channels == 2) {
48811+
if (channels == 2) {
4880548812
#if defined(MA_SUPPORT_SSE2)
4880648813
if (ma_has_sse2()) {
4880748814
ma_uint64 unrolledLoopCount = interpolatedFrameCount >> 1;
@@ -48849,6 +48856,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4884948856

4885048857
iFrame = unrolledLoopCount << 1;
4885148858
#else
48859+
#pragma clang loop vectorize(enable)
4885248860
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
4885348861
for (iChannel = 0; iChannel < 2; iChannel += 1) {
4885448862
pFramesOutF32[iFrame*2 + iChannel] = pFramesInF32[iFrame*2 + iChannel] * pRunningGain[iChannel];
@@ -48860,7 +48868,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4886048868
}
4886148869
#endif
4886248870
}
48863-
} else if (pGainer->config.channels == 6) {
48871+
} else if (channels == 6) {
4886448872
#if defined(MA_SUPPORT_SSE2)
4886548873
if (ma_has_sse2()) {
4886648874
/*
@@ -48904,7 +48912,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4890448912
}
4890548913
}
4890648914
}
48907-
} else if (pGainer->config.channels == 8) {
48915+
} else if (channels == 8) {
4890848916
/* For 8 channels we can just go over frame by frame and do all eight channels as 2 separate 4x SIMD operations. */
4890948917
#if defined(MA_SUPPORT_SSE2)
4891048918
if (ma_has_sse2()) {
@@ -48925,29 +48933,35 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4892548933
{
4892648934
/* This is crafted so that it auto-vectorizes when compiled with Clang. */
4892748935
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48936+
#pragma clang loop vectorize(enable)
4892848937
for (iChannel = 0; iChannel < 8; iChannel += 1) {
4892948938
pFramesOutF32[iFrame*8 + iChannel] = pFramesInF32[iFrame*8 + iChannel] * pRunningGain[iChannel];
4893048939
}
4893148940

4893248941
/* Move the running gain forward towards the new gain. */
48942+
#pragma clang loop vectorize(enable)
4893348943
for (iChannel = 0; iChannel < 8; iChannel += 1) {
4893448944
pRunningGain[iChannel] += pRunningGainDelta[iChannel];
4893548945
}
4893648946
}
4893748947
}
4893848948
}
4893948949

48950+
#pragma clang loop unroll(disable)
4894048951
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48941-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48942-
pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * pRunningGain[iChannel];
48952+
#pragma clang loop vectorize(enable)
48953+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
48954+
pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * pRunningGain[iChannel];
4894348955
pRunningGain[iChannel] += pRunningGainDelta[iChannel];
4894448956
}
4894548957
}
4894648958
} else {
4894748959
/* Slower path for extreme channel counts where we can't fit enough on the stack. We could also move this to the heap as part of the ma_gainer object which might even be better since it'll only be updated when the gains actually change. */
48960+
#pragma clang loop unroll(disable)
4894848961
for (iFrame = 0; iFrame < interpolatedFrameCount; iFrame += 1) {
48949-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48950-
pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
48962+
#pragma clang loop vectorize(enable)
48963+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
48964+
pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
4895148965
}
4895248966

4895348967
a += d;
@@ -48966,18 +48980,20 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4896648980

4896748981
/* All we need to do here is apply the new gains using an optimized path. */
4896848982
if (pFramesOut != NULL && pFramesIn != NULL) {
48969-
if (pGainer->config.channels <= 32) {
48983+
if (channels <= 32) {
4897048984
float gains[32];
48971-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48985+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
4897248986
gains[iChannel] = pGainer->pNewGains[iChannel] * pGainer->masterVolume;
4897348987
}
4897448988

48975-
ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, pGainer->config.channels, gains);
48989+
ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, channels, gains);
4897648990
} else {
4897748991
/* Slow path. Too many channels to fit on the stack. Need to apply a master volume as a separate path. */
48992+
#pragma clang loop unroll(disable)
4897848993
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
48979-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48980-
((float*)pFramesOut)[iFrame*pGainer->config.channels + iChannel] = ((const float*)pFramesIn)[iFrame*pGainer->config.channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
48994+
#pragma clang loop vectorize(enable)
48995+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
48996+
((float*)pFramesOut)[iFrame*channels + iChannel] = ((const float*)pFramesIn)[iFrame*channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
4898148997
}
4898248998
}
4898348999
}
@@ -51347,7 +51363,7 @@ static void ma_linear_resampler_interpolate_frame_s16(ma_linear_resampler* pResa
5134751363

5134851364
a = (pResampler->inTimeFrac << shift) / pResampler->config.sampleRateOut;
5134951365

51350-
MA_ASSUME(channels > 0);
51366+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
5135151367
for (c = 0; c < channels; c += 1) {
5135251368
ma_int16 s = ma_linear_resampler_mix_s16(pResampler->x0.s16[c], pResampler->x1.s16[c], a, shift);
5135351369
pFrameOut[c] = s;
@@ -51366,7 +51382,7 @@ static void ma_linear_resampler_interpolate_frame_f32(ma_linear_resampler* pResa
5136651382

5136751383
a = (float)pResampler->inTimeFrac / pResampler->config.sampleRateOut;
5136851384

51369-
MA_ASSUME(channels > 0);
51385+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
5137051386
for (c = 0; c < channels; c += 1) {
5137151387
float s = ma_mix_f32_fast(pResampler->x0.f32[c], pResampler->x1.f32[c], a);
5137251388
pFrameOut[c] = s;
@@ -51533,7 +51549,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16(ma_linear_resampler*
5153351549
}
5153451550

5153551551

51536-
static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear_resampler* pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
51552+
static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear_resampler* MA_RESTRICT pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
5153751553
{
5153851554
const float* pFramesInF32;
5153951555
/* */ float* pFramesOutF32;
@@ -51559,12 +51575,14 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear
5155951575
ma_uint32 iChannel;
5156051576

5156151577
if (pFramesInF32 != NULL) {
51578+
#pragma clang loop vectorize(assume_safety)
5156251579
for (iChannel = 0; iChannel < pResampler->config.channels; iChannel += 1) {
5156351580
pResampler->x0.f32[iChannel] = pResampler->x1.f32[iChannel];
5156451581
pResampler->x1.f32[iChannel] = pFramesInF32[iChannel];
5156551582
}
5156651583
pFramesInF32 += pResampler->config.channels;
5156751584
} else {
51585+
#pragma clang loop vectorize(assume_safety)
5156851586
for (iChannel = 0; iChannel < pResampler->config.channels; iChannel += 1) {
5156951587
pResampler->x0.f32[iChannel] = pResampler->x1.f32[iChannel];
5157051588
pResampler->x1.f32[iChannel] = 0;
@@ -51607,7 +51625,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear
5160751625
return MA_SUCCESS;
5160851626
}
5160951627

51610-
static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_resampler* pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
51628+
static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_resampler* MA_RESTRICT pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
5161151629
{
5161251630
const float* pFramesInF32;
5161351631
/* */ float* pFramesOutF32;
@@ -51633,12 +51651,14 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_r
5163351651
ma_uint32 iChannel;
5163451652

5163551653
if (pFramesInF32 != NULL) {
51654+
#pragma clang loop vectorize(assume_safety)
5163651655
for (iChannel = 0; iChannel < pResampler->config.channels; iChannel += 1) {
5163751656
pResampler->x0.f32[iChannel] = pResampler->x1.f32[iChannel];
5163851657
pResampler->x1.f32[iChannel] = pFramesInF32[iChannel];
5163951658
}
5164051659
pFramesInF32 += pResampler->config.channels;
5164151660
} else {
51661+
#pragma clang loop vectorize(assume_safety)
5164251662
for (iChannel = 0; iChannel < pResampler->config.channels; iChannel += 1) {
5164351663
pResampler->x0.f32[iChannel] = pResampler->x1.f32[iChannel];
5164451664
pResampler->x1.f32[iChannel] = 0;
@@ -52918,6 +52938,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
5291852938
#endif
5291952939
{
5292052940
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52941+
#pragma clang loop vectorize(enable)
5292152942
for (iChannelOut = 0; iChannelOut < 2; iChannelOut += 1) {
5292252943
pFramesOut[iFrame*2 + iChannelOut] = pFramesIn[iFrame];
5292352944
}
@@ -52945,6 +52966,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
5294552966
#endif
5294652967
{
5294752968
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52969+
#pragma clang loop vectorize(enable)
5294852970
for (iChannelOut = 0; iChannelOut < 6; iChannelOut += 1) {
5294952971
pFramesOut[iFrame*6 + iChannelOut] = pFramesIn[iFrame];
5295052972
}
@@ -52962,6 +52984,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
5296252984
#endif
5296352985
{
5296452986
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52987+
#pragma clang loop vectorize(enable)
5296552988
for (iChannelOut = 0; iChannelOut < 8; iChannelOut += 1) {
5296652989
pFramesOut[iFrame*8 + iChannelOut] = pFramesIn[iFrame];
5296752990
}
@@ -66051,7 +66074,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__white(ma_noise* pNoise, voi
6605166074
ma_uint64 iFrame;
6605266075
ma_uint32 iChannel;
6605366076
const ma_uint32 channels = pNoise->config.channels;
66054-
MA_ASSUME(channels > 0);
66077+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
6605566078

6605666079
if (pNoise->config.format == ma_format_f32) {
6605766080
float* pFramesOutF32 = (float*)pFramesOut;
@@ -66170,7 +66193,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__pink(ma_noise* pNoise, void
6617066193
ma_uint64 iFrame;
6617166194
ma_uint32 iChannel;
6617266195
const ma_uint32 channels = pNoise->config.channels;
66173-
MA_ASSUME(channels > 0);
66196+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
6617466197

6617566198
if (pNoise->config.format == ma_format_f32) {
6617666199
float* pFramesOutF32 = (float*)pFramesOut;
@@ -66252,7 +66275,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__brownian(ma_noise* pNoise,
6625266275
ma_uint64 iFrame;
6625366276
ma_uint32 iChannel;
6625466277
const ma_uint32 channels = pNoise->config.channels;
66255-
MA_ASSUME(channels > 0);
66278+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
6625666279

6625766280
if (pNoise->config.format == ma_format_f32) {
6625866281
float* pFramesOutF32 = (float*)pFramesOut;

0 commit comments

Comments
 (0)