Skip to content

Commit 8b7b5d2

Browse files
committed
clang: improve general performance with vectorization/unrolling
Clang has a tendency to *heavily* unroll loops all over the place (see llvm/llvm-project#42332). Disable loop unrolling wherever it becomes excessive, enable vectorization where Clang doesn't apply it automatically, etc. Signed-off-by: Steven Noonan <steven@uplinklabs.net>
1 parent 46d5056 commit 8b7b5d2

File tree

1 file changed

+55
-31
lines changed

1 file changed

+55
-31
lines changed

miniaudio.h

Lines changed: 55 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -42024,7 +42024,7 @@ MA_API void ma_copy_and_apply_volume_factor_s32(ma_int32* pSamplesOut, const ma_
4202442024
}
4202542025
}
4202642026

42027-
MA_API void ma_copy_and_apply_volume_factor_f32(float* pSamplesOut, const float* pSamplesIn, ma_uint64 sampleCount, float factor)
42027+
MA_API void ma_copy_and_apply_volume_factor_f32(float* MA_RESTRICT pSamplesOut, const float* MA_RESTRICT pSamplesIn, ma_uint64 sampleCount, float factor)
4202842028
{
4202942029
ma_uint64 iSample;
4203042030

@@ -44594,7 +44594,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_f32__direct_form_2_transposed(
4459444594
const float a1 = pBQ->a1.f32;
4459544595
const float a2 = pBQ->a2.f32;
4459644596

44597-
MA_ASSUME(channels > 0);
44597+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
44598+
#pragma clang loop unroll(disable)
4459844599
for (c = 0; c < channels; c += 1) {
4459944600
float r1 = pBQ->pR1[c].f32;
4460044601
float r2 = pBQ->pR2[c].f32;
@@ -44626,7 +44627,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_s16__direct_form_2_transposed(
4462644627
const ma_int32 a1 = pBQ->a1.s32;
4462744628
const ma_int32 a2 = pBQ->a2.s32;
4462844629

44629-
MA_ASSUME(channels > 0);
44630+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
44631+
#pragma clang loop unroll(disable)
4463044632
for (c = 0; c < channels; c += 1) {
4463144633
ma_int32 r1 = pBQ->pR1[c].s32;
4463244634
ma_int32 r2 = pBQ->pR2[c].s32;
@@ -44900,22 +44902,23 @@ MA_API ma_result ma_lpf1_clear_cache(ma_lpf1* pLPF)
4490044902
return MA_SUCCESS;
4490144903
}
4490244904

44903-
static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1* pLPF, float* pY, const float* pX)
44905+
static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1 *pLPF, float *pY, const float *pX)
4490444906
{
4490544907
ma_uint32 c;
4490644908
const ma_uint32 channels = pLPF->channels;
4490744909
const float a = pLPF->a.f32;
4490844910
const float b = 1 - a;
4490944911

44910-
MA_ASSUME(channels > 0);
44912+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
44913+
#pragma clang loop unroll(disable)
4491144914
for (c = 0; c < channels; c += 1) {
4491244915
float r1 = pLPF->pR1[c].f32;
44913-
float x = pX[c];
44916+
float x = pX[c];
4491444917
float y;
4491544918

44916-
y = b*x + a*r1;
44919+
y = b * x + a * r1;
4491744920

44918-
pY[c] = y;
44921+
pY[c] = y;
4491944922
pLPF->pR1[c].f32 = y;
4492044923
}
4492144924
}
@@ -44927,7 +44930,8 @@ static MA_INLINE void ma_lpf1_process_pcm_frame_s16(ma_lpf1* pLPF, ma_int16* pY,
4492744930
const ma_int32 a = pLPF->a.s32;
4492844931
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
4492944932

44930-
MA_ASSUME(channels > 0);
44933+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
44934+
#pragma clang loop unroll(disable)
4493144935
for (c = 0; c < channels; c += 1) {
4493244936
ma_int32 r1 = pLPF->pR1[c].s32;
4493344937
ma_int32 x = pX[c];
@@ -45780,7 +45784,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_f32(ma_hpf1* pHPF, float* pY, co
4578045784
const float a = 1 - pHPF->a.f32;
4578145785
const float b = 1 - a;
4578245786

45783-
MA_ASSUME(channels > 0);
45787+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
4578445788
for (c = 0; c < channels; c += 1) {
4578545789
float r1 = pHPF->pR1[c].f32;
4578645790
float x = pX[c];
@@ -45800,7 +45804,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_s16(ma_hpf1* pHPF, ma_int16* pY,
4580045804
const ma_int32 a = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - pHPF->a.s32);
4580145805
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
4580245806

45803-
MA_ASSUME(channels > 0);
45807+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
4580445808
for (c = 0; c < channels; c += 1) {
4580545809
ma_int32 r1 = pHPF->pR1[c].s32;
4580645810
ma_int32 x = pX[c];
@@ -47908,6 +47912,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4790847912
ma_uint64 iFrame;
4790947913
ma_uint32 iChannel;
4791047914
ma_uint64 interpolatedFrameCount;
47915+
const ma_uint32 channels = pGainer->config.channels;
4791147916

4791247917
MA_ASSERT(pGainer != NULL);
4791347918

@@ -47947,12 +47952,12 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4794747952
float a = (float)pGainer->t / pGainer->config.smoothTimeInFrames;
4794847953
float d = 1.0f / pGainer->config.smoothTimeInFrames;
4794947954

47950-
if (pGainer->config.channels <= 32) {
47955+
if (channels <= 32) {
4795147956
float pRunningGain[32];
4795247957
float pRunningGainDelta[32]; /* Could this be heap-allocated as part of the ma_gainer object? */
4795347958

4795447959
/* Initialize the running gain. */
47955-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
47960+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
4795647961
float t = (pGainer->pOldGains[iChannel] - pGainer->pNewGains[iChannel]) * pGainer->masterVolume;
4795747962
pRunningGainDelta[iChannel] = t * d;
4795847963
pRunningGain[iChannel] = (pGainer->pOldGains[iChannel] * pGainer->masterVolume) + (t * a);
@@ -47961,7 +47966,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4796147966
iFrame = 0;
4796247967

4796347968
/* Optimized paths for common channel counts. This is mostly just experimenting with some SIMD ideas. It's not necessarily final. */
47964-
if (pGainer->config.channels == 2) {
47969+
if (channels == 2) {
4796547970
#if defined(MA_SUPPORT_SSE2)
4796647971
if (ma_has_sse2()) {
4796747972
ma_uint64 unrolledLoopCount = interpolatedFrameCount >> 1;
@@ -48009,6 +48014,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4800948014

4801048015
iFrame = unrolledLoopCount << 1;
4801148016
#else
48017+
#pragma clang loop vectorize(enable)
4801248018
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
4801348019
for (iChannel = 0; iChannel < 2; iChannel += 1) {
4801448020
pFramesOutF32[iFrame*2 + iChannel] = pFramesInF32[iFrame*2 + iChannel] * pRunningGain[iChannel];
@@ -48020,7 +48026,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4802048026
}
4802148027
#endif
4802248028
}
48023-
} else if (pGainer->config.channels == 6) {
48029+
} else if (channels == 6) {
4802448030
#if defined(MA_SUPPORT_SSE2)
4802548031
if (ma_has_sse2()) {
4802648032
/*
@@ -48053,6 +48059,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4805348059
} else
4805448060
#endif
4805548061
{
48062+
#pragma clang loop vectorize(enable)
4805648063
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
4805748064
for (iChannel = 0; iChannel < 6; iChannel += 1) {
4805848065
pFramesOutF32[iFrame*6 + iChannel] = pFramesInF32[iFrame*6 + iChannel] * pRunningGain[iChannel];
@@ -48064,7 +48071,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4806448071
}
4806548072
}
4806648073
}
48067-
} else if (pGainer->config.channels == 8) {
48074+
} else if (channels == 8) {
4806848075
/* For 8 channels we can just go over frame by frame and do all eight channels as 2 separate 4x SIMD operations. */
4806948076
#if defined(MA_SUPPORT_SSE2)
4807048077
if (ma_has_sse2()) {
@@ -48084,6 +48091,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4808448091
#endif
4808548092
{
4808648093
/* This is crafted so that it auto-vectorizes when compiled with Clang. */
48094+
#pragma clang loop vectorize(enable)
4808748095
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
4808848096
for (iChannel = 0; iChannel < 8; iChannel += 1) {
4808948097
pFramesOutF32[iFrame*8 + iChannel] = pFramesInF32[iFrame*8 + iChannel] * pRunningGain[iChannel];
@@ -48097,17 +48105,21 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4809748105
}
4809848106
}
4809948107

48108+
#pragma clang loop unroll(disable)
4810048109
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48101-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48102-
pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * pRunningGain[iChannel];
48110+
#pragma clang loop vectorize(enable)
48111+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
48112+
pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * pRunningGain[iChannel];
4810348113
pRunningGain[iChannel] += pRunningGainDelta[iChannel];
4810448114
}
4810548115
}
4810648116
} else {
4810748117
/* Slower path for extreme channel counts where we can't fit enough on the stack. We could also move this to the heap as part of the ma_gainer object which might even be better since it'll only be updated when the gains actually change. */
48118+
#pragma clang loop unroll(disable)
4810848119
for (iFrame = 0; iFrame < interpolatedFrameCount; iFrame += 1) {
48109-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48110-
pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
48120+
#pragma clang loop vectorize(enable)
48121+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
48122+
pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
4811148123
}
4811248124

4811348125
a += d;
@@ -48126,18 +48138,21 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
4812648138

4812748139
/* All we need to do here is apply the new gains using an optimized path. */
4812848140
if (pFramesOut != NULL && pFramesIn != NULL) {
48129-
if (pGainer->config.channels <= 32) {
48141+
if (channels <= 32) {
4813048142
float gains[32];
48131-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48143+
#pragma clang loop unroll(disable)
48144+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
4813248145
gains[iChannel] = pGainer->pNewGains[iChannel] * pGainer->masterVolume;
4813348146
}
4813448147

48135-
ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, pGainer->config.channels, gains);
48148+
ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, channels, gains);
4813648149
} else {
4813748150
/* Slow path. Too many channels to fit on the stack. Need to apply a master volume as a separate path. */
48151+
#pragma clang loop unroll(disable)
4813848152
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
48139-
for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48140-
((float*)pFramesOut)[iFrame*pGainer->config.channels + iChannel] = ((const float*)pFramesIn)[iFrame*pGainer->config.channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
48153+
#pragma clang loop vectorize(enable)
48154+
for (iChannel = 0; iChannel < channels; iChannel += 1) {
48155+
((float*)pFramesOut)[iFrame*channels + iChannel] = ((const float*)pFramesIn)[iFrame*channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
4814148156
}
4814248157
}
4814348158
}
@@ -50498,7 +50513,7 @@ static void ma_linear_resampler_interpolate_frame_s16(ma_linear_resampler* pResa
5049850513

5049950514
a = (pResampler->inTimeFrac << shift) / pResampler->config.sampleRateOut;
5050050515

50501-
MA_ASSUME(channels > 0);
50516+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
5050250517
for (c = 0; c < channels; c += 1) {
5050350518
ma_int16 s = ma_linear_resampler_mix_s16(pResampler->x0.s16[c], pResampler->x1.s16[c], a, shift);
5050450519
pFrameOut[c] = s;
@@ -50517,7 +50532,7 @@ static void ma_linear_resampler_interpolate_frame_f32(ma_linear_resampler* pResa
5051750532

5051850533
a = (float)pResampler->inTimeFrac / pResampler->config.sampleRateOut;
5051950534

50520-
MA_ASSUME(channels > 0);
50535+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
5052150536
for (c = 0; c < channels; c += 1) {
5052250537
float s = ma_mix_f32_fast(pResampler->x0.f32[c], pResampler->x1.f32[c], a);
5052350538
pFrameOut[c] = s;
@@ -51752,6 +51767,7 @@ static void ma_channel_map_apply_shuffle_table_u8(ma_uint8* pFramesOut, ma_uint3
5175251767
ma_uint64 iFrame;
5175351768
ma_uint32 iChannelOut;
5175451769

51770+
#pragma clang loop unroll(disable)
5175551771
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
5175651772
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
5175751773
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -51772,6 +51788,7 @@ static void ma_channel_map_apply_shuffle_table_s16(ma_int16* pFramesOut, ma_uint
5177251788
ma_uint64 iFrame;
5177351789
ma_uint32 iChannelOut;
5177451790

51791+
#pragma clang loop unroll(disable)
5177551792
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
5177651793
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
5177751794
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -51814,6 +51831,7 @@ static void ma_channel_map_apply_shuffle_table_s32(ma_int32* pFramesOut, ma_uint
5181451831
ma_uint64 iFrame;
5181551832
ma_uint32 iChannelOut;
5181651833

51834+
#pragma clang loop unroll(disable)
5181751835
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
5181851836
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
5181951837
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -51834,6 +51852,7 @@ static void ma_channel_map_apply_shuffle_table_f32(float* pFramesOut, ma_uint32
5183451852
ma_uint64 iFrame;
5183551853
ma_uint32 iChannelOut;
5183651854

51855+
#pragma clang loop unroll(disable)
5183751856
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
5183851857
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
5183951858
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -52068,6 +52087,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
5206852087
} else
5206952088
#endif
5207052089
{
52090+
#pragma clang loop vectorize(enable)
5207152091
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
5207252092
for (iChannelOut = 0; iChannelOut < 2; iChannelOut += 1) {
5207352093
pFramesOut[iFrame*2 + iChannelOut] = pFramesIn[iFrame];
@@ -52095,6 +52115,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
5209552115
} else
5209652116
#endif
5209752117
{
52118+
#pragma clang loop vectorize(enable)
5209852119
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
5209952120
for (iChannelOut = 0; iChannelOut < 6; iChannelOut += 1) {
5210052121
pFramesOut[iFrame*6 + iChannelOut] = pFramesIn[iFrame];
@@ -52112,6 +52133,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
5211252133
} else
5211352134
#endif
5211452135
{
52136+
#pragma clang loop vectorize(enable)
5211552137
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
5211652138
for (iChannelOut = 0; iChannelOut < 8; iChannelOut += 1) {
5211752139
pFramesOut[iFrame*8 + iChannelOut] = pFramesIn[iFrame];
@@ -65280,7 +65302,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__white(ma_noise* pNoise, voi
6528065302
ma_uint64 iFrame;
6528165303
ma_uint32 iChannel;
6528265304
const ma_uint32 channels = pNoise->config.channels;
65283-
MA_ASSUME(channels > 0);
65305+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
6528465306

6528565307
if (pNoise->config.format == ma_format_f32) {
6528665308
float* pFramesOutF32 = (float*)pFramesOut;
@@ -65399,7 +65421,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__pink(ma_noise* pNoise, void
6539965421
ma_uint64 iFrame;
6540065422
ma_uint32 iChannel;
6540165423
const ma_uint32 channels = pNoise->config.channels;
65402-
MA_ASSUME(channels > 0);
65424+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
6540365425

6540465426
if (pNoise->config.format == ma_format_f32) {
6540565427
float* pFramesOutF32 = (float*)pFramesOut;
@@ -65481,7 +65503,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__brownian(ma_noise* pNoise,
6548165503
ma_uint64 iFrame;
6548265504
ma_uint32 iChannel;
6548365505
const ma_uint32 channels = pNoise->config.channels;
65484-
MA_ASSUME(channels > 0);
65506+
MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
6548565507

6548665508
if (pNoise->config.format == ma_format_f32) {
6548765509
float* pFramesOutF32 = (float*)pFramesOut;
@@ -69687,7 +69709,7 @@ MA_API void ma_debug_fill_pcm_frames_with_sine_wave(float* pFramesOut, ma_uint32
6968769709

6968869710

6968969711

69690-
static ma_result ma_mix_pcm_frames_f32(float* pDst, const float* pSrc, ma_uint64 frameCount, ma_uint32 channels, float volume)
69712+
static ma_result ma_mix_pcm_frames_f32(float* MA_RESTRICT pDst, const float* MA_RESTRICT pSrc, ma_uint64 frameCount, ma_uint32 channels, float volume)
6969169713
{
6969269714
ma_uint64 iSample;
6969369715
ma_uint64 sampleCount;
@@ -69703,10 +69725,12 @@ static ma_result ma_mix_pcm_frames_f32(float* pDst, const float* pSrc, ma_uint64
6970369725
sampleCount = frameCount * channels;
6970469726

6970569727
if (volume == 1) {
69728+
#pragma clang loop vectorize(enable)
6970669729
for (iSample = 0; iSample < sampleCount; iSample += 1) {
6970769730
pDst[iSample] += pSrc[iSample];
6970869731
}
6970969732
} else {
69733+
#pragma clang loop vectorize(enable)
6971069734
for (iSample = 0; iSample < sampleCount; iSample += 1) {
6971169735
pDst[iSample] += ma_apply_volume_unclipped_f32(pSrc[iSample], volume);
6971269736
}

0 commit comments

Comments
 (0)