@@ -42864,7 +42864,7 @@ MA_API void ma_copy_and_apply_volume_factor_s32(ma_int32* pSamplesOut, const ma_
42864
42864
}
42865
42865
}
42866
42866
42867
- MA_API void ma_copy_and_apply_volume_factor_f32(float* pSamplesOut, const float* pSamplesIn, ma_uint64 sampleCount, float factor)
42867
+ MA_API void ma_copy_and_apply_volume_factor_f32(float* MA_RESTRICT pSamplesOut, const float* MA_RESTRICT pSamplesIn, ma_uint64 sampleCount, float factor)
42868
42868
{
42869
42869
ma_uint64 iSample;
42870
42870
@@ -43159,10 +43159,12 @@ MA_API ma_result ma_mix_pcm_frames_f32(float* pDst, const float* pSrc, ma_uint64
43159
43159
sampleCount = frameCount * channels;
43160
43160
43161
43161
if (volume == 1) {
43162
+ #pragma clang loop vectorize(enable)
43162
43163
for (iSample = 0; iSample < sampleCount; iSample += 1) {
43163
43164
pDst[iSample] += pSrc[iSample];
43164
43165
}
43165
43166
} else {
43167
+ #pragma clang loop vectorize(enable)
43166
43168
for (iSample = 0; iSample < sampleCount; iSample += 1) {
43167
43169
pDst[iSample] += ma_apply_volume_unclipped_f32(pSrc[iSample], volume);
43168
43170
}
@@ -45463,7 +45465,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_f32__direct_form_2_transposed(
45463
45465
const float a1 = pBQ->a1.f32;
45464
45466
const float a2 = pBQ->a2.f32;
45465
45467
45466
- MA_ASSUME(channels > 0);
45468
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45469
+ #pragma clang loop unroll(disable)
45467
45470
for (c = 0; c < channels; c += 1) {
45468
45471
float r1 = pBQ->pR1[c].f32;
45469
45472
float r2 = pBQ->pR2[c].f32;
@@ -45495,7 +45498,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_s16__direct_form_2_transposed(
45495
45498
const ma_int32 a1 = pBQ->a1.s32;
45496
45499
const ma_int32 a2 = pBQ->a2.s32;
45497
45500
45498
- MA_ASSUME(channels > 0);
45501
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45502
+ #pragma clang loop unroll(disable)
45499
45503
for (c = 0; c < channels; c += 1) {
45500
45504
ma_int32 r1 = pBQ->pR1[c].s32;
45501
45505
ma_int32 r2 = pBQ->pR2[c].s32;
@@ -45769,22 +45773,23 @@ MA_API ma_result ma_lpf1_clear_cache(ma_lpf1* pLPF)
45769
45773
return MA_SUCCESS;
45770
45774
}
45771
45775
45772
- static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1* pLPF, float* pY, const float* pX)
45776
+ static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1 * pLPF, float * pY, const float * pX)
45773
45777
{
45774
45778
ma_uint32 c;
45775
45779
const ma_uint32 channels = pLPF->channels;
45776
45780
const float a = pLPF->a.f32;
45777
45781
const float b = 1 - a;
45778
45782
45779
- MA_ASSUME(channels > 0);
45783
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45784
+ #pragma clang loop unroll(disable)
45780
45785
for (c = 0; c < channels; c += 1) {
45781
45786
float r1 = pLPF->pR1[c].f32;
45782
- float x = pX[c];
45787
+ float x = pX[c];
45783
45788
float y;
45784
45789
45785
- y = b* x + a* r1;
45790
+ y = b * x + a * r1;
45786
45791
45787
- pY[c] = y;
45792
+ pY[c] = y;
45788
45793
pLPF->pR1[c].f32 = y;
45789
45794
}
45790
45795
}
@@ -45796,7 +45801,8 @@ static MA_INLINE void ma_lpf1_process_pcm_frame_s16(ma_lpf1* pLPF, ma_int16* pY,
45796
45801
const ma_int32 a = pLPF->a.s32;
45797
45802
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
45798
45803
45799
- MA_ASSUME(channels > 0);
45804
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45805
+ #pragma clang loop unroll(disable)
45800
45806
for (c = 0; c < channels; c += 1) {
45801
45807
ma_int32 r1 = pLPF->pR1[c].s32;
45802
45808
ma_int32 x = pX[c];
@@ -46649,7 +46655,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_f32(ma_hpf1* pHPF, float* pY, co
46649
46655
const float a = 1 - pHPF->a.f32;
46650
46656
const float b = 1 - a;
46651
46657
46652
- MA_ASSUME(channels > 0 );
46658
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
46653
46659
for (c = 0; c < channels; c += 1) {
46654
46660
float r1 = pHPF->pR1[c].f32;
46655
46661
float x = pX[c];
@@ -46669,7 +46675,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_s16(ma_hpf1* pHPF, ma_int16* pY,
46669
46675
const ma_int32 a = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - pHPF->a.s32);
46670
46676
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
46671
46677
46672
- MA_ASSUME(channels > 0 );
46678
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
46673
46679
for (c = 0; c < channels; c += 1) {
46674
46680
ma_int32 r1 = pHPF->pR1[c].s32;
46675
46681
ma_int32 x = pX[c];
@@ -48777,6 +48783,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48777
48783
ma_uint64 iFrame;
48778
48784
ma_uint32 iChannel;
48779
48785
ma_uint64 interpolatedFrameCount;
48786
+ const ma_uint32 channels = pGainer->config.channels;
48780
48787
48781
48788
MA_ASSERT(pGainer != NULL);
48782
48789
@@ -48816,12 +48823,12 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48816
48823
float a = (float)pGainer->t / pGainer->config.smoothTimeInFrames;
48817
48824
float d = 1.0f / pGainer->config.smoothTimeInFrames;
48818
48825
48819
- if (pGainer->config. channels <= 32) {
48826
+ if (channels <= 32) {
48820
48827
float pRunningGain[32];
48821
48828
float pRunningGainDelta[32]; /* Could this be heap-allocated as part of the ma_gainer object? */
48822
48829
48823
48830
/* Initialize the running gain. */
48824
- for (iChannel = 0; iChannel < pGainer->config. channels; iChannel += 1) {
48831
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
48825
48832
float t = (pGainer->pNewGains[iChannel] - pGainer->pOldGains[iChannel]) * pGainer->masterVolume;
48826
48833
pRunningGainDelta[iChannel] = t * d;
48827
48834
pRunningGain[iChannel] = (pGainer->pOldGains[iChannel] * pGainer->masterVolume) + (t * a);
@@ -48830,7 +48837,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48830
48837
iFrame = 0;
48831
48838
48832
48839
/* Optimized paths for common channel counts. This is mostly just experimenting with some SIMD ideas. It's not necessarily final. */
48833
- if (pGainer->config. channels == 2) {
48840
+ if (channels == 2) {
48834
48841
#if defined(MA_SUPPORT_SSE2)
48835
48842
if (ma_has_sse2()) {
48836
48843
ma_uint64 unrolledLoopCount = interpolatedFrameCount >> 1;
@@ -48878,6 +48885,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48878
48885
48879
48886
iFrame = unrolledLoopCount << 1;
48880
48887
#else
48888
+ #pragma clang loop vectorize(enable)
48881
48889
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48882
48890
for (iChannel = 0; iChannel < 2; iChannel += 1) {
48883
48891
pFramesOutF32[iFrame*2 + iChannel] = pFramesInF32[iFrame*2 + iChannel] * pRunningGain[iChannel];
@@ -48889,7 +48897,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48889
48897
}
48890
48898
#endif
48891
48899
}
48892
- } else if (pGainer->config. channels == 6) {
48900
+ } else if (channels == 6) {
48893
48901
#if defined(MA_SUPPORT_SSE2)
48894
48902
if (ma_has_sse2()) {
48895
48903
/*
@@ -48922,6 +48930,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48922
48930
} else
48923
48931
#endif
48924
48932
{
48933
+ #pragma clang loop vectorize(enable)
48925
48934
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48926
48935
for (iChannel = 0; iChannel < 6; iChannel += 1) {
48927
48936
pFramesOutF32[iFrame*6 + iChannel] = pFramesInF32[iFrame*6 + iChannel] * pRunningGain[iChannel];
@@ -48933,7 +48942,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48933
48942
}
48934
48943
}
48935
48944
}
48936
- } else if (pGainer->config. channels == 8) {
48945
+ } else if (channels == 8) {
48937
48946
/* For 8 channels we can just go over frame by frame and do all eight channels as 2 separate 4x SIMD operations. */
48938
48947
#if defined(MA_SUPPORT_SSE2)
48939
48948
if (ma_has_sse2()) {
@@ -48953,6 +48962,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48953
48962
#endif
48954
48963
{
48955
48964
/* This is crafted so that it auto-vectorizes when compiled with Clang. */
48965
+ #pragma clang loop vectorize(enable)
48956
48966
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48957
48967
for (iChannel = 0; iChannel < 8; iChannel += 1) {
48958
48968
pFramesOutF32[iFrame*8 + iChannel] = pFramesInF32[iFrame*8 + iChannel] * pRunningGain[iChannel];
@@ -48966,17 +48976,21 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48966
48976
}
48967
48977
}
48968
48978
48979
+ #pragma clang loop unroll(disable)
48969
48980
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48970
- for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48971
- pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * pRunningGain[iChannel];
48981
+ #pragma clang loop vectorize(enable)
48982
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
48983
+ pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * pRunningGain[iChannel];
48972
48984
pRunningGain[iChannel] += pRunningGainDelta[iChannel];
48973
48985
}
48974
48986
}
48975
48987
} else {
48976
48988
/* Slower path for extreme channel counts where we can't fit enough on the stack. We could also move this to the heap as part of the ma_gainer object which might even be better since it'll only be updated when the gains actually change. */
48989
+ #pragma clang loop unroll(disable)
48977
48990
for (iFrame = 0; iFrame < interpolatedFrameCount; iFrame += 1) {
48978
- for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48979
- pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
48991
+ #pragma clang loop vectorize(enable)
48992
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
48993
+ pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
48980
48994
}
48981
48995
48982
48996
a += d;
@@ -48995,18 +49009,21 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48995
49009
48996
49010
/* All we need to do here is apply the new gains using an optimized path. */
48997
49011
if (pFramesOut != NULL && pFramesIn != NULL) {
48998
- if (pGainer->config. channels <= 32) {
49012
+ if (channels <= 32) {
48999
49013
float gains[32];
49000
- for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
49014
+ #pragma clang loop unroll(disable)
49015
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
49001
49016
gains[iChannel] = pGainer->pNewGains[iChannel] * pGainer->masterVolume;
49002
49017
}
49003
49018
49004
- ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, pGainer->config. channels, gains);
49019
+ ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, channels, gains);
49005
49020
} else {
49006
49021
/* Slow path. Too many channels to fit on the stack. Need to apply a master volume as a separate path. */
49022
+ #pragma clang loop unroll(disable)
49007
49023
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
49008
- for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
49009
- ((float*)pFramesOut)[iFrame*pGainer->config.channels + iChannel] = ((const float*)pFramesIn)[iFrame*pGainer->config.channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
49024
+ #pragma clang loop vectorize(enable)
49025
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
49026
+ ((float*)pFramesOut)[iFrame*channels + iChannel] = ((const float*)pFramesIn)[iFrame*channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
49010
49027
}
49011
49028
}
49012
49029
}
@@ -51376,7 +51393,7 @@ static void ma_linear_resampler_interpolate_frame_s16(ma_linear_resampler* pResa
51376
51393
51377
51394
a = (pResampler->inTimeFrac << shift) / pResampler->config.sampleRateOut;
51378
51395
51379
- MA_ASSUME(channels > 0 );
51396
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
51380
51397
for (c = 0; c < channels; c += 1) {
51381
51398
ma_int16 s = ma_linear_resampler_mix_s16(pResampler->x0.s16[c], pResampler->x1.s16[c], a, shift);
51382
51399
pFrameOut[c] = s;
@@ -51395,7 +51412,7 @@ static void ma_linear_resampler_interpolate_frame_f32(ma_linear_resampler* pResa
51395
51412
51396
51413
a = (float)pResampler->inTimeFrac / pResampler->config.sampleRateOut;
51397
51414
51398
- MA_ASSUME(channels > 0 );
51415
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
51399
51416
for (c = 0; c < channels; c += 1) {
51400
51417
float s = ma_mix_f32_fast(pResampler->x0.f32[c], pResampler->x1.f32[c], a);
51401
51418
pFrameOut[c] = s;
@@ -52630,6 +52647,7 @@ static void ma_channel_map_apply_shuffle_table_u8(ma_uint8* pFramesOut, ma_uint3
52630
52647
ma_uint64 iFrame;
52631
52648
ma_uint32 iChannelOut;
52632
52649
52650
+ #pragma clang loop unroll(disable)
52633
52651
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52634
52652
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
52635
52653
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -52650,6 +52668,7 @@ static void ma_channel_map_apply_shuffle_table_s16(ma_int16* pFramesOut, ma_uint
52650
52668
ma_uint64 iFrame;
52651
52669
ma_uint32 iChannelOut;
52652
52670
52671
+ #pragma clang loop unroll(disable)
52653
52672
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52654
52673
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
52655
52674
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -52692,6 +52711,7 @@ static void ma_channel_map_apply_shuffle_table_s32(ma_int32* pFramesOut, ma_uint
52692
52711
ma_uint64 iFrame;
52693
52712
ma_uint32 iChannelOut;
52694
52713
52714
+ #pragma clang loop unroll(disable)
52695
52715
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52696
52716
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
52697
52717
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -52712,6 +52732,7 @@ static void ma_channel_map_apply_shuffle_table_f32(float* pFramesOut, ma_uint32
52712
52732
ma_uint64 iFrame;
52713
52733
ma_uint32 iChannelOut;
52714
52734
52735
+ #pragma clang loop unroll(disable)
52715
52736
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52716
52737
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
52717
52738
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -52946,6 +52967,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
52946
52967
} else
52947
52968
#endif
52948
52969
{
52970
+ #pragma clang loop vectorize(enable)
52949
52971
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52950
52972
for (iChannelOut = 0; iChannelOut < 2; iChannelOut += 1) {
52951
52973
pFramesOut[iFrame*2 + iChannelOut] = pFramesIn[iFrame];
@@ -52973,6 +52995,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
52973
52995
} else
52974
52996
#endif
52975
52997
{
52998
+ #pragma clang loop vectorize(enable)
52976
52999
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52977
53000
for (iChannelOut = 0; iChannelOut < 6; iChannelOut += 1) {
52978
53001
pFramesOut[iFrame*6 + iChannelOut] = pFramesIn[iFrame];
@@ -52990,6 +53013,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
52990
53013
} else
52991
53014
#endif
52992
53015
{
53016
+ #pragma clang loop vectorize(enable)
52993
53017
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52994
53018
for (iChannelOut = 0; iChannelOut < 8; iChannelOut += 1) {
52995
53019
pFramesOut[iFrame*8 + iChannelOut] = pFramesIn[iFrame];
@@ -66313,7 +66337,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__white(ma_noise* pNoise, voi
66313
66337
ma_uint64 iFrame;
66314
66338
ma_uint32 iChannel;
66315
66339
const ma_uint32 channels = pNoise->config.channels;
66316
- MA_ASSUME(channels > 0 );
66340
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
66317
66341
66318
66342
if (pNoise->config.format == ma_format_f32) {
66319
66343
float* pFramesOutF32 = (float*)pFramesOut;
@@ -66432,7 +66456,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__pink(ma_noise* pNoise, void
66432
66456
ma_uint64 iFrame;
66433
66457
ma_uint32 iChannel;
66434
66458
const ma_uint32 channels = pNoise->config.channels;
66435
- MA_ASSUME(channels > 0 );
66459
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
66436
66460
66437
66461
if (pNoise->config.format == ma_format_f32) {
66438
66462
float* pFramesOutF32 = (float*)pFramesOut;
@@ -66514,7 +66538,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__brownian(ma_noise* pNoise,
66514
66538
ma_uint64 iFrame;
66515
66539
ma_uint32 iChannel;
66516
66540
const ma_uint32 channels = pNoise->config.channels;
66517
- MA_ASSUME(channels > 0 );
66541
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
66518
66542
66519
66543
if (pNoise->config.format == ma_format_f32) {
66520
66544
float* pFramesOutF32 = (float*)pFramesOut;
0 commit comments