@@ -42024,7 +42024,7 @@ MA_API void ma_copy_and_apply_volume_factor_s32(ma_int32* pSamplesOut, const ma_
42024
42024
}
42025
42025
}
42026
42026
42027
- MA_API void ma_copy_and_apply_volume_factor_f32(float* pSamplesOut, const float* pSamplesIn, ma_uint64 sampleCount, float factor)
42027
+ MA_API void ma_copy_and_apply_volume_factor_f32(float* MA_RESTRICT pSamplesOut, const float* MA_RESTRICT pSamplesIn, ma_uint64 sampleCount, float factor)
42028
42028
{
42029
42029
ma_uint64 iSample;
42030
42030
@@ -44594,7 +44594,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_f32__direct_form_2_transposed(
44594
44594
const float a1 = pBQ->a1.f32;
44595
44595
const float a2 = pBQ->a2.f32;
44596
44596
44597
- MA_ASSUME(channels > 0);
44597
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
44598
+ #pragma clang loop unroll(disable)
44598
44599
for (c = 0; c < channels; c += 1) {
44599
44600
float r1 = pBQ->pR1[c].f32;
44600
44601
float r2 = pBQ->pR2[c].f32;
@@ -44626,7 +44627,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_s16__direct_form_2_transposed(
44626
44627
const ma_int32 a1 = pBQ->a1.s32;
44627
44628
const ma_int32 a2 = pBQ->a2.s32;
44628
44629
44629
- MA_ASSUME(channels > 0);
44630
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
44631
+ #pragma clang loop unroll(disable)
44630
44632
for (c = 0; c < channels; c += 1) {
44631
44633
ma_int32 r1 = pBQ->pR1[c].s32;
44632
44634
ma_int32 r2 = pBQ->pR2[c].s32;
@@ -44900,22 +44902,23 @@ MA_API ma_result ma_lpf1_clear_cache(ma_lpf1* pLPF)
44900
44902
return MA_SUCCESS;
44901
44903
}
44902
44904
44903
- static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1* pLPF, float* pY, const float* pX)
44905
+ static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1 * pLPF, float * pY, const float * pX)
44904
44906
{
44905
44907
ma_uint32 c;
44906
44908
const ma_uint32 channels = pLPF->channels;
44907
44909
const float a = pLPF->a.f32;
44908
44910
const float b = 1 - a;
44909
44911
44910
- MA_ASSUME(channels > 0);
44912
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
44913
+ #pragma clang loop unroll(disable)
44911
44914
for (c = 0; c < channels; c += 1) {
44912
44915
float r1 = pLPF->pR1[c].f32;
44913
- float x = pX[c];
44916
+ float x = pX[c];
44914
44917
float y;
44915
44918
44916
- y = b* x + a* r1;
44919
+ y = b * x + a * r1;
44917
44920
44918
- pY[c] = y;
44921
+ pY[c] = y;
44919
44922
pLPF->pR1[c].f32 = y;
44920
44923
}
44921
44924
}
@@ -44927,7 +44930,8 @@ static MA_INLINE void ma_lpf1_process_pcm_frame_s16(ma_lpf1* pLPF, ma_int16* pY,
44927
44930
const ma_int32 a = pLPF->a.s32;
44928
44931
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
44929
44932
44930
- MA_ASSUME(channels > 0);
44933
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
44934
+ #pragma clang loop unroll(disable)
44931
44935
for (c = 0; c < channels; c += 1) {
44932
44936
ma_int32 r1 = pLPF->pR1[c].s32;
44933
44937
ma_int32 x = pX[c];
@@ -45780,7 +45784,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_f32(ma_hpf1* pHPF, float* pY, co
45780
45784
const float a = 1 - pHPF->a.f32;
45781
45785
const float b = 1 - a;
45782
45786
45783
- MA_ASSUME(channels > 0 );
45787
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
45784
45788
for (c = 0; c < channels; c += 1) {
45785
45789
float r1 = pHPF->pR1[c].f32;
45786
45790
float x = pX[c];
@@ -45800,7 +45804,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_s16(ma_hpf1* pHPF, ma_int16* pY,
45800
45804
const ma_int32 a = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - pHPF->a.s32);
45801
45805
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
45802
45806
45803
- MA_ASSUME(channels > 0 );
45807
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
45804
45808
for (c = 0; c < channels; c += 1) {
45805
45809
ma_int32 r1 = pHPF->pR1[c].s32;
45806
45810
ma_int32 x = pX[c];
@@ -47908,6 +47912,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
47908
47912
ma_uint64 iFrame;
47909
47913
ma_uint32 iChannel;
47910
47914
ma_uint64 interpolatedFrameCount;
47915
+ const ma_uint32 channels = pGainer->config.channels;
47911
47916
47912
47917
MA_ASSERT(pGainer != NULL);
47913
47918
@@ -47947,12 +47952,12 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
47947
47952
float a = (float)pGainer->t / pGainer->config.smoothTimeInFrames;
47948
47953
float d = 1.0f / pGainer->config.smoothTimeInFrames;
47949
47954
47950
- if (pGainer->config. channels <= 32) {
47955
+ if (channels <= 32) {
47951
47956
float pRunningGain[32];
47952
47957
float pRunningGainDelta[32]; /* Could this be heap-allocated as part of the ma_gainer object? */
47953
47958
47954
47959
/* Initialize the running gain. */
47955
- for (iChannel = 0; iChannel < pGainer->config. channels; iChannel += 1) {
47960
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
47956
47961
float t = (pGainer->pOldGains[iChannel] - pGainer->pNewGains[iChannel]) * pGainer->masterVolume;
47957
47962
pRunningGainDelta[iChannel] = t * d;
47958
47963
pRunningGain[iChannel] = (pGainer->pOldGains[iChannel] * pGainer->masterVolume) + (t * a);
@@ -47961,7 +47966,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
47961
47966
iFrame = 0;
47962
47967
47963
47968
/* Optimized paths for common channel counts. This is mostly just experimenting with some SIMD ideas. It's not necessarily final. */
47964
- if (pGainer->config. channels == 2) {
47969
+ if (channels == 2) {
47965
47970
#if defined(MA_SUPPORT_SSE2)
47966
47971
if (ma_has_sse2()) {
47967
47972
ma_uint64 unrolledLoopCount = interpolatedFrameCount >> 1;
@@ -48009,6 +48014,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48009
48014
48010
48015
iFrame = unrolledLoopCount << 1;
48011
48016
#else
48017
+ #pragma clang loop vectorize(enable)
48012
48018
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48013
48019
for (iChannel = 0; iChannel < 2; iChannel += 1) {
48014
48020
pFramesOutF32[iFrame*2 + iChannel] = pFramesInF32[iFrame*2 + iChannel] * pRunningGain[iChannel];
@@ -48020,7 +48026,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48020
48026
}
48021
48027
#endif
48022
48028
}
48023
- } else if (pGainer->config. channels == 6) {
48029
+ } else if (channels == 6) {
48024
48030
#if defined(MA_SUPPORT_SSE2)
48025
48031
if (ma_has_sse2()) {
48026
48032
/*
@@ -48053,6 +48059,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48053
48059
} else
48054
48060
#endif
48055
48061
{
48062
+ #pragma clang loop vectorize(enable)
48056
48063
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48057
48064
for (iChannel = 0; iChannel < 6; iChannel += 1) {
48058
48065
pFramesOutF32[iFrame*6 + iChannel] = pFramesInF32[iFrame*6 + iChannel] * pRunningGain[iChannel];
@@ -48064,7 +48071,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48064
48071
}
48065
48072
}
48066
48073
}
48067
- } else if (pGainer->config. channels == 8) {
48074
+ } else if (channels == 8) {
48068
48075
/* For 8 channels we can just go over frame by frame and do all eight channels as 2 separate 4x SIMD operations. */
48069
48076
#if defined(MA_SUPPORT_SSE2)
48070
48077
if (ma_has_sse2()) {
@@ -48084,6 +48091,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48084
48091
#endif
48085
48092
{
48086
48093
/* This is crafted so that it auto-vectorizes when compiled with Clang. */
48094
+ #pragma clang loop vectorize(enable)
48087
48095
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48088
48096
for (iChannel = 0; iChannel < 8; iChannel += 1) {
48089
48097
pFramesOutF32[iFrame*8 + iChannel] = pFramesInF32[iFrame*8 + iChannel] * pRunningGain[iChannel];
@@ -48097,17 +48105,21 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48097
48105
}
48098
48106
}
48099
48107
48108
+ #pragma clang loop unroll(disable)
48100
48109
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48101
- for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48102
- pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * pRunningGain[iChannel];
48110
+ #pragma clang loop vectorize(enable)
48111
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
48112
+ pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * pRunningGain[iChannel];
48103
48113
pRunningGain[iChannel] += pRunningGainDelta[iChannel];
48104
48114
}
48105
48115
}
48106
48116
} else {
48107
48117
/* Slower path for extreme channel counts where we can't fit enough on the stack. We could also move this to the heap as part of the ma_gainer object which might even be better since it'll only be updated when the gains actually change. */
48118
+ #pragma clang loop unroll(disable)
48108
48119
for (iFrame = 0; iFrame < interpolatedFrameCount; iFrame += 1) {
48109
- for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48110
- pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
48120
+ #pragma clang loop vectorize(enable)
48121
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
48122
+ pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
48111
48123
}
48112
48124
48113
48125
a += d;
@@ -48126,18 +48138,21 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48126
48138
48127
48139
/* All we need to do here is apply the new gains using an optimized path. */
48128
48140
if (pFramesOut != NULL && pFramesIn != NULL) {
48129
- if (pGainer->config. channels <= 32) {
48141
+ if (channels <= 32) {
48130
48142
float gains[32];
48131
- for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48143
+ #pragma clang loop unroll(disable)
48144
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
48132
48145
gains[iChannel] = pGainer->pNewGains[iChannel] * pGainer->masterVolume;
48133
48146
}
48134
48147
48135
- ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, pGainer->config. channels, gains);
48148
+ ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, channels, gains);
48136
48149
} else {
48137
48150
/* Slow path. Too many channels to fit on the stack. Need to apply a master volume as a separate path. */
48151
+ #pragma clang loop unroll(disable)
48138
48152
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
48139
- for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48140
- ((float*)pFramesOut)[iFrame*pGainer->config.channels + iChannel] = ((const float*)pFramesIn)[iFrame*pGainer->config.channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
48153
+ #pragma clang loop vectorize(enable)
48154
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
48155
+ ((float*)pFramesOut)[iFrame*channels + iChannel] = ((const float*)pFramesIn)[iFrame*channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
48141
48156
}
48142
48157
}
48143
48158
}
@@ -50498,7 +50513,7 @@ static void ma_linear_resampler_interpolate_frame_s16(ma_linear_resampler* pResa
50498
50513
50499
50514
a = (pResampler->inTimeFrac << shift) / pResampler->config.sampleRateOut;
50500
50515
50501
- MA_ASSUME(channels > 0 );
50516
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
50502
50517
for (c = 0; c < channels; c += 1) {
50503
50518
ma_int16 s = ma_linear_resampler_mix_s16(pResampler->x0.s16[c], pResampler->x1.s16[c], a, shift);
50504
50519
pFrameOut[c] = s;
@@ -50517,7 +50532,7 @@ static void ma_linear_resampler_interpolate_frame_f32(ma_linear_resampler* pResa
50517
50532
50518
50533
a = (float)pResampler->inTimeFrac / pResampler->config.sampleRateOut;
50519
50534
50520
- MA_ASSUME(channels > 0 );
50535
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
50521
50536
for (c = 0; c < channels; c += 1) {
50522
50537
float s = ma_mix_f32_fast(pResampler->x0.f32[c], pResampler->x1.f32[c], a);
50523
50538
pFrameOut[c] = s;
@@ -51752,6 +51767,7 @@ static void ma_channel_map_apply_shuffle_table_u8(ma_uint8* pFramesOut, ma_uint3
51752
51767
ma_uint64 iFrame;
51753
51768
ma_uint32 iChannelOut;
51754
51769
51770
+ #pragma clang loop unroll(disable)
51755
51771
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
51756
51772
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
51757
51773
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -51772,6 +51788,7 @@ static void ma_channel_map_apply_shuffle_table_s16(ma_int16* pFramesOut, ma_uint
51772
51788
ma_uint64 iFrame;
51773
51789
ma_uint32 iChannelOut;
51774
51790
51791
+ #pragma clang loop unroll(disable)
51775
51792
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
51776
51793
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
51777
51794
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -51814,6 +51831,7 @@ static void ma_channel_map_apply_shuffle_table_s32(ma_int32* pFramesOut, ma_uint
51814
51831
ma_uint64 iFrame;
51815
51832
ma_uint32 iChannelOut;
51816
51833
51834
+ #pragma clang loop unroll(disable)
51817
51835
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
51818
51836
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
51819
51837
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -51834,6 +51852,7 @@ static void ma_channel_map_apply_shuffle_table_f32(float* pFramesOut, ma_uint32
51834
51852
ma_uint64 iFrame;
51835
51853
ma_uint32 iChannelOut;
51836
51854
51855
+ #pragma clang loop unroll(disable)
51837
51856
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
51838
51857
for (iChannelOut = 0; iChannelOut < channelsOut; iChannelOut += 1) {
51839
51858
ma_uint8 iChannelIn = pShuffleTable[iChannelOut];
@@ -52068,6 +52087,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
52068
52087
} else
52069
52088
#endif
52070
52089
{
52090
+ #pragma clang loop vectorize(enable)
52071
52091
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52072
52092
for (iChannelOut = 0; iChannelOut < 2; iChannelOut += 1) {
52073
52093
pFramesOut[iFrame*2 + iChannelOut] = pFramesIn[iFrame];
@@ -52095,6 +52115,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
52095
52115
} else
52096
52116
#endif
52097
52117
{
52118
+ #pragma clang loop vectorize(enable)
52098
52119
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52099
52120
for (iChannelOut = 0; iChannelOut < 6; iChannelOut += 1) {
52100
52121
pFramesOut[iFrame*6 + iChannelOut] = pFramesIn[iFrame];
@@ -52112,6 +52133,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
52112
52133
} else
52113
52134
#endif
52114
52135
{
52136
+ #pragma clang loop vectorize(enable)
52115
52137
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52116
52138
for (iChannelOut = 0; iChannelOut < 8; iChannelOut += 1) {
52117
52139
pFramesOut[iFrame*8 + iChannelOut] = pFramesIn[iFrame];
@@ -65280,7 +65302,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__white(ma_noise* pNoise, voi
65280
65302
ma_uint64 iFrame;
65281
65303
ma_uint32 iChannel;
65282
65304
const ma_uint32 channels = pNoise->config.channels;
65283
- MA_ASSUME(channels > 0 );
65305
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
65284
65306
65285
65307
if (pNoise->config.format == ma_format_f32) {
65286
65308
float* pFramesOutF32 = (float*)pFramesOut;
@@ -65399,7 +65421,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__pink(ma_noise* pNoise, void
65399
65421
ma_uint64 iFrame;
65400
65422
ma_uint32 iChannel;
65401
65423
const ma_uint32 channels = pNoise->config.channels;
65402
- MA_ASSUME(channels > 0 );
65424
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
65403
65425
65404
65426
if (pNoise->config.format == ma_format_f32) {
65405
65427
float* pFramesOutF32 = (float*)pFramesOut;
@@ -65481,7 +65503,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__brownian(ma_noise* pNoise,
65481
65503
ma_uint64 iFrame;
65482
65504
ma_uint32 iChannel;
65483
65505
const ma_uint32 channels = pNoise->config.channels;
65484
- MA_ASSUME(channels > 0 );
65506
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
65485
65507
65486
65508
if (pNoise->config.format == ma_format_f32) {
65487
65509
float* pFramesOutF32 = (float*)pFramesOut;
@@ -69687,7 +69709,7 @@ MA_API void ma_debug_fill_pcm_frames_with_sine_wave(float* pFramesOut, ma_uint32
69687
69709
69688
69710
69689
69711
69690
- static ma_result ma_mix_pcm_frames_f32(float* pDst, const float* pSrc, ma_uint64 frameCount, ma_uint32 channels, float volume)
69712
+ static ma_result ma_mix_pcm_frames_f32(float* MA_RESTRICT pDst, const float* MA_RESTRICT pSrc, ma_uint64 frameCount, ma_uint32 channels, float volume)
69691
69713
{
69692
69714
ma_uint64 iSample;
69693
69715
ma_uint64 sampleCount;
@@ -69703,10 +69725,12 @@ static ma_result ma_mix_pcm_frames_f32(float* pDst, const float* pSrc, ma_uint64
69703
69725
sampleCount = frameCount * channels;
69704
69726
69705
69727
if (volume == 1) {
69728
+ #pragma clang loop vectorize(enable)
69706
69729
for (iSample = 0; iSample < sampleCount; iSample += 1) {
69707
69730
pDst[iSample] += pSrc[iSample];
69708
69731
}
69709
69732
} else {
69733
+ #pragma clang loop vectorize(enable)
69710
69734
for (iSample = 0; iSample < sampleCount; iSample += 1) {
69711
69735
pDst[iSample] += ma_apply_volume_unclipped_f32(pSrc[iSample], volume);
69712
69736
}
0 commit comments