@@ -42835,7 +42835,7 @@ MA_API void ma_copy_and_apply_volume_factor_s32(ma_int32* pSamplesOut, const ma_
42835
42835
}
42836
42836
}
42837
42837
42838
- MA_API void ma_copy_and_apply_volume_factor_f32(float* pSamplesOut, const float* pSamplesIn, ma_uint64 sampleCount, float factor)
42838
+ MA_API void ma_copy_and_apply_volume_factor_f32(float* MA_RESTRICT pSamplesOut, const float* MA_RESTRICT pSamplesIn, ma_uint64 sampleCount, float factor)
42839
42839
{
42840
42840
ma_uint64 iSample;
42841
42841
@@ -43130,10 +43130,12 @@ MA_API ma_result ma_mix_pcm_frames_f32(float* pDst, const float* pSrc, ma_uint64
43130
43130
sampleCount = frameCount * channels;
43131
43131
43132
43132
if (volume == 1) {
43133
+ #pragma clang loop vectorize(enable)
43133
43134
for (iSample = 0; iSample < sampleCount; iSample += 1) {
43134
43135
pDst[iSample] += pSrc[iSample];
43135
43136
}
43136
43137
} else {
43138
+ #pragma clang loop vectorize(enable)
43137
43139
for (iSample = 0; iSample < sampleCount; iSample += 1) {
43138
43140
pDst[iSample] += ma_apply_volume_unclipped_f32(pSrc[iSample], volume);
43139
43141
}
@@ -45434,7 +45436,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_f32__direct_form_2_transposed(
45434
45436
const float a1 = pBQ->a1.f32;
45435
45437
const float a2 = pBQ->a2.f32;
45436
45438
45437
- MA_ASSUME(channels > 0);
45439
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45440
+ #pragma clang loop vectorize(assume_safety)
45438
45441
for (c = 0; c < channels; c += 1) {
45439
45442
float r1 = pBQ->pR1[c].f32;
45440
45443
float r2 = pBQ->pR2[c].f32;
@@ -45466,7 +45469,8 @@ static MA_INLINE void ma_biquad_process_pcm_frame_s16__direct_form_2_transposed(
45466
45469
const ma_int32 a1 = pBQ->a1.s32;
45467
45470
const ma_int32 a2 = pBQ->a2.s32;
45468
45471
45469
- MA_ASSUME(channels > 0);
45472
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45473
+ #pragma clang loop vectorize(assume_safety)
45470
45474
for (c = 0; c < channels; c += 1) {
45471
45475
ma_int32 r1 = pBQ->pR1[c].s32;
45472
45476
ma_int32 r2 = pBQ->pR2[c].s32;
@@ -45740,22 +45744,23 @@ MA_API ma_result ma_lpf1_clear_cache(ma_lpf1* pLPF)
45740
45744
return MA_SUCCESS;
45741
45745
}
45742
45746
45743
/*
Filters a single frame of f32 samples, one sample per channel.

For every channel c in [0, pLPF->channels) this computes

    y = b*x + a*r1

where a is pLPF->a.f32, b is 1 - a, x is pX[c] and r1 is the value currently held in
pLPF->pR1[c].f32. The result is written to pY[c] and also stored back into
pLPF->pR1[c].f32, so the next invocation reads it as r1.

pLPF - Filter state. channels and a are read; pR1[c] is read and then overwritten.
pY   - Output; receives `channels` floats.
pX   - Input; `channels` floats are read.

NOTE(review): as rendered here, the -/+ pairs for the signature and for the x, y and pY
lines differ only in spacing. The functional additions in this hunk are the tighter
MA_ASSUME and the loop pragma below.
*/
- static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1* pLPF, float* pY, const float* pX)
45747
+ static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1 * pLPF, float * pY, const float * pX)
45744
45748
{
45745
45749
ma_uint32 c;
45746
45750
const ma_uint32 channels = pLPF->channels;
45747
45751
const float a = pLPF->a.f32;
45748
45752
/* b is the complement of a, so the two weights in y = b*x + a*r1 sum to 1. */
const float b = 1 - a;
45749
45753
45750
/*
The old hint only stated channels > 0. The new one bounds the count on both sides using
MA_MIN_CHANNELS and MA_MAX_CHANNELS, which are defined outside this view.
*/
- MA_ASSUME(channels > 0);
45754
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45755
/*
NOTE(review): vectorize(assume_safety) asks Clang to vectorize without proving the absence
of loop-carried memory dependences. None of the pointers in this signature are marked as
non-aliasing, so this presumably relies on pY, pX and pLPF->pR1 never partially overlapping.
TODO confirm against callers.
*/
+ #pragma clang loop vectorize(assume_safety)
45751
45756
for (c = 0; c < channels; c += 1) {
45752
45757
float r1 = pLPF->pR1[c].f32;
45753
- float x = pX[c];
45758
+ float x = pX[c];
45754
45759
float y;
45755
45760
45756
- y = b* x + a* r1;
45761
+ y = b * x + a * r1;
45757
45762
45758
- pY[c] = y;
45763
+ pY[c] = y;
45759
45764
/* Store the output back into the per-channel state; it is read as r1 at the top of this loop. */
pLPF->pR1[c].f32 = y;
45760
45765
}
45761
45766
}
@@ -45767,7 +45772,8 @@ static MA_INLINE void ma_lpf1_process_pcm_frame_s16(ma_lpf1* pLPF, ma_int16* pY,
45767
45772
const ma_int32 a = pLPF->a.s32;
45768
45773
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
45769
45774
45770
- MA_ASSUME(channels > 0);
45775
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
45776
+ #pragma clang loop vectorize(assume_safety)
45771
45777
for (c = 0; c < channels; c += 1) {
45772
45778
ma_int32 r1 = pLPF->pR1[c].s32;
45773
45779
ma_int32 x = pX[c];
@@ -46620,7 +46626,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_f32(ma_hpf1* pHPF, float* pY, co
46620
46626
const float a = 1 - pHPF->a.f32;
46621
46627
const float b = 1 - a;
46622
46628
46623
- MA_ASSUME(channels > 0 );
46629
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
46624
46630
for (c = 0; c < channels; c += 1) {
46625
46631
float r1 = pHPF->pR1[c].f32;
46626
46632
float x = pX[c];
@@ -46640,7 +46646,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_s16(ma_hpf1* pHPF, ma_int16* pY,
46640
46646
const ma_int32 a = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - pHPF->a.s32);
46641
46647
const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
46642
46648
46643
- MA_ASSUME(channels > 0 );
46649
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
46644
46650
for (c = 0; c < channels; c += 1) {
46645
46651
ma_int32 r1 = pHPF->pR1[c].s32;
46646
46652
ma_int32 x = pX[c];
@@ -48748,6 +48754,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48748
48754
ma_uint64 iFrame;
48749
48755
ma_uint32 iChannel;
48750
48756
ma_uint64 interpolatedFrameCount;
48757
+ const ma_uint32 channels = pGainer->config.channels;
48751
48758
48752
48759
MA_ASSERT(pGainer != NULL);
48753
48760
@@ -48787,12 +48794,12 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48787
48794
float a = (float)pGainer->t / pGainer->config.smoothTimeInFrames;
48788
48795
float d = 1.0f / pGainer->config.smoothTimeInFrames;
48789
48796
48790
- if (pGainer->config. channels <= 32) {
48797
+ if (channels <= 32) {
48791
48798
float pRunningGain[32];
48792
48799
float pRunningGainDelta[32]; /* Could this be heap-allocated as part of the ma_gainer object? */
48793
48800
48794
48801
/* Initialize the running gain. */
48795
- for (iChannel = 0; iChannel < pGainer->config. channels; iChannel += 1) {
48802
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
48796
48803
float t = (pGainer->pNewGains[iChannel] - pGainer->pOldGains[iChannel]) * pGainer->masterVolume;
48797
48804
pRunningGainDelta[iChannel] = t * d;
48798
48805
pRunningGain[iChannel] = (pGainer->pOldGains[iChannel] * pGainer->masterVolume) + (t * a);
@@ -48801,7 +48808,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48801
48808
iFrame = 0;
48802
48809
48803
48810
/* Optimized paths for common channel counts. This is mostly just experimenting with some SIMD ideas. It's not necessarily final. */
48804
- if (pGainer->config. channels == 2) {
48811
+ if (channels == 2) {
48805
48812
#if defined(MA_SUPPORT_SSE2)
48806
48813
if (ma_has_sse2()) {
48807
48814
ma_uint64 unrolledLoopCount = interpolatedFrameCount >> 1;
@@ -48849,6 +48856,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48849
48856
48850
48857
iFrame = unrolledLoopCount << 1;
48851
48858
#else
48859
+ #pragma clang loop vectorize(enable)
48852
48860
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48853
48861
for (iChannel = 0; iChannel < 2; iChannel += 1) {
48854
48862
pFramesOutF32[iFrame*2 + iChannel] = pFramesInF32[iFrame*2 + iChannel] * pRunningGain[iChannel];
@@ -48860,7 +48868,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48860
48868
}
48861
48869
#endif
48862
48870
}
48863
- } else if (pGainer->config. channels == 6) {
48871
+ } else if (channels == 6) {
48864
48872
#if defined(MA_SUPPORT_SSE2)
48865
48873
if (ma_has_sse2()) {
48866
48874
/*
@@ -48904,7 +48912,7 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48904
48912
}
48905
48913
}
48906
48914
}
48907
- } else if (pGainer->config. channels == 8) {
48915
+ } else if (channels == 8) {
48908
48916
/* For 8 channels we can just go over frame by frame and do all eight channels as 2 separate 4x SIMD operations. */
48909
48917
#if defined(MA_SUPPORT_SSE2)
48910
48918
if (ma_has_sse2()) {
@@ -48925,29 +48933,35 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48925
48933
{
48926
48934
/* This is crafted so that it auto-vectorizes when compiled with Clang. */
48927
48935
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48936
+ #pragma clang loop vectorize(enable)
48928
48937
for (iChannel = 0; iChannel < 8; iChannel += 1) {
48929
48938
pFramesOutF32[iFrame*8 + iChannel] = pFramesInF32[iFrame*8 + iChannel] * pRunningGain[iChannel];
48930
48939
}
48931
48940
48932
48941
/* Move the running gain forward towards the new gain. */
48942
+ #pragma clang loop vectorize(enable)
48933
48943
for (iChannel = 0; iChannel < 8; iChannel += 1) {
48934
48944
pRunningGain[iChannel] += pRunningGainDelta[iChannel];
48935
48945
}
48936
48946
}
48937
48947
}
48938
48948
}
48939
48949
48950
+ #pragma clang loop unroll(disable)
48940
48951
for (; iFrame < interpolatedFrameCount; iFrame += 1) {
48941
- for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48942
- pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * pRunningGain[iChannel];
48952
+ #pragma clang loop vectorize(enable)
48953
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
48954
+ pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * pRunningGain[iChannel];
48943
48955
pRunningGain[iChannel] += pRunningGainDelta[iChannel];
48944
48956
}
48945
48957
}
48946
48958
} else {
48947
48959
/* Slower path for extreme channel counts where we can't fit enough on the stack. We could also move this to the heap as part of the ma_gainer object which might even be better since it'll only be updated when the gains actually change. */
48960
+ #pragma clang loop unroll(disable)
48948
48961
for (iFrame = 0; iFrame < interpolatedFrameCount; iFrame += 1) {
48949
- for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48950
- pFramesOutF32[iFrame*pGainer->config.channels + iChannel] = pFramesInF32[iFrame*pGainer->config.channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
48962
+ #pragma clang loop vectorize(enable)
48963
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
48964
+ pFramesOutF32[iFrame*channels + iChannel] = pFramesInF32[iFrame*channels + iChannel] * ma_mix_f32_fast(pGainer->pOldGains[iChannel], pGainer->pNewGains[iChannel], a) * pGainer->masterVolume;
48951
48965
}
48952
48966
48953
48967
a += d;
@@ -48966,18 +48980,20 @@ static /*__attribute__((noinline))*/ ma_result ma_gainer_process_pcm_frames_inte
48966
48980
48967
48981
/* All we need to do here is apply the new gains using an optimized path. */
48968
48982
if (pFramesOut != NULL && pFramesIn != NULL) {
48969
- if (pGainer->config. channels <= 32) {
48983
+ if (channels <= 32) {
48970
48984
float gains[32];
48971
- for (iChannel = 0; iChannel < pGainer->config. channels; iChannel += 1) {
48985
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
48972
48986
gains[iChannel] = pGainer->pNewGains[iChannel] * pGainer->masterVolume;
48973
48987
}
48974
48988
48975
- ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, pGainer->config. channels, gains);
48989
+ ma_copy_and_apply_volume_factor_per_channel_f32((float*)pFramesOut, (const float*)pFramesIn, frameCount, channels, gains);
48976
48990
} else {
48977
48991
/* Slow path. Too many channels to fit on the stack. Need to apply a master volume as a separate path. */
48992
+ #pragma clang loop unroll(disable)
48978
48993
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
48979
- for (iChannel = 0; iChannel < pGainer->config.channels; iChannel += 1) {
48980
- ((float*)pFramesOut)[iFrame*pGainer->config.channels + iChannel] = ((const float*)pFramesIn)[iFrame*pGainer->config.channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
48994
+ #pragma clang loop vectorize(enable)
48995
+ for (iChannel = 0; iChannel < channels; iChannel += 1) {
48996
+ ((float*)pFramesOut)[iFrame*channels + iChannel] = ((const float*)pFramesIn)[iFrame*channels + iChannel] * pGainer->pNewGains[iChannel] * pGainer->masterVolume;
48981
48997
}
48982
48998
}
48983
48999
}
@@ -51347,7 +51363,7 @@ static void ma_linear_resampler_interpolate_frame_s16(ma_linear_resampler* pResa
51347
51363
51348
51364
a = (pResampler->inTimeFrac << shift) / pResampler->config.sampleRateOut;
51349
51365
51350
- MA_ASSUME(channels > 0 );
51366
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
51351
51367
for (c = 0; c < channels; c += 1) {
51352
51368
ma_int16 s = ma_linear_resampler_mix_s16(pResampler->x0.s16[c], pResampler->x1.s16[c], a, shift);
51353
51369
pFrameOut[c] = s;
@@ -51366,7 +51382,7 @@ static void ma_linear_resampler_interpolate_frame_f32(ma_linear_resampler* pResa
51366
51382
51367
51383
a = (float)pResampler->inTimeFrac / pResampler->config.sampleRateOut;
51368
51384
51369
- MA_ASSUME(channels > 0 );
51385
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
51370
51386
for (c = 0; c < channels; c += 1) {
51371
51387
float s = ma_mix_f32_fast(pResampler->x0.f32[c], pResampler->x1.f32[c], a);
51372
51388
pFrameOut[c] = s;
@@ -51533,7 +51549,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16(ma_linear_resampler*
51533
51549
}
51534
51550
51535
51551
51536
- static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear_resampler* pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
51552
+ static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear_resampler* MA_RESTRICT pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
51537
51553
{
51538
51554
const float* pFramesInF32;
51539
51555
/* */ float* pFramesOutF32;
@@ -51559,12 +51575,14 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear
51559
51575
ma_uint32 iChannel;
51560
51576
51561
51577
if (pFramesInF32 != NULL) {
51578
+ #pragma clang loop vectorize(assume_safety)
51562
51579
for (iChannel = 0; iChannel < pResampler->config.channels; iChannel += 1) {
51563
51580
pResampler->x0.f32[iChannel] = pResampler->x1.f32[iChannel];
51564
51581
pResampler->x1.f32[iChannel] = pFramesInF32[iChannel];
51565
51582
}
51566
51583
pFramesInF32 += pResampler->config.channels;
51567
51584
} else {
51585
+ #pragma clang loop vectorize(assume_safety)
51568
51586
for (iChannel = 0; iChannel < pResampler->config.channels; iChannel += 1) {
51569
51587
pResampler->x0.f32[iChannel] = pResampler->x1.f32[iChannel];
51570
51588
pResampler->x1.f32[iChannel] = 0;
@@ -51607,7 +51625,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear
51607
51625
return MA_SUCCESS;
51608
51626
}
51609
51627
51610
- static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_resampler* pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
51628
+ static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_resampler* MA_RESTRICT pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
51611
51629
{
51612
51630
const float* pFramesInF32;
51613
51631
/* */ float* pFramesOutF32;
@@ -51633,12 +51651,14 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_r
51633
51651
ma_uint32 iChannel;
51634
51652
51635
51653
if (pFramesInF32 != NULL) {
51654
+ #pragma clang loop vectorize(assume_safety)
51636
51655
for (iChannel = 0; iChannel < pResampler->config.channels; iChannel += 1) {
51637
51656
pResampler->x0.f32[iChannel] = pResampler->x1.f32[iChannel];
51638
51657
pResampler->x1.f32[iChannel] = pFramesInF32[iChannel];
51639
51658
}
51640
51659
pFramesInF32 += pResampler->config.channels;
51641
51660
} else {
51661
+ #pragma clang loop vectorize(assume_safety)
51642
51662
for (iChannel = 0; iChannel < pResampler->config.channels; iChannel += 1) {
51643
51663
pResampler->x0.f32[iChannel] = pResampler->x1.f32[iChannel];
51644
51664
pResampler->x1.f32[iChannel] = 0;
@@ -52918,6 +52938,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
52918
52938
#endif
52919
52939
{
52920
52940
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52941
+ #pragma clang loop vectorize(enable)
52921
52942
for (iChannelOut = 0; iChannelOut < 2; iChannelOut += 1) {
52922
52943
pFramesOut[iFrame*2 + iChannelOut] = pFramesIn[iFrame];
52923
52944
}
@@ -52945,6 +52966,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
52945
52966
#endif
52946
52967
{
52947
52968
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52969
+ #pragma clang loop vectorize(enable)
52948
52970
for (iChannelOut = 0; iChannelOut < 6; iChannelOut += 1) {
52949
52971
pFramesOut[iFrame*6 + iChannelOut] = pFramesIn[iFrame];
52950
52972
}
@@ -52962,6 +52984,7 @@ static ma_result ma_channel_map_apply_mono_in_f32(float* MA_RESTRICT pFramesOut,
52962
52984
#endif
52963
52985
{
52964
52986
for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
52987
+ #pragma clang loop vectorize(enable)
52965
52988
for (iChannelOut = 0; iChannelOut < 8; iChannelOut += 1) {
52966
52989
pFramesOut[iFrame*8 + iChannelOut] = pFramesIn[iFrame];
52967
52990
}
@@ -66051,7 +66074,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__white(ma_noise* pNoise, voi
66051
66074
ma_uint64 iFrame;
66052
66075
ma_uint32 iChannel;
66053
66076
const ma_uint32 channels = pNoise->config.channels;
66054
- MA_ASSUME(channels > 0 );
66077
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
66055
66078
66056
66079
if (pNoise->config.format == ma_format_f32) {
66057
66080
float* pFramesOutF32 = (float*)pFramesOut;
@@ -66170,7 +66193,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__pink(ma_noise* pNoise, void
66170
66193
ma_uint64 iFrame;
66171
66194
ma_uint32 iChannel;
66172
66195
const ma_uint32 channels = pNoise->config.channels;
66173
- MA_ASSUME(channels > 0 );
66196
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
66174
66197
66175
66198
if (pNoise->config.format == ma_format_f32) {
66176
66199
float* pFramesOutF32 = (float*)pFramesOut;
@@ -66252,7 +66275,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__brownian(ma_noise* pNoise,
66252
66275
ma_uint64 iFrame;
66253
66276
ma_uint32 iChannel;
66254
66277
const ma_uint32 channels = pNoise->config.channels;
66255
- MA_ASSUME(channels > 0 );
66278
+ MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS );
66256
66279
66257
66280
if (pNoise->config.format == ma_format_f32) {
66258
66281
float* pFramesOutF32 = (float*)pFramesOut;
0 commit comments