blob: 1c16bc43a0cd2f75d115c6da870957e687a5a2eb [file] [log] [blame]
/*
 * Copyright (C) 2016 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
17#ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_SSE_H
18#define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_SSE_H
19
20namespace android {
21
// depends on AudioResamplerFirOps.h, AudioResamplerFirProcess.h
23
24#if USE_SSE
25
// Two-level stringification: TO_STRING(x) lets x be macro-expanded first and then
// passes the result to TO_STRING2, whose # operator turns it into a string literal.
// Calling TO_STRING2 directly stringifies the argument's spelling without expansion.
#define TO_STRING2(x) #x
#define TO_STRING(x) TO_STRING2(x)
// uncomment to print GCC version, may be relevant for intrinsic optimizations
/* #pragma message ("GCC version: " TO_STRING(__GNUC__) \
        "." TO_STRING(__GNUC_MINOR__) \
        "." TO_STRING(__GNUC_PATCHLEVEL__)) */
32
//
// SSEx specializations are enabled for Process() and ProcessL() in AudioResamplerFirProcess.h
//
36
37template <int CHANNELS, int STRIDE, bool FIXED>
38static inline void ProcessSSEIntrinsic(float* out,
39 int count,
40 const float* coefsP,
41 const float* coefsN,
42 const float* sP,
43 const float* sN,
44 const float* volumeLR,
45 float lerpP,
46 const float* coefsP1,
47 const float* coefsN1)
48{
49 ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
Glenn Kasten91164e72016-03-15 15:55:01 -070050 static_assert(CHANNELS == 1 || CHANNELS == 2, "CHANNELS must be 1 or 2");
Henrik Smiding841920d2016-02-15 16:20:45 +010051
52 sP -= CHANNELS*(4-1); // adjust sP for a loop iteration of four
53
54 __m128 interp;
55 if (!FIXED) {
56 interp = _mm_set1_ps(lerpP);
57 }
58
59 __m128 accL, accR;
60 accL = _mm_setzero_ps();
61 if (CHANNELS == 2) {
62 accR = _mm_setzero_ps();
63 }
64
65 do {
66 __m128 posCoef = _mm_load_ps(coefsP);
67 __m128 negCoef = _mm_load_ps(coefsN);
68 coefsP += 4;
69 coefsN += 4;
70
71 if (!FIXED) { // interpolate
72 __m128 posCoef1 = _mm_load_ps(coefsP1);
73 __m128 negCoef1 = _mm_load_ps(coefsN1);
74 coefsP1 += 4;
75 coefsN1 += 4;
76
77 // Calculate the final coefficient for interpolation
78 // posCoef = interp * (posCoef1 - posCoef) + posCoef
79 // negCoef = interp * (negCoef - negCoef1) + negCoef1
80 posCoef1 = _mm_sub_ps(posCoef1, posCoef);
81 negCoef = _mm_sub_ps(negCoef, negCoef1);
82
jaishank8070d8d2019-10-09 14:32:42 +053083
84 #if USE_AVX2
85 posCoef = _mm_fmadd_ps(posCoef1, interp, posCoef);
86 negCoef = _mm_fmadd_ps(negCoef, interp, negCoef1);
87 #else
Henrik Smiding841920d2016-02-15 16:20:45 +010088 posCoef1 = _mm_mul_ps(posCoef1, interp);
89 negCoef = _mm_mul_ps(negCoef, interp);
Henrik Smiding841920d2016-02-15 16:20:45 +010090 posCoef = _mm_add_ps(posCoef1, posCoef);
91 negCoef = _mm_add_ps(negCoef, negCoef1);
jaishank8070d8d2019-10-09 14:32:42 +053092 #endif //USE_AVX2
Henrik Smiding841920d2016-02-15 16:20:45 +010093 }
94 switch (CHANNELS) {
95 case 1: {
96 __m128 posSamp = _mm_loadu_ps(sP);
97 __m128 negSamp = _mm_loadu_ps(sN);
98 sP -= 4;
99 sN += 4;
100
101 posSamp = _mm_shuffle_ps(posSamp, posSamp, 0x1B);
jaishank8070d8d2019-10-09 14:32:42 +0530102
103 #if USE_AVX2
104 accL = _mm_fmadd_ps(posSamp, posCoef, accL);
105 accL = _mm_fmadd_ps(negSamp, negCoef, accL);
106 #else
Henrik Smiding841920d2016-02-15 16:20:45 +0100107 posSamp = _mm_mul_ps(posSamp, posCoef);
108 negSamp = _mm_mul_ps(negSamp, negCoef);
Henrik Smiding841920d2016-02-15 16:20:45 +0100109 accL = _mm_add_ps(accL, posSamp);
110 accL = _mm_add_ps(accL, negSamp);
jaishank8070d8d2019-10-09 14:32:42 +0530111 #endif
112
Henrik Smiding841920d2016-02-15 16:20:45 +0100113 } break;
114 case 2: {
115 __m128 posSamp0 = _mm_loadu_ps(sP);
116 __m128 posSamp1 = _mm_loadu_ps(sP+4);
117 __m128 negSamp0 = _mm_loadu_ps(sN);
118 __m128 negSamp1 = _mm_loadu_ps(sN+4);
119 sP -= 8;
120 sN += 8;
121
122 // deinterleave everything and reverse the positives
123 __m128 posSampL = _mm_shuffle_ps(posSamp1, posSamp0, 0x22);
124 __m128 posSampR = _mm_shuffle_ps(posSamp1, posSamp0, 0x77);
125 __m128 negSampL = _mm_shuffle_ps(negSamp0, negSamp1, 0x88);
126 __m128 negSampR = _mm_shuffle_ps(negSamp0, negSamp1, 0xDD);
127
jaishank8070d8d2019-10-09 14:32:42 +0530128 #if USE_AVX2
129 accL = _mm_fmadd_ps(posSampL, posCoef, accL);
130 accR = _mm_fmadd_ps(posSampR, posCoef, accR);
131 accL = _mm_fmadd_ps(negSampL, negCoef, accL);
132 accR = _mm_fmadd_ps(negSampR, negCoef, accR);
133 #else
134 posSampL = _mm_mul_ps(posSampL, posCoef);
135 posSampR = _mm_mul_ps(posSampR, posCoef);
136 negSampL = _mm_mul_ps(negSampL, negCoef);
137 negSampR = _mm_mul_ps(negSampR, negCoef);
Henrik Smiding841920d2016-02-15 16:20:45 +0100138
jaishank8070d8d2019-10-09 14:32:42 +0530139 accL = _mm_add_ps(accL, posSampL);
140 accR = _mm_add_ps(accR, posSampR);
141 accL = _mm_add_ps(accL, negSampL);
142 accR = _mm_add_ps(accR, negSampR);
143 #endif
144
Henrik Smiding841920d2016-02-15 16:20:45 +0100145 } break;
146 }
147 } while (count -= 4);
148
149 // multiply by volume and save
150 __m128 vLR = _mm_setzero_ps();
151 __m128 outSamp;
152 vLR = _mm_loadl_pi(vLR, reinterpret_cast<const __m64*>(volumeLR));
153 outSamp = _mm_loadl_pi(vLR, reinterpret_cast<__m64*>(out));
154
155 // combine and funnel down accumulator
156 __m128 outAccum = _mm_setzero_ps();
157 if (CHANNELS == 1) {
158 // duplicate accL to both L and R
159 outAccum = _mm_add_ps(accL, _mm_movehl_ps(accL, accL));
160 outAccum = _mm_add_ps(outAccum, _mm_shuffle_ps(outAccum, outAccum, 0x11));
161 } else if (CHANNELS == 2) {
162 // accR contains R, fold in
163 outAccum = _mm_hadd_ps(accL, accR);
164 outAccum = _mm_hadd_ps(outAccum, outAccum);
165 }
jaishank8070d8d2019-10-09 14:32:42 +0530166 #if USE_AVX2
167 outSamp = _mm_fmadd_ps(outAccum, vLR,outSamp);
168 #else
Henrik Smiding841920d2016-02-15 16:20:45 +0100169 outAccum = _mm_mul_ps(outAccum, vLR);
170 outSamp = _mm_add_ps(outSamp, outAccum);
jaishank8070d8d2019-10-09 14:32:42 +0530171 #endif
172
Henrik Smiding841920d2016-02-15 16:20:45 +0100173 _mm_storel_pi(reinterpret_cast<__m64*>(out), outSamp);
174}
175
176template<>
177inline void ProcessL<1, 16>(float* const out,
178 int count,
179 const float* coefsP,
180 const float* coefsN,
181 const float* sP,
182 const float* sN,
183 const float* const volumeLR)
184{
185 ProcessSSEIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
186 0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
187}
188
189template<>
190inline void ProcessL<2, 16>(float* const out,
191 int count,
192 const float* coefsP,
193 const float* coefsN,
194 const float* sP,
195 const float* sN,
196 const float* const volumeLR)
197{
198 ProcessSSEIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
199 0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
200}
201
202template<>
203inline void Process<1, 16>(float* const out,
204 int count,
205 const float* coefsP,
206 const float* coefsN,
207 const float* coefsP1,
208 const float* coefsN1,
209 const float* sP,
210 const float* sN,
211 float lerpP,
212 const float* const volumeLR)
213{
214 ProcessSSEIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
215 lerpP, coefsP1, coefsN1);
216}
217
218template<>
219inline void Process<2, 16>(float* const out,
220 int count,
221 const float* coefsP,
222 const float* coefsN,
223 const float* coefsP1,
224 const float* coefsN1,
225 const float* sP,
226 const float* sN,
227 float lerpP,
228 const float* const volumeLR)
229{
230 ProcessSSEIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
231 lerpP, coefsP1, coefsN1);
232}
233
234#endif //USE_SSE
235
236} // namespace android
237
238#endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_SSE_H*/