/*
2 * Copyright (C) 2013 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H
18#define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H
19
20namespace android {
21
22// depends on AudioResamplerFirOps.h, AudioResamplerFirProcess.h
23
24#if USE_NEON
25//
26// NEON specializations are enabled for Process() and ProcessL()
27//
28// TODO: Stride 16 and Stride 8 can be combined with one pass stride 8 (if necessary)
29// and looping stride 16 (or vice versa). This has some polyphase coef data alignment
30// issues with S16 coefs. Consider this later.
31
// Macros to save a mono/stereo accumulator sample in q0 (and q4) as stereo out.
//
// ASSEMBLY_ACCUMULATE_MONO: folds the four partial sums in q0 (d0,d1) into one
// value, replicates it into both lanes (L = R), scales by the two int32 volumes
// at [%[vLR]] (vqrdmulh: rounded doubling high-half multiply), and
// saturating-adds the resulting L/R pair into the two int32s at %[out].
// Note: %[out] is loaded (vld1) before it is stored (vst1), so any asm block
// using this macro must pass %[out] as a read-write memory operand.
// Clobbers d0-d3 (q0, q1).
#define ASSEMBLY_ACCUMULATE_MONO \
        "vld1.s32        {d2}, [%[vLR]:64]        \n"/* (1) load volumes */\
        "vld1.s32        {d3}, %[out]             \n"/* (2) unaligned load the output */\
        "vpadd.s32       d0, d0, d1               \n"/* (1) add all 4 partial sums */\
        "vpadd.s32       d0, d0, d0               \n"/* (1+4d) and replicate L/R */\
        "vqrdmulh.s32    d0, d0, d2               \n"/* (2+3d) apply volume */\
        "vqadd.s32       d3, d3, d0               \n"/* (1+4d) accumulate result (saturating) */\
        "vst1.s32        {d3}, %[out]             \n"/* (2+2d) store result */
41
// ASSEMBLY_ACCUMULATE_STEREO: folds the left partial sums in q0 (d0,d1) and the
// right partial sums in q4 (d8,d9) into an L/R pair, scales by the two int32
// volumes at [%[vLR]] (vqrdmulh: rounded doubling high-half multiply), and
// saturating-adds the pair into the two int32s at %[out].
// Note: %[out] is loaded (vld1) before it is stored (vst1), so any asm block
// using this macro must pass %[out] as a read-write memory operand.
// Clobbers d0-d3 (q0, q1) and reads/combines d8, d9 (q4).
#define ASSEMBLY_ACCUMULATE_STEREO \
        "vld1.s32        {d2}, [%[vLR]:64]        \n"/* (1) load volumes*/\
        "vld1.s32        {d3}, %[out]             \n"/* (2) unaligned load the output*/\
        "vpadd.s32       d0, d0, d1               \n"/* (1) add all 4 partial sums from q0*/\
        "vpadd.s32       d8, d8, d9               \n"/* (1) add all 4 partial sums from q4*/\
        "vpadd.s32       d0, d0, d8               \n"/* (1+4d) combine into L/R*/\
        "vqrdmulh.s32    d0, d0, d2               \n"/* (2+3d) apply volume*/\
        "vqadd.s32       d3, d3, d0               \n"/* (1+4d) accumulate result (saturating)*/\
        "vst1.s32        {d3}, %[out]             \n"/* (2+2d)store result*/
51
52template <>
53inline void ProcessL<1, 16>(int32_t* const out,
54 int count,
55 const int16_t* coefsP,
56 const int16_t* coefsN,
57 const int16_t* sP,
58 const int16_t* sN,
59 const int32_t* const volumeLR)
60{
61 const int CHANNELS = 1; // template specialization does not preserve params
62 const int STRIDE = 16;
63 sP -= CHANNELS*((STRIDE>>1)-1);
64 asm (
65 "veor q0, q0, q0 \n"// (0 - combines+) accumulator = 0
66
67 "1: \n"
68
69 "vld1.16 {q2}, [%[sP]] \n"// (2+0d) load 8 16-bits mono samples
70 "vld1.16 {q3}, [%[sN]]! \n"// (2) load 8 16-bits mono samples
71 "vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs
72 "vld1.16 {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs
73
74 "vrev64.16 q2, q2 \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4
75
76 // reordering the vmal to do d6, d7 before d4, d5 is slower(?)
77 "vmlal.s16 q0, d4, d17 \n"// (1+0d) multiply (reversed)samples by coef
78 "vmlal.s16 q0, d5, d16 \n"// (1) multiply (reversed)samples by coef
79 "vmlal.s16 q0, d6, d20 \n"// (1) multiply neg samples
80 "vmlal.s16 q0, d7, d21 \n"// (1) multiply neg samples
81
82 // moving these ARM instructions before neon above seems to be slower
83 "subs %[count], %[count], #8 \n"// (1) update loop counter
84 "sub %[sP], %[sP], #16 \n"// (0) move pointer to next set of samples
85
86 // sP used after branch (warning)
87 "bne 1b \n"// loop
88
89 ASSEMBLY_ACCUMULATE_MONO
90
91 : [out] "=Uv" (out[0]),
92 [count] "+r" (count),
93 [coefsP0] "+r" (coefsP),
94 [coefsN0] "+r" (coefsN),
95 [sP] "+r" (sP),
96 [sN] "+r" (sN)
97 : [vLR] "r" (volumeLR)
98 : "cc", "memory",
99 "q0", "q1", "q2", "q3",
100 "q8", "q10"
101 );
102}
103
104template <>
105inline void ProcessL<2, 16>(int32_t* const out,
106 int count,
107 const int16_t* coefsP,
108 const int16_t* coefsN,
109 const int16_t* sP,
110 const int16_t* sN,
111 const int32_t* const volumeLR)
112{
113 const int CHANNELS = 2; // template specialization does not preserve params
114 const int STRIDE = 16;
115 sP -= CHANNELS*((STRIDE>>1)-1);
116 asm (
117 "veor q0, q0, q0 \n"// (1) acc_L = 0
118 "veor q4, q4, q4 \n"// (0 combines+) acc_R = 0
119
120 "1: \n"
121
122 "vld2.16 {q2, q3}, [%[sP]] \n"// (3+0d) load 8 16-bits stereo samples
123 "vld2.16 {q5, q6}, [%[sN]]! \n"// (3) load 8 16-bits stereo samples
124 "vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs
125 "vld1.16 {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs
126
127 "vrev64.16 q2, q2 \n"// (1) reverse 8 frames of the left positive
128 "vrev64.16 q3, q3 \n"// (0 combines+) reverse right positive
129
130 "vmlal.s16 q0, d4, d17 \n"// (1) multiply (reversed) samples left
131 "vmlal.s16 q0, d5, d16 \n"// (1) multiply (reversed) samples left
132 "vmlal.s16 q4, d6, d17 \n"// (1) multiply (reversed) samples right
133 "vmlal.s16 q4, d7, d16 \n"// (1) multiply (reversed) samples right
134 "vmlal.s16 q0, d10, d20 \n"// (1) multiply samples left
135 "vmlal.s16 q0, d11, d21 \n"// (1) multiply samples left
136 "vmlal.s16 q4, d12, d20 \n"// (1) multiply samples right
137 "vmlal.s16 q4, d13, d21 \n"// (1) multiply samples right
138
139 // moving these ARM before neon seems to be slower
140 "subs %[count], %[count], #8 \n"// (1) update loop counter
141 "sub %[sP], %[sP], #32 \n"// (0) move pointer to next set of samples
142
143 // sP used after branch (warning)
144 "bne 1b \n"// loop
145
146 ASSEMBLY_ACCUMULATE_STEREO
147
148 : [out] "=Uv" (out[0]),
149 [count] "+r" (count),
150 [coefsP0] "+r" (coefsP),
151 [coefsN0] "+r" (coefsN),
152 [sP] "+r" (sP),
153 [sN] "+r" (sN)
154 : [vLR] "r" (volumeLR)
155 : "cc", "memory",
156 "q0", "q1", "q2", "q3",
157 "q4", "q5", "q6",
158 "q8", "q10"
159 );
160}
161
162template <>
163inline void Process<1, 16>(int32_t* const out,
164 int count,
165 const int16_t* coefsP,
166 const int16_t* coefsN,
167 const int16_t* coefsP1,
168 const int16_t* coefsN1,
169 const int16_t* sP,
170 const int16_t* sN,
171 uint32_t lerpP,
172 const int32_t* const volumeLR)
173{
174 const int CHANNELS = 1; // template specialization does not preserve params
175 const int STRIDE = 16;
176 sP -= CHANNELS*((STRIDE>>1)-1);
177 asm (
178 "vmov.32 d2[0], %[lerpP] \n"// load the positive phase S32 Q15
179 "veor q0, q0, q0 \n"// (0 - combines+) accumulator = 0
180
181 "1: \n"
182
183 "vld1.16 {q2}, [%[sP]] \n"// (2+0d) load 8 16-bits mono samples
184 "vld1.16 {q3}, [%[sN]]! \n"// (2) load 8 16-bits mono samples
185 "vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs
186 "vld1.16 {q9}, [%[coefsP1]:128]! \n"// (1) load 8 16-bits coefs for interpolation
187 "vld1.16 {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
188 "vld1.16 {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation
189
190 "vsub.s16 q9, q9, q8 \n"// (1) interpolate (step1) 1st set of coefs
191 "vsub.s16 q11, q11, q10 \n"// (1) interpolate (step1) 2nd set of coets
192
193 "vqrdmulh.s16 q9, q9, d2[0] \n"// (2) interpolate (step2) 1st set of coefs
194 "vqrdmulh.s16 q11, q11, d2[0] \n"// (2) interpolate (step2) 2nd set of coefs
195
196 "vrev64.16 q2, q2 \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4
197
198 "vadd.s16 q8, q8, q9 \n"// (1+2d) interpolate (step3) 1st set
199 "vadd.s16 q10, q10, q11 \n"// (1+1d) interpolate (step3) 2nd set
200
201 // reordering the vmal to do d6, d7 before d4, d5 is slower(?)
202 "vmlal.s16 q0, d4, d17 \n"// (1+0d) multiply reversed samples by coef
203 "vmlal.s16 q0, d5, d16 \n"// (1) multiply reversed samples by coef
204 "vmlal.s16 q0, d6, d20 \n"// (1) multiply neg samples
205 "vmlal.s16 q0, d7, d21 \n"// (1) multiply neg samples
206
207 // moving these ARM instructions before neon above seems to be slower
208 "subs %[count], %[count], #8 \n"// (1) update loop counter
209 "sub %[sP], %[sP], #16 \n"// (0) move pointer to next set of samples
210
211 // sP used after branch (warning)
212 "bne 1b \n"// loop
213
214 ASSEMBLY_ACCUMULATE_MONO
215
216 : [out] "=Uv" (out[0]),
217 [count] "+r" (count),
218 [coefsP0] "+r" (coefsP),
219 [coefsN0] "+r" (coefsN),
220 [coefsP1] "+r" (coefsP1),
221 [coefsN1] "+r" (coefsN1),
222 [sP] "+r" (sP),
223 [sN] "+r" (sN)
224 : [lerpP] "r" (lerpP),
225 [vLR] "r" (volumeLR)
226 : "cc", "memory",
227 "q0", "q1", "q2", "q3",
228 "q8", "q9", "q10", "q11"
229 );
230}
231
232template <>
233inline void Process<2, 16>(int32_t* const out,
234 int count,
235 const int16_t* coefsP,
236 const int16_t* coefsN,
237 const int16_t* coefsP1,
238 const int16_t* coefsN1,
239 const int16_t* sP,
240 const int16_t* sN,
241 uint32_t lerpP,
242 const int32_t* const volumeLR)
243{
244 const int CHANNELS = 2; // template specialization does not preserve params
245 const int STRIDE = 16;
246 sP -= CHANNELS*((STRIDE>>1)-1);
247 asm (
248 "vmov.32 d2[0], %[lerpP] \n"// load the positive phase
249 "veor q0, q0, q0 \n"// (1) acc_L = 0
250 "veor q4, q4, q4 \n"// (0 combines+) acc_R = 0
251
252 "1: \n"
253
254 "vld2.16 {q2, q3}, [%[sP]] \n"// (3+0d) load 8 16-bits stereo samples
255 "vld2.16 {q5, q6}, [%[sN]]! \n"// (3) load 8 16-bits stereo samples
256 "vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs
257 "vld1.16 {q9}, [%[coefsP1]:128]! \n"// (1) load 8 16-bits coefs for interpolation
258 "vld1.16 {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
259 "vld1.16 {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation
260
261 "vsub.s16 q9, q9, q8 \n"// (1) interpolate (step1) 1st set of coefs
262 "vsub.s16 q11, q11, q10 \n"// (1) interpolate (step1) 2nd set of coets
263
264 "vqrdmulh.s16 q9, q9, d2[0] \n"// (2) interpolate (step2) 1st set of coefs
265 "vqrdmulh.s16 q11, q11, d2[0] \n"// (2) interpolate (step2) 2nd set of coefs
266
267 "vrev64.16 q2, q2 \n"// (1) reverse 8 frames of the left positive
268 "vrev64.16 q3, q3 \n"// (1) reverse 8 frames of the right positive
269
270 "vadd.s16 q8, q8, q9 \n"// (1+1d) interpolate (step3) 1st set
271 "vadd.s16 q10, q10, q11 \n"// (1+1d) interpolate (step3) 2nd set
272
273 "vmlal.s16 q0, d4, d17 \n"// (1) multiply reversed samples left
274 "vmlal.s16 q0, d5, d16 \n"// (1) multiply reversed samples left
275 "vmlal.s16 q4, d6, d17 \n"// (1) multiply reversed samples right
276 "vmlal.s16 q4, d7, d16 \n"// (1) multiply reversed samples right
277 "vmlal.s16 q0, d10, d20 \n"// (1) multiply samples left
278 "vmlal.s16 q0, d11, d21 \n"// (1) multiply samples left
279 "vmlal.s16 q4, d12, d20 \n"// (1) multiply samples right
280 "vmlal.s16 q4, d13, d21 \n"// (1) multiply samples right
281
282 // moving these ARM before neon seems to be slower
283 "subs %[count], %[count], #8 \n"// (1) update loop counter
284 "sub %[sP], %[sP], #32 \n"// (0) move pointer to next set of samples
285
286 // sP used after branch (warning)
287 "bne 1b \n"// loop
288
289 ASSEMBLY_ACCUMULATE_STEREO
290
291 : [out] "=Uv" (out[0]),
292 [count] "+r" (count),
293 [coefsP0] "+r" (coefsP),
294 [coefsN0] "+r" (coefsN),
295 [coefsP1] "+r" (coefsP1),
296 [coefsN1] "+r" (coefsN1),
297 [sP] "+r" (sP),
298 [sN] "+r" (sN)
299 : [lerpP] "r" (lerpP),
300 [vLR] "r" (volumeLR)
301 : "cc", "memory",
302 "q0", "q1", "q2", "q3",
303 "q4", "q5", "q6",
304 "q8", "q9", "q10", "q11"
305 );
306}
307
308template <>
309inline void ProcessL<1, 16>(int32_t* const out,
310 int count,
311 const int32_t* coefsP,
312 const int32_t* coefsN,
313 const int16_t* sP,
314 const int16_t* sN,
315 const int32_t* const volumeLR)
316{
317 const int CHANNELS = 1; // template specialization does not preserve params
318 const int STRIDE = 16;
319 sP -= CHANNELS*((STRIDE>>1)-1);
320 asm (
321 "veor q0, q0, q0 \n"// result, initialize to 0
322
323 "1: \n"
324
325 "vld1.16 {q2}, [%[sP]] \n"// load 8 16-bits mono samples
326 "vld1.16 {q3}, [%[sN]]! \n"// load 8 16-bits mono samples
327 "vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs
328 "vld1.32 {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs
329
330 "vrev64.16 q2, q2 \n"// reverse 8 frames of the positive side
331
332 "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
333 "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits
334
335 "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits
336 "vshll.s16 q15, d7, #15 \n"// extend samples to 31 bits
337
338 "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef
339 "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef
340 "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
341 "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef
342
343 "vadd.s32 q0, q0, q12 \n"// accumulate result
344 "vadd.s32 q13, q13, q14 \n"// accumulate result
345 "vadd.s32 q0, q0, q15 \n"// accumulate result
346 "vadd.s32 q0, q0, q13 \n"// accumulate result
347
348 "sub %[sP], %[sP], #16 \n"// move pointer to next set of samples
349 "subs %[count], %[count], #8 \n"// update loop counter
350
351 "bne 1b \n"// loop
352
353 ASSEMBLY_ACCUMULATE_MONO
354
355 : [out] "=Uv" (out[0]),
356 [count] "+r" (count),
357 [coefsP0] "+r" (coefsP),
358 [coefsN0] "+r" (coefsN),
359 [sP] "+r" (sP),
360 [sN] "+r" (sN)
361 : [vLR] "r" (volumeLR)
362 : "cc", "memory",
363 "q0", "q1", "q2", "q3",
364 "q8", "q9", "q10", "q11",
365 "q12", "q13", "q14", "q15"
366 );
367}
368
369template <>
370inline void ProcessL<2, 16>(int32_t* const out,
371 int count,
372 const int32_t* coefsP,
373 const int32_t* coefsN,
374 const int16_t* sP,
375 const int16_t* sN,
376 const int32_t* const volumeLR)
377{
378 const int CHANNELS = 2; // template specialization does not preserve params
379 const int STRIDE = 16;
380 sP -= CHANNELS*((STRIDE>>1)-1);
381 asm (
382 "veor q0, q0, q0 \n"// result, initialize to 0
383 "veor q4, q4, q4 \n"// result, initialize to 0
384
385 "1: \n"
386
387 "vld2.16 {q2, q3}, [%[sP]] \n"// load 4 16-bits stereo samples
388 "vld2.16 {q5, q6}, [%[sN]]! \n"// load 4 16-bits stereo samples
389 "vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 4 32-bits coefs
390 "vld1.32 {q10, q11}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs
391
392 "vrev64.16 q2, q2 \n"// reverse 8 frames of the positive side
393 "vrev64.16 q3, q3 \n"// reverse 8 frames of the positive side
394
395 "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
396 "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits
397
398 "vshll.s16 q14, d10, #15 \n"// extend samples to 31 bits
399 "vshll.s16 q15, d11, #15 \n"// extend samples to 31 bits
400
401 "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef
402 "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef
403 "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
404 "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef
405
406 "vadd.s32 q0, q0, q12 \n"// accumulate result
407 "vadd.s32 q13, q13, q14 \n"// accumulate result
408 "vadd.s32 q0, q0, q15 \n"// (+1) accumulate result
409 "vadd.s32 q0, q0, q13 \n"// (+1) accumulate result
410
411 "vshll.s16 q12, d6, #15 \n"// extend samples to 31 bits
412 "vshll.s16 q13, d7, #15 \n"// extend samples to 31 bits
413
414 "vshll.s16 q14, d12, #15 \n"// extend samples to 31 bits
415 "vshll.s16 q15, d13, #15 \n"// extend samples to 31 bits
416
417 "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef
418 "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef
419 "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
420 "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef
421
422 "vadd.s32 q4, q4, q12 \n"// accumulate result
423 "vadd.s32 q13, q13, q14 \n"// accumulate result
424 "vadd.s32 q4, q4, q15 \n"// (+1) accumulate result
425 "vadd.s32 q4, q4, q13 \n"// (+1) accumulate result
426
427 "subs %[count], %[count], #8 \n"// update loop counter
428 "sub %[sP], %[sP], #32 \n"// move pointer to next set of samples
429
430 "bne 1b \n"// loop
431
432 ASSEMBLY_ACCUMULATE_STEREO
433
434 : [out] "=Uv" (out[0]),
435 [count] "+r" (count),
436 [coefsP0] "+r" (coefsP),
437 [coefsN0] "+r" (coefsN),
438 [sP] "+r" (sP),
439 [sN] "+r" (sN)
440 : [vLR] "r" (volumeLR)
441 : "cc", "memory",
442 "q0", "q1", "q2", "q3",
443 "q4", "q5", "q6",
444 "q8", "q9", "q10", "q11",
445 "q12", "q13", "q14", "q15"
446 );
447}
448
449template <>
450inline void Process<1, 16>(int32_t* const out,
451 int count,
452 const int32_t* coefsP,
453 const int32_t* coefsN,
454 const int32_t* coefsP1,
455 const int32_t* coefsN1,
456 const int16_t* sP,
457 const int16_t* sN,
458 uint32_t lerpP,
459 const int32_t* const volumeLR)
460{
461 const int CHANNELS = 1; // template specialization does not preserve params
462 const int STRIDE = 16;
463 sP -= CHANNELS*((STRIDE>>1)-1);
464 asm (
465 "vmov.32 d2[0], %[lerpP] \n"// load the positive phase
466 "veor q0, q0, q0 \n"// result, initialize to 0
467
468 "1: \n"
469
470 "vld1.16 {q2}, [%[sP]] \n"// load 8 16-bits mono samples
471 "vld1.16 {q3}, [%[sN]]! \n"// load 8 16-bits mono samples
472 "vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs
473 "vld1.32 {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs
474 "vld1.32 {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
475 "vld1.32 {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs
476
477 "vsub.s32 q12, q12, q8 \n"// interpolate (step1)
478 "vsub.s32 q13, q13, q9 \n"// interpolate (step1)
479 "vsub.s32 q14, q14, q10 \n"// interpolate (step1)
480 "vsub.s32 q15, q15, q11 \n"// interpolate (step1)
481
482 "vqrdmulh.s32 q12, q12, d2[0] \n"// interpolate (step2)
483 "vqrdmulh.s32 q13, q13, d2[0] \n"// interpolate (step2)
484 "vqrdmulh.s32 q14, q14, d2[0] \n"// interpolate (step2)
485 "vqrdmulh.s32 q15, q15, d2[0] \n"// interpolate (step2)
486
487 "vadd.s32 q8, q8, q12 \n"// interpolate (step3)
488 "vadd.s32 q9, q9, q13 \n"// interpolate (step3)
489 "vadd.s32 q10, q10, q14 \n"// interpolate (step3)
490 "vadd.s32 q11, q11, q15 \n"// interpolate (step3)
491
492 "vrev64.16 q2, q2 \n"// reverse 8 frames of the positive side
493
494 "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
495 "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits
496
497 "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits
498 "vshll.s16 q15, d7, #15 \n"// extend samples to 31 bits
499
500 "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef
501 "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef
502 "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
503 "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef
504
505 "vadd.s32 q0, q0, q12 \n"// accumulate result
506 "vadd.s32 q13, q13, q14 \n"// accumulate result
507 "vadd.s32 q0, q0, q15 \n"// accumulate result
508 "vadd.s32 q0, q0, q13 \n"// accumulate result
509
510 "sub %[sP], %[sP], #16 \n"// move pointer to next set of samples
511 "subs %[count], %[count], #8 \n"// update loop counter
512
513 "bne 1b \n"// loop
514
515 ASSEMBLY_ACCUMULATE_MONO
516
517 : [out] "=Uv" (out[0]),
518 [count] "+r" (count),
519 [coefsP0] "+r" (coefsP),
520 [coefsN0] "+r" (coefsN),
521 [coefsP1] "+r" (coefsP1),
522 [coefsN1] "+r" (coefsN1),
523 [sP] "+r" (sP),
524 [sN] "+r" (sN)
525 : [lerpP] "r" (lerpP),
526 [vLR] "r" (volumeLR)
527 : "cc", "memory",
528 "q0", "q1", "q2", "q3",
529 "q8", "q9", "q10", "q11",
530 "q12", "q13", "q14", "q15"
531 );
532}
533
534template <>
535inline void Process<2, 16>(int32_t* const out,
536 int count,
537 const int32_t* coefsP,
538 const int32_t* coefsN,
539 const int32_t* coefsP1,
540 const int32_t* coefsN1,
541 const int16_t* sP,
542 const int16_t* sN,
543 uint32_t lerpP,
544 const int32_t* const volumeLR)
545{
546 const int CHANNELS = 2; // template specialization does not preserve params
547 const int STRIDE = 16;
548 sP -= CHANNELS*((STRIDE>>1)-1);
549 asm (
550 "vmov.32 d2[0], %[lerpP] \n"// load the positive phase
551 "veor q0, q0, q0 \n"// result, initialize to 0
552 "veor q4, q4, q4 \n"// result, initialize to 0
553
554 "1: \n"
555
556 "vld2.16 {q2, q3}, [%[sP]] \n"// load 4 16-bits stereo samples
557 "vld2.16 {q5, q6}, [%[sN]]! \n"// load 4 16-bits stereo samples
558 "vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs
559 "vld1.32 {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs
560 "vld1.32 {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
561 "vld1.32 {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs
562
563 "vsub.s32 q12, q12, q8 \n"// interpolate (step1)
564 "vsub.s32 q13, q13, q9 \n"// interpolate (step1)
565 "vsub.s32 q14, q14, q10 \n"// interpolate (step1)
566 "vsub.s32 q15, q15, q11 \n"// interpolate (step1)
567
568 "vqrdmulh.s32 q12, q12, d2[0] \n"// interpolate (step2)
569 "vqrdmulh.s32 q13, q13, d2[0] \n"// interpolate (step2)
570 "vqrdmulh.s32 q14, q14, d2[0] \n"// interpolate (step2)
571 "vqrdmulh.s32 q15, q15, d2[0] \n"// interpolate (step2)
572
573 "vadd.s32 q8, q8, q12 \n"// interpolate (step3)
574 "vadd.s32 q9, q9, q13 \n"// interpolate (step3)
575 "vadd.s32 q10, q10, q14 \n"// interpolate (step3)
576 "vadd.s32 q11, q11, q15 \n"// interpolate (step3)
577
578 "vrev64.16 q2, q2 \n"// reverse 8 frames of the positive side
579 "vrev64.16 q3, q3 \n"// reverse 8 frames of the positive side
580
581 "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
582 "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits
583
584 "vshll.s16 q14, d10, #15 \n"// extend samples to 31 bits
585 "vshll.s16 q15, d11, #15 \n"// extend samples to 31 bits
586
587 "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef
588 "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef
589 "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
590 "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef
591
592 "vadd.s32 q0, q0, q12 \n"// accumulate result
593 "vadd.s32 q13, q13, q14 \n"// accumulate result
594 "vadd.s32 q0, q0, q15 \n"// (+1) accumulate result
595 "vadd.s32 q0, q0, q13 \n"// (+1) accumulate result
596
597 "vshll.s16 q12, d6, #15 \n"// extend samples to 31 bits
598 "vshll.s16 q13, d7, #15 \n"// extend samples to 31 bits
599
600 "vshll.s16 q14, d12, #15 \n"// extend samples to 31 bits
601 "vshll.s16 q15, d13, #15 \n"// extend samples to 31 bits
602
603 "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef
604 "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef
605 "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
606 "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef
607
608 "vadd.s32 q4, q4, q12 \n"// accumulate result
609 "vadd.s32 q13, q13, q14 \n"// accumulate result
610 "vadd.s32 q4, q4, q15 \n"// (+1) accumulate result
611 "vadd.s32 q4, q4, q13 \n"// (+1) accumulate result
612
613 "subs %[count], %[count], #8 \n"// update loop counter
614 "sub %[sP], %[sP], #32 \n"// move pointer to next set of samples
615
616 "bne 1b \n"// loop
617
618 ASSEMBLY_ACCUMULATE_STEREO
619
620 : [out] "=Uv" (out[0]),
621 [count] "+r" (count),
622 [coefsP0] "+r" (coefsP),
623 [coefsN0] "+r" (coefsN),
624 [coefsP1] "+r" (coefsP1),
625 [coefsN1] "+r" (coefsN1),
626 [sP] "+r" (sP),
627 [sN] "+r" (sN)
628 : [lerpP] "r" (lerpP),
629 [vLR] "r" (volumeLR)
630 : "cc", "memory",
631 "q0", "q1", "q2", "q3",
632 "q4", "q5", "q6",
633 "q8", "q9", "q10", "q11",
634 "q12", "q13", "q14", "q15"
635 );
636}
637
638template <>
639inline void ProcessL<1, 8>(int32_t* const out,
640 int count,
641 const int16_t* coefsP,
642 const int16_t* coefsN,
643 const int16_t* sP,
644 const int16_t* sN,
645 const int32_t* const volumeLR)
646{
647 const int CHANNELS = 1; // template specialization does not preserve params
648 const int STRIDE = 8;
649 sP -= CHANNELS*((STRIDE>>1)-1);
650 asm (
651 "veor q0, q0, q0 \n"// (0 - combines+) accumulator = 0
652
653 "1: \n"
654
655 "vld1.16 {d4}, [%[sP]] \n"// (2+0d) load 4 16-bits mono samples
656 "vld1.16 {d6}, [%[sN]]! \n"// (2) load 4 16-bits mono samples
657 "vld1.16 {d16}, [%[coefsP0]:64]! \n"// (1) load 4 16-bits coefs
658 "vld1.16 {d20}, [%[coefsN0]:64]! \n"// (1) load 4 16-bits coefs
659
660 "vrev64.16 d4, d4 \n"// (1) reversed s3, s2, s1, s0, s7, s6, s5, s4
661
662 // reordering the vmal to do d6, d7 before d4, d5 is slower(?)
663 "vmlal.s16 q0, d4, d16 \n"// (1) multiply (reversed)samples by coef
664 "vmlal.s16 q0, d6, d20 \n"// (1) multiply neg samples
665
666 // moving these ARM instructions before neon above seems to be slower
667 "subs %[count], %[count], #4 \n"// (1) update loop counter
668 "sub %[sP], %[sP], #8 \n"// (0) move pointer to next set of samples
669
670 // sP used after branch (warning)
671 "bne 1b \n"// loop
672
673 ASSEMBLY_ACCUMULATE_MONO
674
675 : [out] "=Uv" (out[0]),
676 [count] "+r" (count),
677 [coefsP0] "+r" (coefsP),
678 [coefsN0] "+r" (coefsN),
679 [sP] "+r" (sP),
680 [sN] "+r" (sN)
681 : [vLR] "r" (volumeLR)
682 : "cc", "memory",
683 "q0", "q1", "q2", "q3",
684 "q8", "q10"
685 );
686}
687
688template <>
689inline void ProcessL<2, 8>(int32_t* const out,
690 int count,
691 const int16_t* coefsP,
692 const int16_t* coefsN,
693 const int16_t* sP,
694 const int16_t* sN,
695 const int32_t* const volumeLR)
696{
697 const int CHANNELS = 2; // template specialization does not preserve params
698 const int STRIDE = 8;
699 sP -= CHANNELS*((STRIDE>>1)-1);
700 asm (
701 "veor q0, q0, q0 \n"// (1) acc_L = 0
702 "veor q4, q4, q4 \n"// (0 combines+) acc_R = 0
703
704 "1: \n"
705
706 "vld2.16 {d4, d5}, [%[sP]] \n"// (2+0d) load 8 16-bits stereo samples
707 "vld2.16 {d6, d7}, [%[sN]]! \n"// (2) load 8 16-bits stereo samples
708 "vld1.16 {d16}, [%[coefsP0]:64]! \n"// (1) load 8 16-bits coefs
709 "vld1.16 {d20}, [%[coefsN0]:64]! \n"// (1) load 8 16-bits coefs
710
711 "vrev64.16 q2, q2 \n"// (1) reverse 8 frames of the left positive
712
713 "vmlal.s16 q0, d4, d16 \n"// (1) multiply (reversed) samples left
714 "vmlal.s16 q4, d5, d16 \n"// (1) multiply (reversed) samples right
715 "vmlal.s16 q0, d6, d20 \n"// (1) multiply samples left
716 "vmlal.s16 q4, d7, d20 \n"// (1) multiply samples right
717
718 // moving these ARM before neon seems to be slower
719 "subs %[count], %[count], #4 \n"// (1) update loop counter
720 "sub %[sP], %[sP], #16 \n"// (0) move pointer to next set of samples
721
722 // sP used after branch (warning)
723 "bne 1b \n"// loop
724
725 ASSEMBLY_ACCUMULATE_STEREO
726
727 : [out] "=Uv" (out[0]),
728 [count] "+r" (count),
729 [coefsP0] "+r" (coefsP),
730 [coefsN0] "+r" (coefsN),
731 [sP] "+r" (sP),
732 [sN] "+r" (sN)
733 : [vLR] "r" (volumeLR)
734 : "cc", "memory",
735 "q0", "q1", "q2", "q3",
736 "q4", "q5", "q6",
737 "q8", "q10"
738 );
739}
740
741template <>
742inline void Process<1, 8>(int32_t* const out,
743 int count,
744 const int16_t* coefsP,
745 const int16_t* coefsN,
746 const int16_t* coefsP1,
747 const int16_t* coefsN1,
748 const int16_t* sP,
749 const int16_t* sN,
750 uint32_t lerpP,
751 const int32_t* const volumeLR)
752{
753 const int CHANNELS = 1; // template specialization does not preserve params
754 const int STRIDE = 8;
755 sP -= CHANNELS*((STRIDE>>1)-1);
756 asm (
757 "vmov.32 d2[0], %[lerpP] \n"// load the positive phase S32 Q15
758 "veor q0, q0, q0 \n"// (0 - combines+) accumulator = 0
759
760 "1: \n"
761
762 "vld1.16 {d4}, [%[sP]] \n"// (2+0d) load 4 16-bits mono samples
763 "vld1.16 {d6}, [%[sN]]! \n"// (2) load 4 16-bits mono samples
764 "vld1.16 {d16}, [%[coefsP0]:64]! \n"// (1) load 4 16-bits coefs
765 "vld1.16 {d17}, [%[coefsP1]:64]! \n"// (1) load 4 16-bits coefs for interpolation
766 "vld1.16 {d20}, [%[coefsN1]:64]! \n"// (1) load 4 16-bits coefs
767 "vld1.16 {d21}, [%[coefsN0]:64]! \n"// (1) load 4 16-bits coefs for interpolation
768
769 "vsub.s16 d17, d17, d16 \n"// (1) interpolate (step1) 1st set of coefs
770 "vsub.s16 d21, d21, d20 \n"// (1) interpolate (step1) 2nd set of coets
771
772 "vqrdmulh.s16 d17, d17, d2[0] \n"// (2) interpolate (step2) 1st set of coefs
773 "vqrdmulh.s16 d21, d21, d2[0] \n"// (2) interpolate (step2) 2nd set of coefs
774
775 "vrev64.16 d4, d4 \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4
776
777 "vadd.s16 d16, d16, d17 \n"// (1+2d) interpolate (step3) 1st set
778 "vadd.s16 d20, d20, d21 \n"// (1+1d) interpolate (step3) 2nd set
779
780 // reordering the vmal to do d6, d7 before d4, d5 is slower(?)
781 "vmlal.s16 q0, d4, d16 \n"// (1+0d) multiply (reversed)by coef
782 "vmlal.s16 q0, d6, d20 \n"// (1) multiply neg samples
783
784 // moving these ARM instructions before neon above seems to be slower
785 "subs %[count], %[count], #4 \n"// (1) update loop counter
786 "sub %[sP], %[sP], #8 \n"// move pointer to next set of samples
787
788 // sP used after branch (warning)
789 "bne 1b \n"// loop
790
791 ASSEMBLY_ACCUMULATE_MONO
792
793 : [out] "=Uv" (out[0]),
794 [count] "+r" (count),
795 [coefsP0] "+r" (coefsP),
796 [coefsN0] "+r" (coefsN),
797 [coefsP1] "+r" (coefsP1),
798 [coefsN1] "+r" (coefsN1),
799 [sP] "+r" (sP),
800 [sN] "+r" (sN)
801 : [lerpP] "r" (lerpP),
802 [vLR] "r" (volumeLR)
803 : "cc", "memory",
804 "q0", "q1", "q2", "q3",
805 "q8", "q9", "q10", "q11"
806 );
807}
808
809template <>
810inline void Process<2, 8>(int32_t* const out,
811 int count,
812 const int16_t* coefsP,
813 const int16_t* coefsN,
814 const int16_t* coefsP1,
815 const int16_t* coefsN1,
816 const int16_t* sP,
817 const int16_t* sN,
818 uint32_t lerpP,
819 const int32_t* const volumeLR)
820{
821 const int CHANNELS = 2; // template specialization does not preserve params
822 const int STRIDE = 8;
823 sP -= CHANNELS*((STRIDE>>1)-1);
824 asm (
825 "vmov.32 d2[0], %[lerpP] \n"// load the positive phase
826 "veor q0, q0, q0 \n"// (1) acc_L = 0
827 "veor q4, q4, q4 \n"// (0 combines+) acc_R = 0
828
829 "1: \n"
830
831 "vld2.16 {d4, d5}, [%[sP]] \n"// (3+0d) load 8 16-bits stereo samples
832 "vld2.16 {d6, d7}, [%[sN]]! \n"// (3) load 8 16-bits stereo samples
833 "vld1.16 {d16}, [%[coefsP0]:64]! \n"// (1) load 8 16-bits coefs
834 "vld1.16 {d17}, [%[coefsP1]:64]! \n"// (1) load 8 16-bits coefs for interpolation
835 "vld1.16 {d20}, [%[coefsN1]:64]! \n"// (1) load 8 16-bits coefs
836 "vld1.16 {d21}, [%[coefsN0]:64]! \n"// (1) load 8 16-bits coefs for interpolation
837
838 "vsub.s16 d17, d17, d16 \n"// (1) interpolate (step1) 1st set of coefs
839 "vsub.s16 d21, d21, d20 \n"// (1) interpolate (step1) 2nd set of coets
840
841 "vqrdmulh.s16 d17, d17, d2[0] \n"// (2) interpolate (step2) 1st set of coefs
842 "vqrdmulh.s16 d21, d21, d2[0] \n"// (2) interpolate (step2) 2nd set of coefs
843
844 "vrev64.16 q2, q2 \n"// (1) reverse 8 frames of the left positive
845
846 "vadd.s16 d16, d16, d17 \n"// (1+1d) interpolate (step3) 1st set
847 "vadd.s16 d20, d20, d21 \n"// (1+1d) interpolate (step3) 2nd set
848
849 "vmlal.s16 q0, d4, d16 \n"// (1) multiply (reversed) samples left
850 "vmlal.s16 q4, d5, d16 \n"// (1) multiply (reversed) samples right
851 "vmlal.s16 q0, d6, d20 \n"// (1) multiply samples left
852 "vmlal.s16 q4, d7, d20 \n"// (1) multiply samples right
853
854 // moving these ARM before neon seems to be slower
855 "subs %[count], %[count], #4 \n"// (1) update loop counter
856 "sub %[sP], %[sP], #16 \n"// move pointer to next set of samples
857
858 // sP used after branch (warning)
859 "bne 1b \n"// loop
860
861 ASSEMBLY_ACCUMULATE_STEREO
862
863 : [out] "=Uv" (out[0]),
864 [count] "+r" (count),
865 [coefsP0] "+r" (coefsP),
866 [coefsN0] "+r" (coefsN),
867 [coefsP1] "+r" (coefsP1),
868 [coefsN1] "+r" (coefsN1),
869 [sP] "+r" (sP),
870 [sN] "+r" (sN)
871 : [lerpP] "r" (lerpP),
872 [vLR] "r" (volumeLR)
873 : "cc", "memory",
874 "q0", "q1", "q2", "q3",
875 "q4", "q5", "q6",
876 "q8", "q9", "q10", "q11"
877 );
878}
879
// Filter one output frame of a MONO stream using 32-bit polyphase coefficients,
// without phase interpolation, 8 taps per loop iteration (4 per side).
//
// out      - output sample accumulator pair (L/R); the volume-scaled result is
//            saturating-accumulated into it by ASSEMBLY_ACCUMULATE_MONO.
// count    - number of taps per side to process; must be a positive multiple of 4.
// coefsP   - positive-side coefficients, 128-bit aligned (":128" load constraint).
// coefsN   - negative-side coefficients, 128-bit aligned.
// sP       - positive-side samples, traversed backwards (loads are reversed
//            with vrev64.16 and the pointer is decremented each iteration).
// sN       - negative-side samples, traversed forwards (post-incremented loads).
// volumeLR - L/R volume pair, 64-bit aligned, applied via vqrdmulh in the
//            accumulate macro.
template <>
inline void ProcessL<1, 8>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 8;
    // Rewind sP so that each 4-sample load covers the window processed this pass.
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0               \n"// result, initialize to 0

        "1:                                      \n"

        "vld1.16        {d4}, [%[sP]]            \n"// load 4 16-bits mono samples
        "vld1.16        {d6}, [%[sN]]!           \n"// load 4 16-bits mono samples
        "vld1.32        {q8}, [%[coefsP0]:128]!  \n"// load 4 32-bits coefs
        "vld1.32        {q10}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs

        "vrev64.16      d4, d4                   \n"// reverse 2 frames of the positive side

        "vshll.s16      q12, d4, #15             \n"// (stall) extend samples to 31 bits
        "vshll.s16      q14, d6, #15             \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q8             \n"// multiply samples by coef
        "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by coef

        "vadd.s32       q0, q0, q12              \n"// accumulate result
        "vadd.s32       q0, q0, q14              \n"// (stall) accumulate result

        "subs           %[count], %[count], #4   \n"// update loop counter
        "sub            %[sP], %[sP], #8         \n"// move pointer to next set of samples

        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11",
          "q12", "q14"
    );
}
933
// Filter one output frame of a STEREO stream using 32-bit polyphase
// coefficients, without phase interpolation, 8 taps per loop iteration
// (4 per side). Left accumulates in q0, right in q4; vld2.16 deinterleaves
// the L/R sample pairs into separate d registers.
//
// out      - output sample accumulator pair (L/R); the volume-scaled result is
//            saturating-accumulated into it by ASSEMBLY_ACCUMULATE_STEREO.
// count    - number of taps per side to process; must be a positive multiple of 4.
// coefsP   - positive-side coefficients, 128-bit aligned.
// coefsN   - negative-side coefficients, 128-bit aligned.
// sP       - positive-side interleaved stereo samples, traversed backwards.
// sN       - negative-side interleaved stereo samples, traversed forwards.
// volumeLR - L/R volume pair, 64-bit aligned, applied in the accumulate macro.
template <>
inline void ProcessL<2, 8>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 8;
    // Rewind sP so that each 4-frame load covers the window processed this pass.
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0               \n"// result, initialize to 0
        "veor           q4, q4, q4               \n"// result, initialize to 0

        "1:                                      \n"

        "vld2.16        {d4, d5}, [%[sP]]        \n"// load 4 16-bits stereo samples
        "vld2.16        {d6, d7}, [%[sN]]!       \n"// load 4 16-bits stereo samples
        "vld1.32        {q8}, [%[coefsP0]:128]!  \n"// load 4 32-bits coefs
        "vld1.32        {q10}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs

        "vrev64.16      q2, q2                   \n"// reverse 2 frames of the positive side

        "vshll.s16      q12, d4, #15             \n"// extend samples to 31 bits
        "vshll.s16      q13, d5, #15             \n"// extend samples to 31 bits

        "vshll.s16      q14, d6, #15             \n"// extend samples to 31 bits
        "vshll.s16      q15, d7, #15             \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q8             \n"// multiply samples by coef
        "vqrdmulh.s32   q13, q13, q8             \n"// multiply samples by coef
        "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by coef
        "vqrdmulh.s32   q15, q15, q10            \n"// multiply samples by coef

        "vadd.s32       q0, q0, q12              \n"// accumulate result
        "vadd.s32       q4, q4, q13              \n"// accumulate result
        "vadd.s32       q0, q0, q14              \n"// accumulate result
        "vadd.s32       q4, q4, q15              \n"// accumulate result

        "subs           %[count], %[count], #4   \n"// update loop counter
        "sub            %[sP], %[sP], #16        \n"// move pointer to next set of samples

        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3", "q4",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
}
995
// Filter one output frame of a MONO stream using 32-bit polyphase coefficients
// WITH linear phase interpolation: each coefficient is lerped between the two
// adjacent polyphase sets (coefsP/coefsP1 and coefsN1/coefsN0) by lerpP before
// the multiply. 8 taps per loop iteration (4 per side).
//
// out      - output sample accumulator pair (L/R); result accumulated by
//            ASSEMBLY_ACCUMULATE_MONO.
// count    - number of taps per side to process; must be a positive multiple of 4.
// coefsP/coefsP1 - positive-side coefficient sets for the two phases, 128-bit aligned.
// coefsN/coefsN1 - negative-side coefficient sets for the two phases, 128-bit aligned.
// sP       - positive-side samples, traversed backwards (reversed loads).
// sN       - negative-side samples, traversed forwards.
// lerpP    - interpolation fraction between the two coefficient phases,
//            held in d2[0] and applied via vqrdmulh.
// volumeLR - L/R volume pair, 64-bit aligned, applied in the accumulate macro.
template <>
inline void Process<1, 8>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int32_t* coefsP1,
        const int32_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 8;
    // Rewind sP so that each 4-sample load covers the window processed this pass.
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase
        "veor           q0, q0, q0               \n"// result, initialize to 0

        "1:                                      \n"

        "vld1.16        {d4}, [%[sP]]            \n"// load 4 16-bits mono samples
        "vld1.16        {d6}, [%[sN]]!           \n"// load 4 16-bits mono samples
        "vld1.32        {q8}, [%[coefsP0]:128]!  \n"// load 4 32-bits coefs
        "vld1.32        {q9}, [%[coefsP1]:128]!  \n"// load 4 32-bits coefs for interpolation
        "vld1.32        {q10}, [%[coefsN1]:128]! \n"// load 4 32-bits coefs
        "vld1.32        {q11}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs for interpolation

        "vrev64.16      d4, d4                   \n"// reverse 2 frames of the positive side

        "vsub.s32       q9, q9, q8               \n"// interpolate (step1) 1st set of coefs
        "vsub.s32       q11, q11, q10            \n"// interpolate (step1) 2nd set of coefs
        "vshll.s16      q12, d4, #15             \n"// extend samples to 31 bits

        "vqrdmulh.s32   q9, q9, d2[0]            \n"// interpolate (step2) 1st set of coefs
        "vqrdmulh.s32   q11, q11, d2[0]          \n"// interpolate (step2) 2nd set of coefs
        "vshll.s16      q14, d6, #15             \n"// extend samples to 31 bits

        "vadd.s32       q8, q8, q9               \n"// interpolate (step3) 1st set
        "vadd.s32       q10, q10, q11            \n"// interpolate (step3) 2nd set

        "vqrdmulh.s32   q12, q12, q8             \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by interpolated coef

        "vadd.s32       q0, q0, q12              \n"// accumulate result
        "vadd.s32       q0, q0, q14              \n"// accumulate result

        "subs           %[count], %[count], #4   \n"// update loop counter
        "sub            %[sP], %[sP], #8         \n"// move pointer to next set of samples

        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsP1] "+r" (coefsP1),
          [coefsN0] "+r" (coefsN),
          [coefsN1] "+r" (coefsN1),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11",
          "q12", "q14"
    );
}
1066
1067template <>
1068inline
1069void Process<2, 8>(int32_t* const out,
1070 int count,
1071 const int32_t* coefsP,
1072 const int32_t* coefsN,
1073 const int32_t* coefsP1,
1074 const int32_t* coefsN1,
1075 const int16_t* sP,
1076 const int16_t* sN,
1077 uint32_t lerpP,
1078 const int32_t* const volumeLR)
1079{
1080 const int CHANNELS = 2; // template specialization does not preserve params
1081 const int STRIDE = 8;
1082 sP -= CHANNELS*((STRIDE>>1)-1);
1083 asm (
1084 "vmov.32 d2[0], %[lerpP] \n"// load the positive phase
1085 "veor q0, q0, q0 \n"// result, initialize to 0
1086 "veor q4, q4, q4 \n"// result, initialize to 0
1087
1088 "1: \n"
1089 "vld2.16 {d4, d5}, [%[sP]] \n"// load 4 16-bits stereo samples
1090 "vld2.16 {d6, d7}, [%[sN]]! \n"// load 4 16-bits stereo samples
1091 "vld1.32 {q8}, [%[coefsP0]:128]! \n"// load 4 32-bits coefs
1092 "vld1.32 {q9}, [%[coefsP1]:128]! \n"// load 4 32-bits coefs for interpolation
1093 "vld1.32 {q10}, [%[coefsN1]:128]! \n"// load 4 32-bits coefs
1094 "vld1.32 {q11}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs for interpolation
1095
1096 "vrev64.16 q2, q2 \n"// (reversed) 2 frames of the positive side
1097
1098 "vsub.s32 q9, q9, q8 \n"// interpolate (step1) 1st set of coefs
1099 "vsub.s32 q11, q11, q10 \n"// interpolate (step1) 2nd set of coets
1100 "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
1101 "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits
1102
1103 "vqrdmulh.s32 q9, q9, d2[0] \n"// interpolate (step2) 1st set of coefs
1104 "vqrdmulh.s32 q11, q11, d2[1] \n"// interpolate (step3) 2nd set of coefs
1105 "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits
1106 "vshll.s16 q15, d7, #15 \n"// extend samples to 31 bits
1107
1108 "vadd.s32 q8, q8, q9 \n"// interpolate (step3) 1st set
1109 "vadd.s32 q10, q10, q11 \n"// interpolate (step4) 2nd set
1110
1111 "vqrdmulh.s32 q12, q12, q8 \n"// multiply samples by interpolated coef
1112 "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef
1113 "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
1114 "vqrdmulh.s32 q15, q15, q10 \n"// multiply samples by interpolated coef
1115
1116 "vadd.s32 q0, q0, q12 \n"// accumulate result
1117 "vadd.s32 q4, q4, q13 \n"// accumulate result
1118 "vadd.s32 q0, q0, q14 \n"// accumulate result
1119 "vadd.s32 q4, q4, q15 \n"// accumulate result
1120
1121 "subs %[count], %[count], #4 \n"// update loop counter
1122 "sub %[sP], %[sP], #16 \n"// move pointer to next set of samples
1123
1124 "bne 1b \n"// loop
1125
1126 ASSEMBLY_ACCUMULATE_STEREO
1127
1128 : [out] "=Uv" (out[0]),
1129 [count] "+r" (count),
1130 [coefsP0] "+r" (coefsP),
1131 [coefsP1] "+r" (coefsP1),
1132 [coefsN0] "+r" (coefsN),
1133 [coefsN1] "+r" (coefsN1),
1134 [sP] "+r" (sP),
1135 [sN] "+r" (sN)
1136 : [lerpP] "r" (lerpP),
1137 [vLR] "r" (volumeLR)
1138 : "cc", "memory",
1139 "q0", "q1", "q2", "q3", "q4",
1140 "q8", "q9", "q10", "q11",
1141 "q12", "q13", "q14", "q15"
1142 );
1143}
1144
1145#endif //USE_NEON
1146
1147}; // namespace android
1148
1149#endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H*/