/*
 * Copyright (C) 2013 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H
#define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H

namespace android {

// depends on AudioResamplerFirOps.h, AudioResamplerFirProcess.h

#if USE_NEON
//
// NEON specializations are enabled for Process() and ProcessL()

// Macros to save a mono/stereo accumulator sample in q0 (and q4) as stereo out.
//
// ASSEMBLY_ACCUMULATE_MONO folds the four 32-bit partial sums in q0 (d0/d1)
// into one value, replicates it to both lanes of d0 (L/R), applies the Q31
// volume pair loaded from %[vLR] (which must be 64-bit aligned, per the ":64"
// address qualifier) with a saturating rounding doubling multiply, and
// saturating-accumulates the result into the stereo output frame bound to
// %[out] (two int32 values, loaded/stored unaligned).
// Clobbers d0-d3 (q0, q1).
#define ASSEMBLY_ACCUMULATE_MONO \
        "vld1.s32       {d2}, [%[vLR]:64]        \n"/* (1) load volumes */\
        "vld1.s32       {d3}, %[out]             \n"/* (2) unaligned load the output */\
        "vpadd.s32      d0, d0, d1               \n"/* (1) add all 4 partial sums */\
        "vpadd.s32      d0, d0, d0               \n"/* (1+4d) and replicate L/R */\
        "vqrdmulh.s32   d0, d0, d2               \n"/* (2+3d) apply volume */\
        "vqadd.s32      d3, d3, d0               \n"/* (1+4d) accumulate result (saturating) */\
        "vst1.s32       {d3}, %[out]             \n"/* (2+2d) store result */
37
// ASSEMBLY_ACCUMULATE_STEREO folds the left-channel partial sums in q0 (d0/d1)
// and the right-channel partial sums in q4 (d8/d9) into a packed L/R pair in
// d0, applies the Q31 volume pair from %[vLR] (64-bit aligned, ":64"), and
// saturating-accumulates into the stereo output frame bound to %[out].
// Clobbers d0-d3 (q0, q1) and d8-d9 (q4).
#define ASSEMBLY_ACCUMULATE_STEREO \
        "vld1.s32       {d2}, [%[vLR]:64]        \n"/* (1) load volumes */\
        "vld1.s32       {d3}, %[out]             \n"/* (2) unaligned load the output */\
        "vpadd.s32      d0, d0, d1               \n"/* (1) add all 4 partial sums from q0 */\
        "vpadd.s32      d8, d8, d9               \n"/* (1) add all 4 partial sums from q4 */\
        "vpadd.s32      d0, d0, d8               \n"/* (1+4d) combine into L/R */\
        "vqrdmulh.s32   d0, d0, d2               \n"/* (2+3d) apply volume */\
        "vqadd.s32      d3, d3, d0               \n"/* (1+4d) accumulate result (saturating) */\
        "vst1.s32       {d3}, %[out]             \n"/* (2+2d) store result */
47
48template <>
49inline void ProcessL<1, 16>(int32_t* const out,
50 int count,
51 const int16_t* coefsP,
52 const int16_t* coefsN,
53 const int16_t* sP,
54 const int16_t* sN,
55 const int32_t* const volumeLR)
56{
57 const int CHANNELS = 1; // template specialization does not preserve params
58 const int STRIDE = 16;
59 sP -= CHANNELS*((STRIDE>>1)-1);
60 asm (
61 "veor q0, q0, q0 \n"// (0 - combines+) accumulator = 0
62
63 "1: \n"
64
65 "vld1.16 {q2}, [%[sP]] \n"// (2+0d) load 8 16-bits mono samples
66 "vld1.16 {q3}, [%[sN]]! \n"// (2) load 8 16-bits mono samples
67 "vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs
68 "vld1.16 {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs
69
70 "vrev64.16 q2, q2 \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4
71
72 // reordering the vmal to do d6, d7 before d4, d5 is slower(?)
73 "vmlal.s16 q0, d4, d17 \n"// (1+0d) multiply (reversed)samples by coef
74 "vmlal.s16 q0, d5, d16 \n"// (1) multiply (reversed)samples by coef
75 "vmlal.s16 q0, d6, d20 \n"// (1) multiply neg samples
76 "vmlal.s16 q0, d7, d21 \n"// (1) multiply neg samples
77
78 // moving these ARM instructions before neon above seems to be slower
79 "subs %[count], %[count], #8 \n"// (1) update loop counter
80 "sub %[sP], %[sP], #16 \n"// (0) move pointer to next set of samples
81
82 // sP used after branch (warning)
83 "bne 1b \n"// loop
84
85 ASSEMBLY_ACCUMULATE_MONO
86
87 : [out] "=Uv" (out[0]),
88 [count] "+r" (count),
89 [coefsP0] "+r" (coefsP),
90 [coefsN0] "+r" (coefsN),
91 [sP] "+r" (sP),
92 [sN] "+r" (sN)
93 : [vLR] "r" (volumeLR)
94 : "cc", "memory",
95 "q0", "q1", "q2", "q3",
96 "q8", "q10"
97 );
98}
99
100template <>
101inline void ProcessL<2, 16>(int32_t* const out,
102 int count,
103 const int16_t* coefsP,
104 const int16_t* coefsN,
105 const int16_t* sP,
106 const int16_t* sN,
107 const int32_t* const volumeLR)
108{
109 const int CHANNELS = 2; // template specialization does not preserve params
110 const int STRIDE = 16;
111 sP -= CHANNELS*((STRIDE>>1)-1);
112 asm (
113 "veor q0, q0, q0 \n"// (1) acc_L = 0
114 "veor q4, q4, q4 \n"// (0 combines+) acc_R = 0
115
116 "1: \n"
117
Andy Hungd7a77152015-02-06 14:58:38 -0800118 "vld2.16 {q2, q3}, [%[sP]] \n"// (3+0d) load 8 16-bits stereo frames
119 "vld2.16 {q5, q6}, [%[sN]]! \n"// (3) load 8 16-bits stereo frames
Andy Hung86eae0e2013-12-09 12:12:46 -0800120 "vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs
121 "vld1.16 {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs
122
Andy Hungd7a77152015-02-06 14:58:38 -0800123 "vrev64.16 q2, q2 \n"// (1) reverse 8 samples of positive left
124 "vrev64.16 q3, q3 \n"// (0 combines+) reverse positive right
Andy Hung86eae0e2013-12-09 12:12:46 -0800125
126 "vmlal.s16 q0, d4, d17 \n"// (1) multiply (reversed) samples left
127 "vmlal.s16 q0, d5, d16 \n"// (1) multiply (reversed) samples left
128 "vmlal.s16 q4, d6, d17 \n"// (1) multiply (reversed) samples right
129 "vmlal.s16 q4, d7, d16 \n"// (1) multiply (reversed) samples right
130 "vmlal.s16 q0, d10, d20 \n"// (1) multiply samples left
131 "vmlal.s16 q0, d11, d21 \n"// (1) multiply samples left
132 "vmlal.s16 q4, d12, d20 \n"// (1) multiply samples right
133 "vmlal.s16 q4, d13, d21 \n"// (1) multiply samples right
134
135 // moving these ARM before neon seems to be slower
136 "subs %[count], %[count], #8 \n"// (1) update loop counter
137 "sub %[sP], %[sP], #32 \n"// (0) move pointer to next set of samples
138
139 // sP used after branch (warning)
140 "bne 1b \n"// loop
141
142 ASSEMBLY_ACCUMULATE_STEREO
143
144 : [out] "=Uv" (out[0]),
145 [count] "+r" (count),
146 [coefsP0] "+r" (coefsP),
147 [coefsN0] "+r" (coefsN),
148 [sP] "+r" (sP),
149 [sN] "+r" (sN)
150 : [vLR] "r" (volumeLR)
151 : "cc", "memory",
152 "q0", "q1", "q2", "q3",
153 "q4", "q5", "q6",
154 "q8", "q10"
155 );
156}
157
158template <>
159inline void Process<1, 16>(int32_t* const out,
160 int count,
161 const int16_t* coefsP,
162 const int16_t* coefsN,
163 const int16_t* coefsP1,
164 const int16_t* coefsN1,
165 const int16_t* sP,
166 const int16_t* sN,
167 uint32_t lerpP,
168 const int32_t* const volumeLR)
169{
170 const int CHANNELS = 1; // template specialization does not preserve params
171 const int STRIDE = 16;
172 sP -= CHANNELS*((STRIDE>>1)-1);
173 asm (
174 "vmov.32 d2[0], %[lerpP] \n"// load the positive phase S32 Q15
175 "veor q0, q0, q0 \n"// (0 - combines+) accumulator = 0
176
177 "1: \n"
178
179 "vld1.16 {q2}, [%[sP]] \n"// (2+0d) load 8 16-bits mono samples
180 "vld1.16 {q3}, [%[sN]]! \n"// (2) load 8 16-bits mono samples
181 "vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs
182 "vld1.16 {q9}, [%[coefsP1]:128]! \n"// (1) load 8 16-bits coefs for interpolation
183 "vld1.16 {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
184 "vld1.16 {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation
185
186 "vsub.s16 q9, q9, q8 \n"// (1) interpolate (step1) 1st set of coefs
187 "vsub.s16 q11, q11, q10 \n"// (1) interpolate (step1) 2nd set of coets
188
189 "vqrdmulh.s16 q9, q9, d2[0] \n"// (2) interpolate (step2) 1st set of coefs
190 "vqrdmulh.s16 q11, q11, d2[0] \n"// (2) interpolate (step2) 2nd set of coefs
191
192 "vrev64.16 q2, q2 \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4
193
194 "vadd.s16 q8, q8, q9 \n"// (1+2d) interpolate (step3) 1st set
195 "vadd.s16 q10, q10, q11 \n"// (1+1d) interpolate (step3) 2nd set
196
197 // reordering the vmal to do d6, d7 before d4, d5 is slower(?)
198 "vmlal.s16 q0, d4, d17 \n"// (1+0d) multiply reversed samples by coef
199 "vmlal.s16 q0, d5, d16 \n"// (1) multiply reversed samples by coef
200 "vmlal.s16 q0, d6, d20 \n"// (1) multiply neg samples
201 "vmlal.s16 q0, d7, d21 \n"// (1) multiply neg samples
202
203 // moving these ARM instructions before neon above seems to be slower
204 "subs %[count], %[count], #8 \n"// (1) update loop counter
205 "sub %[sP], %[sP], #16 \n"// (0) move pointer to next set of samples
206
207 // sP used after branch (warning)
208 "bne 1b \n"// loop
209
210 ASSEMBLY_ACCUMULATE_MONO
211
212 : [out] "=Uv" (out[0]),
213 [count] "+r" (count),
214 [coefsP0] "+r" (coefsP),
215 [coefsN0] "+r" (coefsN),
216 [coefsP1] "+r" (coefsP1),
217 [coefsN1] "+r" (coefsN1),
218 [sP] "+r" (sP),
219 [sN] "+r" (sN)
220 : [lerpP] "r" (lerpP),
221 [vLR] "r" (volumeLR)
222 : "cc", "memory",
223 "q0", "q1", "q2", "q3",
224 "q8", "q9", "q10", "q11"
225 );
226}
227
228template <>
229inline void Process<2, 16>(int32_t* const out,
230 int count,
231 const int16_t* coefsP,
232 const int16_t* coefsN,
233 const int16_t* coefsP1,
234 const int16_t* coefsN1,
235 const int16_t* sP,
236 const int16_t* sN,
237 uint32_t lerpP,
238 const int32_t* const volumeLR)
239{
240 const int CHANNELS = 2; // template specialization does not preserve params
241 const int STRIDE = 16;
242 sP -= CHANNELS*((STRIDE>>1)-1);
243 asm (
244 "vmov.32 d2[0], %[lerpP] \n"// load the positive phase
245 "veor q0, q0, q0 \n"// (1) acc_L = 0
246 "veor q4, q4, q4 \n"// (0 combines+) acc_R = 0
247
248 "1: \n"
249
Andy Hungd7a77152015-02-06 14:58:38 -0800250 "vld2.16 {q2, q3}, [%[sP]] \n"// (3+0d) load 8 16-bits stereo frames
251 "vld2.16 {q5, q6}, [%[sN]]! \n"// (3) load 8 16-bits stereo frames
Andy Hung86eae0e2013-12-09 12:12:46 -0800252 "vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs
253 "vld1.16 {q9}, [%[coefsP1]:128]! \n"// (1) load 8 16-bits coefs for interpolation
254 "vld1.16 {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
255 "vld1.16 {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation
256
257 "vsub.s16 q9, q9, q8 \n"// (1) interpolate (step1) 1st set of coefs
258 "vsub.s16 q11, q11, q10 \n"// (1) interpolate (step1) 2nd set of coets
259
260 "vqrdmulh.s16 q9, q9, d2[0] \n"// (2) interpolate (step2) 1st set of coefs
261 "vqrdmulh.s16 q11, q11, d2[0] \n"// (2) interpolate (step2) 2nd set of coefs
262
Andy Hungd7a77152015-02-06 14:58:38 -0800263 "vrev64.16 q2, q2 \n"// (1) reverse 8 samples of positive left
264 "vrev64.16 q3, q3 \n"// (1) reverse 8 samples of positive right
Andy Hung86eae0e2013-12-09 12:12:46 -0800265
266 "vadd.s16 q8, q8, q9 \n"// (1+1d) interpolate (step3) 1st set
267 "vadd.s16 q10, q10, q11 \n"// (1+1d) interpolate (step3) 2nd set
268
269 "vmlal.s16 q0, d4, d17 \n"// (1) multiply reversed samples left
270 "vmlal.s16 q0, d5, d16 \n"// (1) multiply reversed samples left
271 "vmlal.s16 q4, d6, d17 \n"// (1) multiply reversed samples right
272 "vmlal.s16 q4, d7, d16 \n"// (1) multiply reversed samples right
273 "vmlal.s16 q0, d10, d20 \n"// (1) multiply samples left
274 "vmlal.s16 q0, d11, d21 \n"// (1) multiply samples left
275 "vmlal.s16 q4, d12, d20 \n"// (1) multiply samples right
276 "vmlal.s16 q4, d13, d21 \n"// (1) multiply samples right
277
278 // moving these ARM before neon seems to be slower
279 "subs %[count], %[count], #8 \n"// (1) update loop counter
280 "sub %[sP], %[sP], #32 \n"// (0) move pointer to next set of samples
281
282 // sP used after branch (warning)
283 "bne 1b \n"// loop
284
285 ASSEMBLY_ACCUMULATE_STEREO
286
287 : [out] "=Uv" (out[0]),
288 [count] "+r" (count),
289 [coefsP0] "+r" (coefsP),
290 [coefsN0] "+r" (coefsN),
291 [coefsP1] "+r" (coefsP1),
292 [coefsN1] "+r" (coefsN1),
293 [sP] "+r" (sP),
294 [sN] "+r" (sN)
295 : [lerpP] "r" (lerpP),
296 [vLR] "r" (volumeLR)
297 : "cc", "memory",
298 "q0", "q1", "q2", "q3",
299 "q4", "q5", "q6",
300 "q8", "q9", "q10", "q11"
301 );
302}
303
304template <>
305inline void ProcessL<1, 16>(int32_t* const out,
306 int count,
307 const int32_t* coefsP,
308 const int32_t* coefsN,
309 const int16_t* sP,
310 const int16_t* sN,
311 const int32_t* const volumeLR)
312{
313 const int CHANNELS = 1; // template specialization does not preserve params
314 const int STRIDE = 16;
315 sP -= CHANNELS*((STRIDE>>1)-1);
316 asm (
317 "veor q0, q0, q0 \n"// result, initialize to 0
318
319 "1: \n"
320
321 "vld1.16 {q2}, [%[sP]] \n"// load 8 16-bits mono samples
322 "vld1.16 {q3}, [%[sN]]! \n"// load 8 16-bits mono samples
323 "vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs
324 "vld1.32 {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs
325
Andy Hungd7a77152015-02-06 14:58:38 -0800326 "vrev64.16 q2, q2 \n"// reverse 8 samples of the positive side
Andy Hung86eae0e2013-12-09 12:12:46 -0800327
328 "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
329 "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits
330
331 "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits
332 "vshll.s16 q15, d7, #15 \n"// extend samples to 31 bits
333
Andy Hungd7a77152015-02-06 14:58:38 -0800334 "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples
335 "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples
336 "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples
337 "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples
Andy Hung86eae0e2013-12-09 12:12:46 -0800338
339 "vadd.s32 q0, q0, q12 \n"// accumulate result
340 "vadd.s32 q13, q13, q14 \n"// accumulate result
341 "vadd.s32 q0, q0, q15 \n"// accumulate result
342 "vadd.s32 q0, q0, q13 \n"// accumulate result
343
344 "sub %[sP], %[sP], #16 \n"// move pointer to next set of samples
345 "subs %[count], %[count], #8 \n"// update loop counter
346
347 "bne 1b \n"// loop
348
349 ASSEMBLY_ACCUMULATE_MONO
350
351 : [out] "=Uv" (out[0]),
352 [count] "+r" (count),
353 [coefsP0] "+r" (coefsP),
354 [coefsN0] "+r" (coefsN),
355 [sP] "+r" (sP),
356 [sN] "+r" (sN)
357 : [vLR] "r" (volumeLR)
358 : "cc", "memory",
359 "q0", "q1", "q2", "q3",
360 "q8", "q9", "q10", "q11",
361 "q12", "q13", "q14", "q15"
362 );
363}
364
365template <>
366inline void ProcessL<2, 16>(int32_t* const out,
367 int count,
368 const int32_t* coefsP,
369 const int32_t* coefsN,
370 const int16_t* sP,
371 const int16_t* sN,
372 const int32_t* const volumeLR)
373{
374 const int CHANNELS = 2; // template specialization does not preserve params
375 const int STRIDE = 16;
376 sP -= CHANNELS*((STRIDE>>1)-1);
377 asm (
378 "veor q0, q0, q0 \n"// result, initialize to 0
379 "veor q4, q4, q4 \n"// result, initialize to 0
380
381 "1: \n"
382
Andy Hungd7a77152015-02-06 14:58:38 -0800383 "vld2.16 {q2, q3}, [%[sP]] \n"// load 8 16-bits stereo frames
384 "vld2.16 {q5, q6}, [%[sN]]! \n"// load 8 16-bits stereo frames
385 "vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs
386 "vld1.32 {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs
Andy Hung86eae0e2013-12-09 12:12:46 -0800387
Andy Hungd7a77152015-02-06 14:58:38 -0800388 "vrev64.16 q2, q2 \n"// reverse 8 samples of positive left
389 "vrev64.16 q3, q3 \n"// reverse 8 samples of positive right
Andy Hung86eae0e2013-12-09 12:12:46 -0800390
391 "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
392 "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits
393
394 "vshll.s16 q14, d10, #15 \n"// extend samples to 31 bits
395 "vshll.s16 q15, d11, #15 \n"// extend samples to 31 bits
396
Andy Hungd7a77152015-02-06 14:58:38 -0800397 "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by coef
398 "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by coef
399 "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by coef
400 "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by coef
Andy Hung86eae0e2013-12-09 12:12:46 -0800401
402 "vadd.s32 q0, q0, q12 \n"// accumulate result
403 "vadd.s32 q13, q13, q14 \n"// accumulate result
Andy Hungd7a77152015-02-06 14:58:38 -0800404 "vadd.s32 q0, q0, q15 \n"// accumulate result
405 "vadd.s32 q0, q0, q13 \n"// accumulate result
Andy Hung86eae0e2013-12-09 12:12:46 -0800406
407 "vshll.s16 q12, d6, #15 \n"// extend samples to 31 bits
408 "vshll.s16 q13, d7, #15 \n"// extend samples to 31 bits
409
410 "vshll.s16 q14, d12, #15 \n"// extend samples to 31 bits
411 "vshll.s16 q15, d13, #15 \n"// extend samples to 31 bits
412
Andy Hungd7a77152015-02-06 14:58:38 -0800413 "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by coef
414 "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by coef
415 "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by coef
416 "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by coef
Andy Hung86eae0e2013-12-09 12:12:46 -0800417
418 "vadd.s32 q4, q4, q12 \n"// accumulate result
419 "vadd.s32 q13, q13, q14 \n"// accumulate result
Andy Hungd7a77152015-02-06 14:58:38 -0800420 "vadd.s32 q4, q4, q15 \n"// accumulate result
421 "vadd.s32 q4, q4, q13 \n"// accumulate result
Andy Hung86eae0e2013-12-09 12:12:46 -0800422
423 "subs %[count], %[count], #8 \n"// update loop counter
424 "sub %[sP], %[sP], #32 \n"// move pointer to next set of samples
425
426 "bne 1b \n"// loop
427
428 ASSEMBLY_ACCUMULATE_STEREO
429
430 : [out] "=Uv" (out[0]),
431 [count] "+r" (count),
432 [coefsP0] "+r" (coefsP),
433 [coefsN0] "+r" (coefsN),
434 [sP] "+r" (sP),
435 [sN] "+r" (sN)
436 : [vLR] "r" (volumeLR)
437 : "cc", "memory",
438 "q0", "q1", "q2", "q3",
439 "q4", "q5", "q6",
440 "q8", "q9", "q10", "q11",
441 "q12", "q13", "q14", "q15"
442 );
443}
444
445template <>
446inline void Process<1, 16>(int32_t* const out,
447 int count,
448 const int32_t* coefsP,
449 const int32_t* coefsN,
450 const int32_t* coefsP1,
451 const int32_t* coefsN1,
452 const int16_t* sP,
453 const int16_t* sN,
454 uint32_t lerpP,
455 const int32_t* const volumeLR)
456{
457 const int CHANNELS = 1; // template specialization does not preserve params
458 const int STRIDE = 16;
459 sP -= CHANNELS*((STRIDE>>1)-1);
460 asm (
461 "vmov.32 d2[0], %[lerpP] \n"// load the positive phase
462 "veor q0, q0, q0 \n"// result, initialize to 0
463
464 "1: \n"
465
466 "vld1.16 {q2}, [%[sP]] \n"// load 8 16-bits mono samples
467 "vld1.16 {q3}, [%[sN]]! \n"// load 8 16-bits mono samples
468 "vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs
469 "vld1.32 {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs
470 "vld1.32 {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
471 "vld1.32 {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs
472
473 "vsub.s32 q12, q12, q8 \n"// interpolate (step1)
474 "vsub.s32 q13, q13, q9 \n"// interpolate (step1)
475 "vsub.s32 q14, q14, q10 \n"// interpolate (step1)
476 "vsub.s32 q15, q15, q11 \n"// interpolate (step1)
477
478 "vqrdmulh.s32 q12, q12, d2[0] \n"// interpolate (step2)
479 "vqrdmulh.s32 q13, q13, d2[0] \n"// interpolate (step2)
480 "vqrdmulh.s32 q14, q14, d2[0] \n"// interpolate (step2)
481 "vqrdmulh.s32 q15, q15, d2[0] \n"// interpolate (step2)
482
483 "vadd.s32 q8, q8, q12 \n"// interpolate (step3)
484 "vadd.s32 q9, q9, q13 \n"// interpolate (step3)
485 "vadd.s32 q10, q10, q14 \n"// interpolate (step3)
486 "vadd.s32 q11, q11, q15 \n"// interpolate (step3)
487
Andy Hungd7a77152015-02-06 14:58:38 -0800488 "vrev64.16 q2, q2 \n"// reverse 8 samples of the positive side
Andy Hung86eae0e2013-12-09 12:12:46 -0800489
490 "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
491 "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits
492
493 "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits
494 "vshll.s16 q15, d7, #15 \n"// extend samples to 31 bits
495
496 "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef
497 "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef
498 "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
499 "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef
500
501 "vadd.s32 q0, q0, q12 \n"// accumulate result
502 "vadd.s32 q13, q13, q14 \n"// accumulate result
503 "vadd.s32 q0, q0, q15 \n"// accumulate result
504 "vadd.s32 q0, q0, q13 \n"// accumulate result
505
506 "sub %[sP], %[sP], #16 \n"// move pointer to next set of samples
507 "subs %[count], %[count], #8 \n"// update loop counter
508
509 "bne 1b \n"// loop
510
511 ASSEMBLY_ACCUMULATE_MONO
512
513 : [out] "=Uv" (out[0]),
514 [count] "+r" (count),
515 [coefsP0] "+r" (coefsP),
516 [coefsN0] "+r" (coefsN),
517 [coefsP1] "+r" (coefsP1),
518 [coefsN1] "+r" (coefsN1),
519 [sP] "+r" (sP),
520 [sN] "+r" (sN)
521 : [lerpP] "r" (lerpP),
522 [vLR] "r" (volumeLR)
523 : "cc", "memory",
524 "q0", "q1", "q2", "q3",
525 "q8", "q9", "q10", "q11",
526 "q12", "q13", "q14", "q15"
527 );
528}
529
530template <>
531inline void Process<2, 16>(int32_t* const out,
532 int count,
533 const int32_t* coefsP,
534 const int32_t* coefsN,
535 const int32_t* coefsP1,
536 const int32_t* coefsN1,
537 const int16_t* sP,
538 const int16_t* sN,
539 uint32_t lerpP,
540 const int32_t* const volumeLR)
541{
542 const int CHANNELS = 2; // template specialization does not preserve params
543 const int STRIDE = 16;
544 sP -= CHANNELS*((STRIDE>>1)-1);
545 asm (
546 "vmov.32 d2[0], %[lerpP] \n"// load the positive phase
547 "veor q0, q0, q0 \n"// result, initialize to 0
548 "veor q4, q4, q4 \n"// result, initialize to 0
549
550 "1: \n"
551
Andy Hungd7a77152015-02-06 14:58:38 -0800552 "vld2.16 {q2, q3}, [%[sP]] \n"// load 8 16-bits stereo frames
553 "vld2.16 {q5, q6}, [%[sN]]! \n"// load 8 16-bits stereo frames
Andy Hung86eae0e2013-12-09 12:12:46 -0800554 "vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs
555 "vld1.32 {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs
556 "vld1.32 {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
557 "vld1.32 {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs
558
559 "vsub.s32 q12, q12, q8 \n"// interpolate (step1)
560 "vsub.s32 q13, q13, q9 \n"// interpolate (step1)
561 "vsub.s32 q14, q14, q10 \n"// interpolate (step1)
562 "vsub.s32 q15, q15, q11 \n"// interpolate (step1)
563
564 "vqrdmulh.s32 q12, q12, d2[0] \n"// interpolate (step2)
565 "vqrdmulh.s32 q13, q13, d2[0] \n"// interpolate (step2)
566 "vqrdmulh.s32 q14, q14, d2[0] \n"// interpolate (step2)
567 "vqrdmulh.s32 q15, q15, d2[0] \n"// interpolate (step2)
568
569 "vadd.s32 q8, q8, q12 \n"// interpolate (step3)
570 "vadd.s32 q9, q9, q13 \n"// interpolate (step3)
571 "vadd.s32 q10, q10, q14 \n"// interpolate (step3)
572 "vadd.s32 q11, q11, q15 \n"// interpolate (step3)
573
Andy Hungd7a77152015-02-06 14:58:38 -0800574 "vrev64.16 q2, q2 \n"// reverse 8 samples of positive left
575 "vrev64.16 q3, q3 \n"// reverse 8 samples of positive right
Andy Hung86eae0e2013-12-09 12:12:46 -0800576
577 "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
578 "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits
579
580 "vshll.s16 q14, d10, #15 \n"// extend samples to 31 bits
581 "vshll.s16 q15, d11, #15 \n"// extend samples to 31 bits
582
583 "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef
584 "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef
585 "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
586 "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef
587
588 "vadd.s32 q0, q0, q12 \n"// accumulate result
589 "vadd.s32 q13, q13, q14 \n"// accumulate result
Andy Hungd7a77152015-02-06 14:58:38 -0800590 "vadd.s32 q0, q0, q15 \n"// accumulate result
591 "vadd.s32 q0, q0, q13 \n"// accumulate result
Andy Hung86eae0e2013-12-09 12:12:46 -0800592
593 "vshll.s16 q12, d6, #15 \n"// extend samples to 31 bits
594 "vshll.s16 q13, d7, #15 \n"// extend samples to 31 bits
595
596 "vshll.s16 q14, d12, #15 \n"// extend samples to 31 bits
597 "vshll.s16 q15, d13, #15 \n"// extend samples to 31 bits
598
599 "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef
600 "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef
601 "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
602 "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef
603
604 "vadd.s32 q4, q4, q12 \n"// accumulate result
605 "vadd.s32 q13, q13, q14 \n"// accumulate result
Andy Hungd7a77152015-02-06 14:58:38 -0800606 "vadd.s32 q4, q4, q15 \n"// accumulate result
607 "vadd.s32 q4, q4, q13 \n"// accumulate result
Andy Hung86eae0e2013-12-09 12:12:46 -0800608
609 "subs %[count], %[count], #8 \n"// update loop counter
610 "sub %[sP], %[sP], #32 \n"// move pointer to next set of samples
611
612 "bne 1b \n"// loop
613
614 ASSEMBLY_ACCUMULATE_STEREO
615
616 : [out] "=Uv" (out[0]),
617 [count] "+r" (count),
618 [coefsP0] "+r" (coefsP),
619 [coefsN0] "+r" (coefsN),
620 [coefsP1] "+r" (coefsP1),
621 [coefsN1] "+r" (coefsN1),
622 [sP] "+r" (sP),
623 [sN] "+r" (sN)
624 : [lerpP] "r" (lerpP),
625 [vLR] "r" (volumeLR)
626 : "cc", "memory",
627 "q0", "q1", "q2", "q3",
628 "q4", "q5", "q6",
629 "q8", "q9", "q10", "q11",
630 "q12", "q13", "q14", "q15"
631 );
632}

#endif //USE_NEON

} // namespace android

#endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H*/