Blame - services/audioflinger/AudioResamplerFirProcessNeon.h - android_frameworks_av

blob: f311cef8a92679abe0d8a2f67599298b86cf8ad8 [file] [log] [blame]

Andy Hung	86eae0e	2013-12-09 12:12:46 -0800	[diff] [blame]	1	/*
				2	* Copyright (C) 2013 The Android Open Source Project
				3	*
				4	* Licensed under the Apache License, Version 2.0 (the "License");
				5	* you may not use this file except in compliance with the License.
				6	* You may obtain a copy of the License at
				7	*
				8	* http://www.apache.org/licenses/LICENSE-2.0
				9	*
				10	* Unless required by applicable law or agreed to in writing, software
				11	* distributed under the License is distributed on an "AS IS" BASIS,
				12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	* See the License for the specific language governing permissions and
				14	* limitations under the License.
				15	*/
				16
				17	#ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H
				18	#define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H
				19
				20	namespace android {
				21
				22	// depends on AudioResamplerFirOps.h, AudioResamplerFirProcess.h
				23
				24	#if USE_NEON
				25	//
				26	// NEON specializations are enabled for Process() and ProcessL()
				27	//
				28	// TODO: Stride 16 and Stride 8 can be combined with one pass stride 8 (if necessary)
				29	// and looping stride 16 (or vice versa). This has some polyphase coef data alignment
				30	// issues with S16 coefs. Consider this later.
				31
				32	// Macros to save a mono/stereo accumulator sample in q0 (and q4) as stereo out.
				// Epilogue for the mono kernels; expects the 4 partial sums in q0 (d0, d1).
				// Reduces q0 to a single sum replicated in both lanes of d0, scales it by the two
				// 32-bit volumes loaded from vLR (vqrdmulh.s32), then saturating-adds the pair into
				// the two 32-bit words at %[out]. The ":64" qualifier asserts that vLR is 8-byte
				// aligned. Overwrites d0-d3 (q0, q1).
				33	#define ASSEMBLY_ACCUMULATE_MONO \
				34	"vld1.s32 {d2}, [%[vLR]:64] \n"/* (1) load the two volumes (L, R) */\
				35	"vld1.s32 {d3}, %[out] \n"/* (2) unaligned load of the two output words */\
				36	"vpadd.s32 d0, d0, d1 \n"/* (1) pairwise add: 4 partial sums -> 2 */\
				37	"vpadd.s32 d0, d0, d0 \n"/* (1+4d) finish the sum and replicate it in both L/R lanes */\
				38	"vqrdmulh.s32 d0, d0, d2 \n"/* (2+3d) apply volume */\
				39	"vqadd.s32 d3, d3, d0 \n"/* (1+4d) accumulate result (saturating) */\
				40	"vst1.s32 {d3}, %[out] \n"/* (2+2d) store result */
				41
				// Epilogue for the stereo kernels; expects the left partial sums in q0 (d0, d1)
				// and the right partial sums in q4 (d8, d9).
				// Reduces each accumulator to one sum, packs them as {L, R} in d0, scales by the two
				// 32-bit volumes loaded from vLR (vqrdmulh.s32), then saturating-adds the pair into
				// the two 32-bit words at %[out]. The ":64" qualifier asserts that vLR is 8-byte
				// aligned. Overwrites d0, d2, d3 and d8.
				42	#define ASSEMBLY_ACCUMULATE_STEREO \
				43	"vld1.s32 {d2}, [%[vLR]:64] \n"/* (1) load the two volumes (L, R) */\
				44	"vld1.s32 {d3}, %[out] \n"/* (2) unaligned load of the two output words */\
				45	"vpadd.s32 d0, d0, d1 \n"/* (1) pairwise add q0: 4 left partial sums -> 2 */\
				46	"vpadd.s32 d8, d8, d9 \n"/* (1) pairwise add q4: 4 right partial sums -> 2 */\
				47	"vpadd.s32 d0, d0, d8 \n"/* (1+4d) finish both sums, giving d0 = {L, R} */\
				48	"vqrdmulh.s32 d0, d0, d2 \n"/* (2+3d) apply volume */\
				49	"vqadd.s32 d3, d3, d0 \n"/* (1+4d) accumulate result (saturating) */\
				50	"vst1.s32 {d3}, %[out] \n"/* (2+2d) store result */
				51
				// NEON specialization of ProcessL for 1 channel, stride 16, 16-bit coefficients
				// (no coefficient interpolation).
				// Each loop pass multiply-accumulates into q0:
				//   - 8 samples loaded at sP (reversed within each group of 4) with 8 coefs from coefsP;
				//     sP then steps back by 16 bytes;
				//   - 8 samples loaded at sN (post-incremented) with 8 coefs from coefsN.
				// count drops by 8 per pass and the loop exits only when it reaches exactly 0; the body
				// always runs at least once, so callers presumably pass a positive multiple of 8.
				// ASSEMBLY_ACCUMULATE_MONO then reduces q0, applies volumeLR and accumulates into out.
				// The ":128" qualifiers assert that coefsP / coefsN are 16-byte aligned.
				52	template <>
				53	inline void ProcessL<1, 16>(int32_t* const out,
				54	int count,
				55	const int16_t* coefsP,
				56	const int16_t* coefsN,
				57	const int16_t* sP,
				58	const int16_t* sN,
				59	const int32_t* const volumeLR)
				60	{
				61	const int CHANNELS = 1; // template specialization does not preserve params
				62	const int STRIDE = 16;
				63	sP -= CHANNELS*((STRIDE>>1)-1); // back up 7 samples so the first 8-sample load ends at the original sP
				64	asm (
				65	"veor q0, q0, q0 \n"// (0 - combines+) accumulator = 0
				66
				67	"1: \n"
				68
				69	"vld1.16 {q2}, [%[sP]] \n"// (2+0d) load 8 16-bits mono samples
				70	"vld1.16 {q3}, [%[sN]]! \n"// (2) load 8 16-bits mono samples
				71	"vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs
				72	"vld1.16 {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs
				73
				74	"vrev64.16 q2, q2 \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4
				75
				76	// reordering the vmlal to do d6, d7 before d4, d5 is slower(?)
				77	"vmlal.s16 q0, d4, d17 \n"// (1+0d) multiply (reversed)samples by coef
				78	"vmlal.s16 q0, d5, d16 \n"// (1) multiply (reversed)samples by coef
				79	"vmlal.s16 q0, d6, d20 \n"// (1) multiply neg samples
				80	"vmlal.s16 q0, d7, d21 \n"// (1) multiply neg samples
				81
				82	// moving these ARM instructions before neon above seems to be slower
				83	"subs %[count], %[count], #8 \n"// (1) update loop counter
				84	"sub %[sP], %[sP], #16 \n"// (0) move pointer to next set of samples
				85
				86	// sP used after branch (warning)
				87	"bne 1b \n"// loop until count reaches 0
				88
				89	ASSEMBLY_ACCUMULATE_MONO
				90
				91	: [out] "=Uv" (out[0]), // NOTE(review): declared write-only (=) and covers out[0] only, yet the epilogue loads and stores 64 bits here - presumably relies on the memory clobber; confirm
				92	[count] "+r" (count),
				93	[coefsP0] "+r" (coefsP),
				94	[coefsN0] "+r" (coefsN),
				95	[sP] "+r" (sP),
				96	[sN] "+r" (sN)
				97	: [vLR] "r" (volumeLR)
				98	: "cc", "memory",
				99	"q0", "q1", "q2", "q3",
				100	"q8", "q10"
				101	);
				102	}
				103
				// NEON specialization of ProcessL for 2 channels, stride 16, 16-bit coefficients
				// (no coefficient interpolation).
				// Each loop pass de-interleaves 8 stereo frames from sP (left -> q2, right -> q3, both
				// reversed within each group of 4) and 8 frames from sN (left -> q5, right -> q6), then
				// multiply-accumulates left samples into q0 and right samples into q4. Both channels use
				// the same 8 coefs from coefsP (positive side) and coefsN (negative side).
				// sP steps back by 32 bytes per pass; sN, coefsP and coefsN are post-incremented.
				// count drops by 8 per pass and the loop exits only when it reaches exactly 0; the body
				// always runs at least once, so callers presumably pass a positive multiple of 8.
				// ASSEMBLY_ACCUMULATE_STEREO then reduces q0/q4, applies volumeLR and accumulates into out.
				104	template <>
				105	inline void ProcessL<2, 16>(int32_t* const out,
				106	int count,
				107	const int16_t* coefsP,
				108	const int16_t* coefsN,
				109	const int16_t* sP,
				110	const int16_t* sN,
				111	const int32_t* const volumeLR)
				112	{
				113	const int CHANNELS = 2; // template specialization does not preserve params
				114	const int STRIDE = 16;
				115	sP -= CHANNELS*((STRIDE>>1)-1); // back up 7 frames so the first 8-frame load ends with the frame at the original sP
				116	asm (
				117	"veor q0, q0, q0 \n"// (1) acc_L = 0
				118	"veor q4, q4, q4 \n"// (0 combines+) acc_R = 0
				119
				120	"1: \n"
				121
				122	"vld2.16 {q2, q3}, [%[sP]] \n"// (3+0d) load 8 16-bits stereo samples
				123	"vld2.16 {q5, q6}, [%[sN]]! \n"// (3) load 8 16-bits stereo samples
				124	"vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs
				125	"vld1.16 {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs
				126
				127	"vrev64.16 q2, q2 \n"// (1) reverse 8 frames of the left positive
				128	"vrev64.16 q3, q3 \n"// (0 combines+) reverse right positive
				129
				130	"vmlal.s16 q0, d4, d17 \n"// (1) multiply (reversed) samples left
				131	"vmlal.s16 q0, d5, d16 \n"// (1) multiply (reversed) samples left
				132	"vmlal.s16 q4, d6, d17 \n"// (1) multiply (reversed) samples right
				133	"vmlal.s16 q4, d7, d16 \n"// (1) multiply (reversed) samples right
				134	"vmlal.s16 q0, d10, d20 \n"// (1) multiply samples left
				135	"vmlal.s16 q0, d11, d21 \n"// (1) multiply samples left
				136	"vmlal.s16 q4, d12, d20 \n"// (1) multiply samples right
				137	"vmlal.s16 q4, d13, d21 \n"// (1) multiply samples right
				138
				139	// moving these ARM before neon seems to be slower
				140	"subs %[count], %[count], #8 \n"// (1) update loop counter
				141	"sub %[sP], %[sP], #32 \n"// (0) move pointer to next set of samples
				142
				143	// sP used after branch (warning)
				144	"bne 1b \n"// loop until count reaches 0
				145
				146	ASSEMBLY_ACCUMULATE_STEREO
				147
				148	: [out] "=Uv" (out[0]), // NOTE(review): declared write-only (=) and covers out[0] only, yet the epilogue loads and stores 64 bits here - presumably relies on the memory clobber; confirm
				149	[count] "+r" (count),
				150	[coefsP0] "+r" (coefsP),
				151	[coefsN0] "+r" (coefsN),
				152	[sP] "+r" (sP),
				153	[sN] "+r" (sN)
				154	: [vLR] "r" (volumeLR)
				155	: "cc", "memory",
				156	"q0", "q1", "q2", "q3",
				157	"q4", "q5", "q6",
				158	"q8", "q10"
				159	);
				160	}
				161
				// NEON specialization of Process for 1 channel, stride 16, 16-bit coefficients,
				// with linear interpolation between two coefficient sets.
				// Per loop pass, for 8 coefs on each side (all arithmetic in 16 bits):
				//   positive side: coef = coefsP  + vqrdmulh(coefsP1 - coefsP,  lerp)
				//   negative side: coef = coefsN1 + vqrdmulh(coefsN  - coefsN1, lerp)
				// where lerp is the low 16 bits of lerpP (16-bit lane 0 of d2).
				// The 8 samples at sP (reversed within each group of 4) and the 8 samples at sN are then
				// multiply-accumulated with these coefs into q0. sP steps back by 16 bytes per pass; sN
				// and the four coef pointers are post-incremented.
				// count drops by 8 per pass and the loop exits only when it reaches exactly 0; the body
				// always runs at least once, so callers presumably pass a positive multiple of 8.
				// ASSEMBLY_ACCUMULATE_MONO then reduces q0, applies volumeLR and accumulates into out.
				162	template <>
				163	inline void Process<1, 16>(int32_t* const out,
				164	int count,
				165	const int16_t* coefsP,
				166	const int16_t* coefsN,
				167	const int16_t* coefsP1,
				168	const int16_t* coefsN1,
				169	const int16_t* sP,
				170	const int16_t* sN,
				171	uint32_t lerpP,
				172	const int32_t* const volumeLR)
				173	{
				174	const int CHANNELS = 1; // template specialization does not preserve params
				175	const int STRIDE = 16;
				176	sP -= CHANNELS*((STRIDE>>1)-1); // back up 7 samples so the first 8-sample load ends at the original sP
				177	asm (
				178	"vmov.32 d2[0], %[lerpP] \n"// load the positive phase S32 Q15
				179	"veor q0, q0, q0 \n"// (0 - combines+) accumulator = 0
				180
				181	"1: \n"
				182
				183	"vld1.16 {q2}, [%[sP]] \n"// (2+0d) load 8 16-bits mono samples
				184	"vld1.16 {q3}, [%[sN]]! \n"// (2) load 8 16-bits mono samples
				185	"vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs
				186	"vld1.16 {q9}, [%[coefsP1]:128]! \n"// (1) load 8 16-bits coefs for interpolation
				187	"vld1.16 {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
				188	"vld1.16 {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation
				189
				190	"vsub.s16 q9, q9, q8 \n"// (1) interpolate (step1) 1st set of coefs
				191	"vsub.s16 q11, q11, q10 \n"// (1) interpolate (step1) 2nd set of coefs
				192
				193	"vqrdmulh.s16 q9, q9, d2[0] \n"// (2) interpolate (step2) 1st set of coefs
				194	"vqrdmulh.s16 q11, q11, d2[0] \n"// (2) interpolate (step2) 2nd set of coefs
				195
				196	"vrev64.16 q2, q2 \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4
				197
				198	"vadd.s16 q8, q8, q9 \n"// (1+2d) interpolate (step3) 1st set
				199	"vadd.s16 q10, q10, q11 \n"// (1+1d) interpolate (step3) 2nd set
				200
				201	// reordering the vmlal to do d6, d7 before d4, d5 is slower(?)
				202	"vmlal.s16 q0, d4, d17 \n"// (1+0d) multiply reversed samples by coef
				203	"vmlal.s16 q0, d5, d16 \n"// (1) multiply reversed samples by coef
				204	"vmlal.s16 q0, d6, d20 \n"// (1) multiply neg samples
				205	"vmlal.s16 q0, d7, d21 \n"// (1) multiply neg samples
				206
				207	// moving these ARM instructions before neon above seems to be slower
				208	"subs %[count], %[count], #8 \n"// (1) update loop counter
				209	"sub %[sP], %[sP], #16 \n"// (0) move pointer to next set of samples
				210
				211	// sP used after branch (warning)
				212	"bne 1b \n"// loop until count reaches 0
				213
				214	ASSEMBLY_ACCUMULATE_MONO
				215
				216	: [out] "=Uv" (out[0]), // NOTE(review): declared write-only (=) and covers out[0] only, yet the epilogue loads and stores 64 bits here - presumably relies on the memory clobber; confirm
				217	[count] "+r" (count),
				218	[coefsP0] "+r" (coefsP),
				219	[coefsN0] "+r" (coefsN),
				220	[coefsP1] "+r" (coefsP1),
				221	[coefsN1] "+r" (coefsN1),
				222	[sP] "+r" (sP),
				223	[sN] "+r" (sN)
				224	: [lerpP] "r" (lerpP),
				225	[vLR] "r" (volumeLR)
				226	: "cc", "memory",
				227	"q0", "q1", "q2", "q3",
				228	"q8", "q9", "q10", "q11"
				229	);
				230	}
				231
				// NEON specialization of Process for 2 channels, stride 16, 16-bit coefficients,
				// with linear interpolation between two coefficient sets.
				// Per loop pass, for 8 coefs on each side (all arithmetic in 16 bits):
				//   positive side: coef = coefsP  + vqrdmulh(coefsP1 - coefsP,  lerp)
				//   negative side: coef = coefsN1 + vqrdmulh(coefsN  - coefsN1, lerp)
				// where lerp is the low 16 bits of lerpP (16-bit lane 0 of d2).
				// 8 stereo frames from sP (left -> q2, right -> q3, reversed within each group of 4) and
				// 8 frames from sN (left -> q5, right -> q6) are multiply-accumulated with these coefs:
				// left into q0, right into q4. sP steps back by 32 bytes per pass; sN and the four coef
				// pointers are post-incremented.
				// count drops by 8 per pass and the loop exits only when it reaches exactly 0; the body
				// always runs at least once, so callers presumably pass a positive multiple of 8.
				// ASSEMBLY_ACCUMULATE_STEREO then reduces q0/q4, applies volumeLR and accumulates into out.
				232	template <>
				233	inline void Process<2, 16>(int32_t* const out,
				234	int count,
				235	const int16_t* coefsP,
				236	const int16_t* coefsN,
				237	const int16_t* coefsP1,
				238	const int16_t* coefsN1,
				239	const int16_t* sP,
				240	const int16_t* sN,
				241	uint32_t lerpP,
				242	const int32_t* const volumeLR)
				243	{
				244	const int CHANNELS = 2; // template specialization does not preserve params
				245	const int STRIDE = 16;
				246	sP -= CHANNELS*((STRIDE>>1)-1); // back up 7 frames so the first 8-frame load ends with the frame at the original sP
				247	asm (
				248	"vmov.32 d2[0], %[lerpP] \n"// load the positive phase
				249	"veor q0, q0, q0 \n"// (1) acc_L = 0
				250	"veor q4, q4, q4 \n"// (0 combines+) acc_R = 0
				251
				252	"1: \n"
				253
				254	"vld2.16 {q2, q3}, [%[sP]] \n"// (3+0d) load 8 16-bits stereo samples
				255	"vld2.16 {q5, q6}, [%[sN]]! \n"// (3) load 8 16-bits stereo samples
				256	"vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs
				257	"vld1.16 {q9}, [%[coefsP1]:128]! \n"// (1) load 8 16-bits coefs for interpolation
				258	"vld1.16 {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
				259	"vld1.16 {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation
				260
				261	"vsub.s16 q9, q9, q8 \n"// (1) interpolate (step1) 1st set of coefs
				262	"vsub.s16 q11, q11, q10 \n"// (1) interpolate (step1) 2nd set of coefs
				263
				264	"vqrdmulh.s16 q9, q9, d2[0] \n"// (2) interpolate (step2) 1st set of coefs
				265	"vqrdmulh.s16 q11, q11, d2[0] \n"// (2) interpolate (step2) 2nd set of coefs
				266
				267	"vrev64.16 q2, q2 \n"// (1) reverse 8 frames of the left positive
				268	"vrev64.16 q3, q3 \n"// (1) reverse 8 frames of the right positive
				269
				270	"vadd.s16 q8, q8, q9 \n"// (1+1d) interpolate (step3) 1st set
				271	"vadd.s16 q10, q10, q11 \n"// (1+1d) interpolate (step3) 2nd set
				272
				273	"vmlal.s16 q0, d4, d17 \n"// (1) multiply reversed samples left
				274	"vmlal.s16 q0, d5, d16 \n"// (1) multiply reversed samples left
				275	"vmlal.s16 q4, d6, d17 \n"// (1) multiply reversed samples right
				276	"vmlal.s16 q4, d7, d16 \n"// (1) multiply reversed samples right
				277	"vmlal.s16 q0, d10, d20 \n"// (1) multiply samples left
				278	"vmlal.s16 q0, d11, d21 \n"// (1) multiply samples left
				279	"vmlal.s16 q4, d12, d20 \n"// (1) multiply samples right
				280	"vmlal.s16 q4, d13, d21 \n"// (1) multiply samples right
				281
				282	// moving these ARM before neon seems to be slower
				283	"subs %[count], %[count], #8 \n"// (1) update loop counter
				284	"sub %[sP], %[sP], #32 \n"// (0) move pointer to next set of samples
				285
				286	// sP used after branch (warning)
				287	"bne 1b \n"// loop until count reaches 0
				288
				289	ASSEMBLY_ACCUMULATE_STEREO
				290
				291	: [out] "=Uv" (out[0]), // NOTE(review): declared write-only (=) and covers out[0] only, yet the epilogue loads and stores 64 bits here - presumably relies on the memory clobber; confirm
				292	[count] "+r" (count),
				293	[coefsP0] "+r" (coefsP),
				294	[coefsN0] "+r" (coefsN),
				295	[coefsP1] "+r" (coefsP1),
				296	[coefsN1] "+r" (coefsN1),
				297	[sP] "+r" (sP),
				298	[sN] "+r" (sN)
				299	: [lerpP] "r" (lerpP),
				300	[vLR] "r" (volumeLR)
				301	: "cc", "memory",
				302	"q0", "q1", "q2", "q3",
				303	"q4", "q5", "q6",
				304	"q8", "q9", "q10", "q11"
				305	);
				306	}
				307
				// NEON specialization of ProcessL for 1 channel, stride 16, 32-bit coefficients
				// (no coefficient interpolation).
				// Each loop pass loads 8 samples at sP (reversed within each group of 4) and 8 at sN,
				// widens them to 32 bits with a left shift of 15, multiplies them by 8 coefs from coefsP
				// and 8 from coefsN using vqrdmulh.s32, and adds the products into q0 with vadd.s32
				// (non-saturating). sP steps back by 16 bytes per pass; sN, coefsP and coefsN are
				// post-incremented.
				// count drops by 8 per pass and the loop exits only when it reaches exactly 0; the body
				// always runs at least once, so callers presumably pass a positive multiple of 8.
				// ASSEMBLY_ACCUMULATE_MONO then reduces q0, applies volumeLR and accumulates into out.
				308	template <>
				309	inline void ProcessL<1, 16>(int32_t* const out,
				310	int count,
				311	const int32_t* coefsP,
				312	const int32_t* coefsN,
				313	const int16_t* sP,
				314	const int16_t* sN,
				315	const int32_t* const volumeLR)
				316	{
				317	const int CHANNELS = 1; // template specialization does not preserve params
				318	const int STRIDE = 16;
				319	sP -= CHANNELS*((STRIDE>>1)-1); // back up 7 samples so the first 8-sample load ends at the original sP
				320	asm (
				321	"veor q0, q0, q0 \n"// result, initialize to 0
				322
				323	"1: \n"
				324
				325	"vld1.16 {q2}, [%[sP]] \n"// load 8 16-bits mono samples
				326	"vld1.16 {q3}, [%[sN]]! \n"// load 8 16-bits mono samples
				327	"vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs
				328	"vld1.32 {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs
				329
				330	"vrev64.16 q2, q2 \n"// reverse 8 frames of the positive side
				331
				332	"vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
				333	"vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits
				334
				335	"vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits
				336	"vshll.s16 q15, d7, #15 \n"// extend samples to 31 bits
				337
				338	"vqrdmulh.s32 q12, q12, q9 \n"// multiply (reversed) samples by coef
				339	"vqrdmulh.s32 q13, q13, q8 \n"// multiply (reversed) samples by coef
				340	"vqrdmulh.s32 q14, q14, q10 \n"// multiply neg samples by coef
				341	"vqrdmulh.s32 q15, q15, q11 \n"// multiply neg samples by coef
				342
				343	"vadd.s32 q0, q0, q12 \n"// accumulate result
				344	"vadd.s32 q13, q13, q14 \n"// accumulate result
				345	"vadd.s32 q0, q0, q15 \n"// accumulate result
				346	"vadd.s32 q0, q0, q13 \n"// accumulate result
				347
				348	"sub %[sP], %[sP], #16 \n"// move pointer to next set of samples
				349	"subs %[count], %[count], #8 \n"// update loop counter
				350
				351	"bne 1b \n"// loop until count reaches 0
				352
				353	ASSEMBLY_ACCUMULATE_MONO
				354
				355	: [out] "=Uv" (out[0]), // NOTE(review): declared write-only (=) and covers out[0] only, yet the epilogue loads and stores 64 bits here - presumably relies on the memory clobber; confirm
				356	[count] "+r" (count),
				357	[coefsP0] "+r" (coefsP),
				358	[coefsN0] "+r" (coefsN),
				359	[sP] "+r" (sP),
				360	[sN] "+r" (sN)
				361	: [vLR] "r" (volumeLR)
				362	: "cc", "memory",
				363	"q0", "q1", "q2", "q3",
				364	"q8", "q9", "q10", "q11",
				365	"q12", "q13", "q14", "q15"
				366	);
				367	}
				368
				// NEON specialization of ProcessL for 2 channels, stride 16, 32-bit coefficients
				// (no coefficient interpolation).
				// Each loop pass de-interleaves 8 stereo frames from sP (left -> q2, right -> q3, both
				// reversed within each group of 4) and 8 frames from sN (left -> q5, right -> q6).
				// Samples are widened to 32 bits with a left shift of 15, multiplied by 8 coefs from
				// coefsP and 8 from coefsN using vqrdmulh.s32, and added with vadd.s32 (non-saturating):
				// left into q0 first, then right into q4, reusing q12-q15 as scratch.
				// sP steps back by 32 bytes per pass; sN, coefsP and coefsN are post-incremented.
				// count drops by 8 per pass and the loop exits only when it reaches exactly 0; the body
				// always runs at least once, so callers presumably pass a positive multiple of 8.
				// ASSEMBLY_ACCUMULATE_STEREO then reduces q0/q4, applies volumeLR and accumulates into out.
				369	template <>
				370	inline void ProcessL<2, 16>(int32_t* const out,
				371	int count,
				372	const int32_t* coefsP,
				373	const int32_t* coefsN,
				374	const int16_t* sP,
				375	const int16_t* sN,
				376	const int32_t* const volumeLR)
				377	{
				378	const int CHANNELS = 2; // template specialization does not preserve params
				379	const int STRIDE = 16;
				380	sP -= CHANNELS*((STRIDE>>1)-1); // back up 7 frames so the first 8-frame load ends with the frame at the original sP
				381	asm (
				382	"veor q0, q0, q0 \n"// left result, initialize to 0
				383	"veor q4, q4, q4 \n"// right result, initialize to 0
				384
				385	"1: \n"
				386
				387	"vld2.16 {q2, q3}, [%[sP]] \n"// load 8 16-bits stereo samples
				388	"vld2.16 {q5, q6}, [%[sN]]! \n"// load 8 16-bits stereo samples
				389	"vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs
				390	"vld1.32 {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs
				391
				392	"vrev64.16 q2, q2 \n"// reverse 8 frames of the left positive side
				393	"vrev64.16 q3, q3 \n"// reverse 8 frames of the right positive side
				394
				395	"vshll.s16 q12, d4, #15 \n"// extend left samples to 31 bits
				396	"vshll.s16 q13, d5, #15 \n"// extend left samples to 31 bits
				397
				398	"vshll.s16 q14, d10, #15 \n"// extend left samples to 31 bits
				399	"vshll.s16 q15, d11, #15 \n"// extend left samples to 31 bits
				400
				401	"vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by coef
				402	"vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by coef
				403	"vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by coef
				404	"vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by coef
				405
				406	"vadd.s32 q0, q0, q12 \n"// accumulate result
				407	"vadd.s32 q13, q13, q14 \n"// accumulate result
				408	"vadd.s32 q0, q0, q15 \n"// (+1) accumulate result
				409	"vadd.s32 q0, q0, q13 \n"// (+1) accumulate result
				410
				411	"vshll.s16 q12, d6, #15 \n"// extend right samples to 31 bits
				412	"vshll.s16 q13, d7, #15 \n"// extend right samples to 31 bits
				413
				414	"vshll.s16 q14, d12, #15 \n"// extend right samples to 31 bits
				415	"vshll.s16 q15, d13, #15 \n"// extend right samples to 31 bits
				416
				417	"vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by coef
				418	"vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by coef
				419	"vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by coef
				420	"vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by coef
				421
				422	"vadd.s32 q4, q4, q12 \n"// accumulate result
				423	"vadd.s32 q13, q13, q14 \n"// accumulate result
				424	"vadd.s32 q4, q4, q15 \n"// (+1) accumulate result
				425	"vadd.s32 q4, q4, q13 \n"// (+1) accumulate result
				426
				427	"subs %[count], %[count], #8 \n"// update loop counter
				428	"sub %[sP], %[sP], #32 \n"// move pointer to next set of samples
				429
				430	"bne 1b \n"// loop until count reaches 0
				431
				432	ASSEMBLY_ACCUMULATE_STEREO
				433
				434	: [out] "=Uv" (out[0]), // NOTE(review): declared write-only (=) and covers out[0] only, yet the epilogue loads and stores 64 bits here - presumably relies on the memory clobber; confirm
				435	[count] "+r" (count),
				436	[coefsP0] "+r" (coefsP),
				437	[coefsN0] "+r" (coefsN),
				438	[sP] "+r" (sP),
				439	[sN] "+r" (sN)
				440	: [vLR] "r" (volumeLR)
				441	: "cc", "memory",
				442	"q0", "q1", "q2", "q3",
				443	"q4", "q5", "q6",
				444	"q8", "q9", "q10", "q11",
				445	"q12", "q13", "q14", "q15"
				446	);
				447	}
				448
				// NEON specialization of Process for 1 channel, stride 16, 32-bit coefficients,
				// with linear interpolation between two coefficient sets.
				// Per loop pass, for 8 coefs on each side (all arithmetic in 32 bits):
				//   positive side: coef = coefsP  + vqrdmulh(coefsP1 - coefsP,  lerpP)
				//   negative side: coef = coefsN1 + vqrdmulh(coefsN  - coefsN1, lerpP)
				// The 8 samples at sP (reversed within each group of 4) and the 8 samples at sN are
				// widened to 32 bits with a left shift of 15, multiplied by these coefs using
				// vqrdmulh.s32, and added into q0 with vadd.s32 (non-saturating).
				// sP steps back by 16 bytes per pass; sN and the four coef pointers are post-incremented.
				// count drops by 8 per pass and the loop exits only when it reaches exactly 0; the body
				// always runs at least once, so callers presumably pass a positive multiple of 8.
				// ASSEMBLY_ACCUMULATE_MONO then reduces q0, applies volumeLR and accumulates into out.
				449	template <>
				450	inline void Process<1, 16>(int32_t* const out,
				451	int count,
				452	const int32_t* coefsP,
				453	const int32_t* coefsN,
				454	const int32_t* coefsP1,
				455	const int32_t* coefsN1,
				456	const int16_t* sP,
				457	const int16_t* sN,
				458	uint32_t lerpP,
				459	const int32_t* const volumeLR)
				460	{
				461	const int CHANNELS = 1; // template specialization does not preserve params
				462	const int STRIDE = 16;
				463	sP -= CHANNELS*((STRIDE>>1)-1); // back up 7 samples so the first 8-sample load ends at the original sP
				464	asm (
				465	"vmov.32 d2[0], %[lerpP] \n"// load the positive phase
				466	"veor q0, q0, q0 \n"// result, initialize to 0
				467
				468	"1: \n"
				469
				470	"vld1.16 {q2}, [%[sP]] \n"// load 8 16-bits mono samples
				471	"vld1.16 {q3}, [%[sN]]! \n"// load 8 16-bits mono samples
				472	"vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs
				473	"vld1.32 {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs for interpolation
				474	"vld1.32 {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
				475	"vld1.32 {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs for interpolation
				476
				477	"vsub.s32 q12, q12, q8 \n"// interpolate (step1)
				478	"vsub.s32 q13, q13, q9 \n"// interpolate (step1)
				479	"vsub.s32 q14, q14, q10 \n"// interpolate (step1)
				480	"vsub.s32 q15, q15, q11 \n"// interpolate (step1)
				481
				482	"vqrdmulh.s32 q12, q12, d2[0] \n"// interpolate (step2)
				483	"vqrdmulh.s32 q13, q13, d2[0] \n"// interpolate (step2)
				484	"vqrdmulh.s32 q14, q14, d2[0] \n"// interpolate (step2)
				485	"vqrdmulh.s32 q15, q15, d2[0] \n"// interpolate (step2)
				486
				487	"vadd.s32 q8, q8, q12 \n"// interpolate (step3)
				488	"vadd.s32 q9, q9, q13 \n"// interpolate (step3)
				489	"vadd.s32 q10, q10, q14 \n"// interpolate (step3)
				490	"vadd.s32 q11, q11, q15 \n"// interpolate (step3)
				491
				492	"vrev64.16 q2, q2 \n"// reverse 8 frames of the positive side
				493
				494	"vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
				495	"vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits
				496
				497	"vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits
				498	"vshll.s16 q15, d7, #15 \n"// extend samples to 31 bits
				499
				500	"vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef
				501	"vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef
				502	"vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
				503	"vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef
				504
				505	"vadd.s32 q0, q0, q12 \n"// accumulate result
				506	"vadd.s32 q13, q13, q14 \n"// accumulate result
				507	"vadd.s32 q0, q0, q15 \n"// accumulate result
				508	"vadd.s32 q0, q0, q13 \n"// accumulate result
				509
				510	"sub %[sP], %[sP], #16 \n"// move pointer to next set of samples
				511	"subs %[count], %[count], #8 \n"// update loop counter
				512
				513	"bne 1b \n"// loop until count reaches 0
				514
				515	ASSEMBLY_ACCUMULATE_MONO
				516
				517	: [out] "=Uv" (out[0]), // NOTE(review): declared write-only (=) and covers out[0] only, yet the epilogue loads and stores 64 bits here - presumably relies on the memory clobber; confirm
				518	[count] "+r" (count),
				519	[coefsP0] "+r" (coefsP),
				520	[coefsN0] "+r" (coefsN),
				521	[coefsP1] "+r" (coefsP1),
				522	[coefsN1] "+r" (coefsN1),
				523	[sP] "+r" (sP),
				524	[sN] "+r" (sN)
				525	: [lerpP] "r" (lerpP),
				526	[vLR] "r" (volumeLR)
				527	: "cc", "memory",
				528	"q0", "q1", "q2", "q3",
				529	"q8", "q9", "q10", "q11",
				530	"q12", "q13", "q14", "q15"
				531	);
				532	}
				533
				// NEON specialization of Process for 2 channels, stride 16, 32-bit coefficients,
				// with linear interpolation between two coefficient sets.
				// Per loop pass, for 8 coefs on each side (all arithmetic in 32 bits):
				//   positive side: coef = coefsP  + vqrdmulh(coefsP1 - coefsP,  lerpP)
				//   negative side: coef = coefsN1 + vqrdmulh(coefsN  - coefsN1, lerpP)
				// 8 stereo frames from sP (left -> q2, right -> q3, reversed within each group of 4) and
				// 8 frames from sN (left -> q5, right -> q6) are widened to 32 bits with a left shift of
				// 15, multiplied by these coefs using vqrdmulh.s32, and added with vadd.s32
				// (non-saturating): left into q0 first, then right into q4, reusing q12-q15 as scratch.
				// sP steps back by 32 bytes per pass; sN and the four coef pointers are post-incremented.
				// count drops by 8 per pass and the loop exits only when it reaches exactly 0; the body
				// always runs at least once, so callers presumably pass a positive multiple of 8.
				// ASSEMBLY_ACCUMULATE_STEREO then reduces q0/q4, applies volumeLR and accumulates into out.
				534	template <>
				535	inline void Process<2, 16>(int32_t* const out,
				536	int count,
				537	const int32_t* coefsP,
				538	const int32_t* coefsN,
				539	const int32_t* coefsP1,
				540	const int32_t* coefsN1,
				541	const int16_t* sP,
				542	const int16_t* sN,
				543	uint32_t lerpP,
				544	const int32_t* const volumeLR)
				545	{
				546	const int CHANNELS = 2; // template specialization does not preserve params
				547	const int STRIDE = 16;
				548	sP -= CHANNELS*((STRIDE>>1)-1); // back up 7 frames so the first 8-frame load ends with the frame at the original sP
				549	asm (
				550	"vmov.32 d2[0], %[lerpP] \n"// load the positive phase
				551	"veor q0, q0, q0 \n"// left result, initialize to 0
				552	"veor q4, q4, q4 \n"// right result, initialize to 0
				553
				554	"1: \n"
				555
				556	"vld2.16 {q2, q3}, [%[sP]] \n"// load 8 16-bits stereo samples
				557	"vld2.16 {q5, q6}, [%[sN]]! \n"// load 8 16-bits stereo samples
				558	"vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs
				559	"vld1.32 {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs for interpolation
				560	"vld1.32 {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
				561	"vld1.32 {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs for interpolation
				562
				563	"vsub.s32 q12, q12, q8 \n"// interpolate (step1)
				564	"vsub.s32 q13, q13, q9 \n"// interpolate (step1)
				565	"vsub.s32 q14, q14, q10 \n"// interpolate (step1)
				566	"vsub.s32 q15, q15, q11 \n"// interpolate (step1)
				567
				568	"vqrdmulh.s32 q12, q12, d2[0] \n"// interpolate (step2)
				569	"vqrdmulh.s32 q13, q13, d2[0] \n"// interpolate (step2)
				570	"vqrdmulh.s32 q14, q14, d2[0] \n"// interpolate (step2)
				571	"vqrdmulh.s32 q15, q15, d2[0] \n"// interpolate (step2)
				572
				573	"vadd.s32 q8, q8, q12 \n"// interpolate (step3)
				574	"vadd.s32 q9, q9, q13 \n"// interpolate (step3)
				575	"vadd.s32 q10, q10, q14 \n"// interpolate (step3)
				576	"vadd.s32 q11, q11, q15 \n"// interpolate (step3)
				577
				578	"vrev64.16 q2, q2 \n"// reverse 8 frames of the left positive side
				579	"vrev64.16 q3, q3 \n"// reverse 8 frames of the right positive side
				580
				581	"vshll.s16 q12, d4, #15 \n"// extend left samples to 31 bits
				582	"vshll.s16 q13, d5, #15 \n"// extend left samples to 31 bits
				583
				584	"vshll.s16 q14, d10, #15 \n"// extend left samples to 31 bits
				585	"vshll.s16 q15, d11, #15 \n"// extend left samples to 31 bits
				586
				587	"vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef
				588	"vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef
				589	"vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
				590	"vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef
				591
				592	"vadd.s32 q0, q0, q12 \n"// accumulate result
				593	"vadd.s32 q13, q13, q14 \n"// accumulate result
				594	"vadd.s32 q0, q0, q15 \n"// (+1) accumulate result
				595	"vadd.s32 q0, q0, q13 \n"// (+1) accumulate result
				596
				597	"vshll.s16 q12, d6, #15 \n"// extend right samples to 31 bits
				598	"vshll.s16 q13, d7, #15 \n"// extend right samples to 31 bits
				599
				600	"vshll.s16 q14, d12, #15 \n"// extend right samples to 31 bits
				601	"vshll.s16 q15, d13, #15 \n"// extend right samples to 31 bits
				602
				603	"vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef
				604	"vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef
				605	"vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
				606	"vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef
				607
				608	"vadd.s32 q4, q4, q12 \n"// accumulate result
				609	"vadd.s32 q13, q13, q14 \n"// accumulate result
				610	"vadd.s32 q4, q4, q15 \n"// (+1) accumulate result
				611	"vadd.s32 q4, q4, q13 \n"// (+1) accumulate result
				612
				613	"subs %[count], %[count], #8 \n"// update loop counter
				614	"sub %[sP], %[sP], #32 \n"// move pointer to next set of samples
				615
				616	"bne 1b \n"// loop until count reaches 0
				617
				618	ASSEMBLY_ACCUMULATE_STEREO
				619
				620	: [out] "=Uv" (out[0]), // NOTE(review): declared write-only (=) and covers out[0] only, yet the epilogue loads and stores 64 bits here - presumably relies on the memory clobber; confirm
				621	[count] "+r" (count),
				622	[coefsP0] "+r" (coefsP),
				623	[coefsN0] "+r" (coefsN),
				624	[coefsP1] "+r" (coefsP1),
				625	[coefsN1] "+r" (coefsN1),
				626	[sP] "+r" (sP),
				627	[sN] "+r" (sN)
				628	: [lerpP] "r" (lerpP),
				629	[vLR] "r" (volumeLR)
				630	: "cc", "memory",
				631	"q0", "q1", "q2", "q3",
				632	"q4", "q5", "q6",
				633	"q8", "q9", "q10", "q11",
				634	"q12", "q13", "q14", "q15"
				635	);
				636	}
				637
				// NEON specialization of ProcessL for 1 channel, stride 8, 16-bit coefficients
				// (no coefficient interpolation).
				// Each loop pass multiply-accumulates into q0:
				//   - 4 samples loaded at sP (reversed) with 4 coefs from coefsP; sP then steps back
				//     by 8 bytes;
				//   - 4 samples loaded at sN (post-incremented) with 4 coefs from coefsN.
				// count drops by 4 per pass and the loop exits only when it reaches exactly 0; the body
				// always runs at least once, so callers presumably pass a positive multiple of 4.
				// ASSEMBLY_ACCUMULATE_MONO then reduces q0, applies volumeLR and accumulates into out.
				// The ":64" qualifiers assert that coefsP / coefsN are 8-byte aligned.
				638	template <>
				639	inline void ProcessL<1, 8>(int32_t* const out,
				640	int count,
				641	const int16_t* coefsP,
				642	const int16_t* coefsN,
				643	const int16_t* sP,
				644	const int16_t* sN,
				645	const int32_t* const volumeLR)
				646	{
				647	const int CHANNELS = 1; // template specialization does not preserve params
				648	const int STRIDE = 8;
				649	sP -= CHANNELS*((STRIDE>>1)-1); // back up 3 samples so the first 4-sample load ends at the original sP
				650	asm (
				651	"veor q0, q0, q0 \n"// (0 - combines+) accumulator = 0
				652
				653	"1: \n"
				654
				655	"vld1.16 {d4}, [%[sP]] \n"// (2+0d) load 4 16-bits mono samples
				656	"vld1.16 {d6}, [%[sN]]! \n"// (2) load 4 16-bits mono samples
				657	"vld1.16 {d16}, [%[coefsP0]:64]! \n"// (1) load 4 16-bits coefs
				658	"vld1.16 {d20}, [%[coefsN0]:64]! \n"// (1) load 4 16-bits coefs
				659
				660	"vrev64.16 d4, d4 \n"// (1) reversed s3, s2, s1, s0
				661
				662	// carried over from the stride-16 kernel (this kernel only uses d4 and d6):
				663	"vmlal.s16 q0, d4, d16 \n"// (1) multiply (reversed)samples by coef
				664	"vmlal.s16 q0, d6, d20 \n"// (1) multiply neg samples
				665
				666	// moving these ARM instructions before neon above seems to be slower
				667	"subs %[count], %[count], #4 \n"// (1) update loop counter
				668	"sub %[sP], %[sP], #8 \n"// (0) move pointer to next set of samples
				669
				670	// sP used after branch (warning)
				671	"bne 1b \n"// loop until count reaches 0
				672
				673	ASSEMBLY_ACCUMULATE_MONO
				674
				675	: [out] "=Uv" (out[0]), // NOTE(review): declared write-only (=) and covers out[0] only, yet the epilogue loads and stores 64 bits here - presumably relies on the memory clobber; confirm
				676	[count] "+r" (count),
				677	[coefsP0] "+r" (coefsP),
				678	[coefsN0] "+r" (coefsN),
				679	[sP] "+r" (sP),
				680	[sN] "+r" (sN)
				681	: [vLR] "r" (volumeLR)
				682	: "cc", "memory",
				683	"q0", "q1", "q2", "q3",
				684	"q8", "q10"
				685	);
				686	}
				687
				// NEON specialization of ProcessL for 2 channels, stride 8, 16-bit coefficients
				// (no coefficient interpolation).
				// Each loop pass de-interleaves 4 stereo frames from sP (left -> d4, right -> d5, both
				// reversed) and 4 frames from sN (left -> d6, right -> d7), then multiply-accumulates
				// left samples into q0 and right samples into q4. Both channels use the same 4 coefs
				// from coefsP (positive side) and coefsN (negative side).
				// sP steps back by 16 bytes per pass; sN, coefsP and coefsN are post-incremented.
				// count drops by 4 per pass and the loop exits only when it reaches exactly 0; the body
				// always runs at least once, so callers presumably pass a positive multiple of 4.
				// ASSEMBLY_ACCUMULATE_STEREO then reduces q0/q4, applies volumeLR and accumulates into out.
				688	template <>
				689	inline void ProcessL<2, 8>(int32_t* const out,
				690	int count,
				691	const int16_t* coefsP,
				692	const int16_t* coefsN,
				693	const int16_t* sP,
				694	const int16_t* sN,
				695	const int32_t* const volumeLR)
				696	{
				697	const int CHANNELS = 2; // template specialization does not preserve params
				698	const int STRIDE = 8;
				699	sP -= CHANNELS*((STRIDE>>1)-1); // back up 3 frames so the first 4-frame load ends with the frame at the original sP
				700	asm (
				701	"veor q0, q0, q0 \n"// (1) acc_L = 0
				702	"veor q4, q4, q4 \n"// (0 combines+) acc_R = 0
				703
				704	"1: \n"
				705
				706	"vld2.16 {d4, d5}, [%[sP]] \n"// (2+0d) load 4 16-bits stereo frames (L in d4, R in d5)
				707	"vld2.16 {d6, d7}, [%[sN]]! \n"// (2) load 4 16-bits stereo frames (L in d6, R in d7)
				708	"vld1.16 {d16}, [%[coefsP0]:64]! \n"// (1) load 4 16-bits coefs
				709	"vld1.16 {d20}, [%[coefsN0]:64]! \n"// (1) load 4 16-bits coefs
				710
				711	"vrev64.16 q2, q2 \n"// (1) reverse 4 frames of both left (d4) and right (d5) positive
				712
				713	"vmlal.s16 q0, d4, d16 \n"// (1) multiply (reversed) samples left
				714	"vmlal.s16 q4, d5, d16 \n"// (1) multiply (reversed) samples right
				715	"vmlal.s16 q0, d6, d20 \n"// (1) multiply samples left
				716	"vmlal.s16 q4, d7, d20 \n"// (1) multiply samples right
				717
				718	// moving these ARM before neon seems to be slower
				719	"subs %[count], %[count], #4 \n"// (1) update loop counter
				720	"sub %[sP], %[sP], #16 \n"// (0) move pointer to next set of samples
				721
				722	// sP used after branch (warning)
				723	"bne 1b \n"// loop until count reaches 0
				724
				725	ASSEMBLY_ACCUMULATE_STEREO
				726
				727	: [out] "=Uv" (out[0]), // NOTE(review): declared write-only (=) and covers out[0] only, yet the epilogue loads and stores 64 bits here - presumably relies on the memory clobber; confirm
				728	[count] "+r" (count),
				729	[coefsP0] "+r" (coefsP),
				730	[coefsN0] "+r" (coefsN),
				731	[sP] "+r" (sP),
				732	[sN] "+r" (sN)
				733	: [vLR] "r" (volumeLR)
				734	: "cc", "memory",
				735	"q0", "q1", "q2", "q3",
				736	"q4", "q5", "q6",
				737	"q8", "q10"
				738	);
				739	}
				740
				// NEON specialization of Process for 1 channel, stride 8, 16-bit coefficients,
				// with linear interpolation between two coefficient sets.
				// Per loop pass, for 4 coefs on each side (all arithmetic in 16 bits):
				//   positive side: coef = coefsP  + vqrdmulh(coefsP1 - coefsP,  lerp)
				//   negative side: coef = coefsN1 + vqrdmulh(coefsN  - coefsN1, lerp)
				// where lerp is the low 16 bits of lerpP (16-bit lane 0 of d2).
				// The 4 samples at sP (reversed) and the 4 samples at sN are then multiply-accumulated
				// with these coefs into q0. sP steps back by 8 bytes per pass; sN and the four coef
				// pointers are post-incremented.
				// count drops by 4 per pass and the loop exits only when it reaches exactly 0; the body
				// always runs at least once, so callers presumably pass a positive multiple of 4.
				// ASSEMBLY_ACCUMULATE_MONO then reduces q0, applies volumeLR and accumulates into out.
				741	template <>
				742	inline void Process<1, 8>(int32_t* const out,
				743	int count,
				744	const int16_t* coefsP,
				745	const int16_t* coefsN,
				746	const int16_t* coefsP1,
				747	const int16_t* coefsN1,
				748	const int16_t* sP,
				749	const int16_t* sN,
				750	uint32_t lerpP,
				751	const int32_t* const volumeLR)
				752	{
				753	const int CHANNELS = 1; // template specialization does not preserve params
				754	const int STRIDE = 8;
				755	sP -= CHANNELS*((STRIDE>>1)-1); // back up 3 samples so the first 4-sample load ends at the original sP
				756	asm (
				757	"vmov.32 d2[0], %[lerpP] \n"// load the positive phase S32 Q15
				758	"veor q0, q0, q0 \n"// (0 - combines+) accumulator = 0
				759
				760	"1: \n"
				761
				762	"vld1.16 {d4}, [%[sP]] \n"// (2+0d) load 4 16-bits mono samples
				763	"vld1.16 {d6}, [%[sN]]! \n"// (2) load 4 16-bits mono samples
				764	"vld1.16 {d16}, [%[coefsP0]:64]! \n"// (1) load 4 16-bits coefs
				765	"vld1.16 {d17}, [%[coefsP1]:64]! \n"// (1) load 4 16-bits coefs for interpolation
				766	"vld1.16 {d20}, [%[coefsN1]:64]! \n"// (1) load 4 16-bits coefs
				767	"vld1.16 {d21}, [%[coefsN0]:64]! \n"// (1) load 4 16-bits coefs for interpolation
				768
				769	"vsub.s16 d17, d17, d16 \n"// (1) interpolate (step1) 1st set of coefs
				770	"vsub.s16 d21, d21, d20 \n"// (1) interpolate (step1) 2nd set of coefs
				771
				772	"vqrdmulh.s16 d17, d17, d2[0] \n"// (2) interpolate (step2) 1st set of coefs
				773	"vqrdmulh.s16 d21, d21, d2[0] \n"// (2) interpolate (step2) 2nd set of coefs
				774
				775	"vrev64.16 d4, d4 \n"// (1) reverse s3, s2, s1, s0
				776
				777	"vadd.s16 d16, d16, d17 \n"// (1+2d) interpolate (step3) 1st set
				778	"vadd.s16 d20, d20, d21 \n"// (1+1d) interpolate (step3) 2nd set
				779
				780	// carried over from the stride-16 kernel (this kernel only uses d4 and d6):
				781	"vmlal.s16 q0, d4, d16 \n"// (1+0d) multiply (reversed)by coef
				782	"vmlal.s16 q0, d6, d20 \n"// (1) multiply neg samples
				783
				784	// moving these ARM instructions before neon above seems to be slower
				785	"subs %[count], %[count], #4 \n"// (1) update loop counter
				786	"sub %[sP], %[sP], #8 \n"// move pointer to next set of samples
				787
				788	// sP used after branch (warning)
				789	"bne 1b \n"// loop until count reaches 0
				790
				791	ASSEMBLY_ACCUMULATE_MONO
				792
				793	: [out] "=Uv" (out[0]), // NOTE(review): declared write-only (=) and covers out[0] only, yet the epilogue loads and stores 64 bits here - presumably relies on the memory clobber; confirm
				794	[count] "+r" (count),
				795	[coefsP0] "+r" (coefsP),
				796	[coefsN0] "+r" (coefsN),
				797	[coefsP1] "+r" (coefsP1),
				798	[coefsN1] "+r" (coefsN1),
				799	[sP] "+r" (sP),
				800	[sN] "+r" (sN)
				801	: [lerpP] "r" (lerpP),
				802	[vLR] "r" (volumeLR)
				803	: "cc", "memory",
				804	"q0", "q1", "q2", "q3",
				805	"q8", "q9", "q10", "q11"
				806	);
				807	}
				808
				809	template <>
					// NEON Process() specialization: stereo (CHANNELS == 2), STRIDE == 8, with
					// 16-bit coefficients that are linearly interpolated between two sets.
					//
					// Every loop pass takes 4 stereo frames from the positive side (walking sP
					// downwards) and 4 from the negative side (walking sN upwards), blends the
					// matching coefficient pairs by lerpP, and multiply-accumulates into
					// q0 (left) and q4 (right). ASSEMBLY_ACCUMULATE_STEREO (defined earlier in
					// this file; per its comment it saves the q0/q4 accumulators as stereo out)
					// finishes the job using %[vLR] and %[out].
					//
					// count   - coefficients per side; it drops by 4 per pass and the loop only
					//           exits when it reaches exactly zero, so it must be a positive
					//           multiple of 4.
					// coefsP, coefsP1, coefsN, coefsN1 - coefficient sets; each is read with a
					//           :64 alignment qualifier, so each must be 8-byte aligned.
					// sP, sN  - interleaved stereo samples for the positive / negative side.
					// lerpP   - interpolation factor; only the low 16 bits are used here
					//           because the multiply is a .s16 by-scalar on d2[0].
				810	inline void Process<2, 8>(int32_t* const out,
				811	int count,
				812	const int16_t* coefsP,
				813	const int16_t* coefsN,
				814	const int16_t* coefsP1,
				815	const int16_t* coefsN1,
				816	const int16_t* sP,
				817	const int16_t* sN,
				818	uint32_t lerpP,
				819	const int32_t* const volumeLR)
				820	{
				821	const int CHANNELS = 2; // template specialization does not preserve params
				822	const int STRIDE = 8;
					// Back up 3 frames so the first 4-frame load ends on the original sP frame.
				823	sP -= CHANNELS*((STRIDE>>1)-1);
				824	asm (
				825	"vmov.32 d2[0], %[lerpP] \n"// load the positive phase (lane 0 of d2 only)
				826	"veor q0, q0, q0 \n"// (1) acc_L = 0
				827	"veor q4, q4, q4 \n"// (0 combines+) acc_R = 0
				828
				829	"1: \n"
				830
				831	"vld2.16 {d4, d5}, [%[sP]] \n"// (3+0d) load 4 stereo frames, de-interleaved: d4 = left, d5 = right
				832	"vld2.16 {d6, d7}, [%[sN]]! \n"// (3) load 4 stereo frames, de-interleaved: d6 = left, d7 = right
				833	"vld1.16 {d16}, [%[coefsP0]:64]! \n"// (1) load 4 16-bits coefs
				834	"vld1.16 {d17}, [%[coefsP1]:64]! \n"// (1) load 4 16-bits coefs for interpolation
					// Note the negative side starts from coefsN1 and interpolates toward coefsN0.
				835	"vld1.16 {d20}, [%[coefsN1]:64]! \n"// (1) load 4 16-bits coefs
				836	"vld1.16 {d21}, [%[coefsN0]:64]! \n"// (1) load 4 16-bits coefs for interpolation
				837
				838	"vsub.s16 d17, d17, d16 \n"// (1) interpolate (step1) 1st set of coefs
				839	"vsub.s16 d21, d21, d20 \n"// (1) interpolate (step1) 2nd set of coefs
				840
				841	"vqrdmulh.s16 d17, d17, d2[0] \n"// (2) interpolate (step2) 1st set of coefs
				842	"vqrdmulh.s16 d21, d21, d2[0] \n"// (2) interpolate (step2) 2nd set of coefs
				843
				844	"vrev64.16 q2, q2 \n"// (1) reverse the 4 positive-side samples in d4 (left) and d5 (right)
				845
				846	"vadd.s16 d16, d16, d17 \n"// (1+1d) interpolate (step3) 1st set
				847	"vadd.s16 d20, d20, d21 \n"// (1+1d) interpolate (step3) 2nd set
				848
				849	"vmlal.s16 q0, d4, d16 \n"// (1) multiply (reversed) samples left
				850	"vmlal.s16 q4, d5, d16 \n"// (1) multiply (reversed) samples right
				851	"vmlal.s16 q0, d6, d20 \n"// (1) multiply samples left
				852	"vmlal.s16 q4, d7, d20 \n"// (1) multiply samples right
				853
				854	// moving these ARM before neon seems to be slower
				855	"subs %[count], %[count], #4 \n"// (1) update loop counter
				856	"sub %[sP], %[sP], #16 \n"// move pointer back 4 stereo frames (16 bytes)
				857
				858	// sP used after branch (warning)
				859	"bne 1b \n"// loop
				860
				861	ASSEMBLY_ACCUMULATE_STEREO
				862
				863	: [out] "=Uv" (out[0]),
				864	[count] "+r" (count),
				865	[coefsP0] "+r" (coefsP),
				866	[coefsN0] "+r" (coefsN),
				867	[coefsP1] "+r" (coefsP1),
				868	[coefsN1] "+r" (coefsN1),
				869	[sP] "+r" (sP),
				870	[sN] "+r" (sN)
				871	: [lerpP] "r" (lerpP),
				872	[vLR] "r" (volumeLR)
				873	: "cc", "memory",
				874	"q0", "q1", "q2", "q3",
				875	"q4", "q5", "q6",
				876	"q8", "q9", "q10", "q11"
				877	);
				878	}
				879
				880	template <>
					// NEON ProcessL() specialization: mono (CHANNELS == 1), STRIDE == 8, with
					// 32-bit coefficients used as-is (this variant takes no lerpP and no second
					// coefficient set, so nothing is interpolated).
					//
					// Every loop pass takes 4 samples from the positive side (walking sP
					// downwards) and 4 from the negative side (walking sN upwards), widens them
					// to 32 bits, multiplies by the coefficients and adds the products into q0.
					// ASSEMBLY_ACCUMULATE_MONO (defined at the top of this file) then sums the
					// partial sums, applies the volume loaded from [%[vLR]:64] and accumulates
					// into %[out].
					//
					// count    - coefficients per side; it drops by 4 per pass and the loop only
					//            exits when it reaches exactly zero, so it must be a positive
					//            multiple of 4.
					// coefsP, coefsN - read with a :128 alignment qualifier, so each must be
					//            16-byte aligned.
					// volumeLR - read by the macro with a :64 qualifier, so must be 8-byte aligned.
				881	inline void ProcessL<1, 8>(int32_t* const out,
				882	int count,
				883	const int32_t* coefsP,
				884	const int32_t* coefsN,
				885	const int16_t* sP,
				886	const int16_t* sN,
				887	const int32_t* const volumeLR)
				888	{
				889	const int CHANNELS = 1; // template specialization does not preserve params
				890	const int STRIDE = 8;
					// Back up 3 samples so the first 4-sample load ends on the original sP sample.
				891	sP -= CHANNELS*((STRIDE>>1)-1);
				892	asm (
				893	"veor q0, q0, q0 \n"// result, initialize to 0
				894
				895	"1: \n"
				896
				897	"vld1.16 {d4}, [%[sP]] \n"// load 4 16-bits mono samples (positive side)
				898	"vld1.16 {d6}, [%[sN]]! \n"// load 4 16-bits mono samples (negative side)
				899	"vld1.32 {q8}, [%[coefsP0]:128]! \n"// load 4 32-bits coefs
				900	"vld1.32 {q10}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs
				901
				902	"vrev64.16 d4, d4 \n"// reverse the 4 positive-side samples
				903
				904	"vshll.s16 q12, d4, #15 \n"// (stall) extend samples to 31 bits
				905	"vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits
				906
				907	"vqrdmulh.s32 q12, q12, q8 \n"// multiply samples by coef
				908	"vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by coef
				909
				910	"vadd.s32 q0, q0, q12 \n"// accumulate result
				911	"vadd.s32 q0, q0, q14 \n"// (stall) accumulate result
				912
				913	"subs %[count], %[count], #4 \n"// update loop counter
				914	"sub %[sP], %[sP], #8 \n"// move pointer back 4 mono samples (8 bytes)
				915
				916	"bne 1b \n"// loop
				917
				918	ASSEMBLY_ACCUMULATE_MONO
				919
					// NOTE(review): ASSEMBLY_ACCUMULATE_MONO loads %[out] before updating it, yet
					// the operand is declared write-only ("=Uv"); the "memory" clobber is
					// presumably what keeps this safe - confirm.
				920	: [out] "=Uv" (out[0]),
				921	[count] "+r" (count),
				922	[coefsP0] "+r" (coefsP),
				923	[coefsN0] "+r" (coefsN),
				924	[sP] "+r" (sP),
				925	[sN] "+r" (sN)
				926	: [vLR] "r" (volumeLR)
				927	: "cc", "memory",
				928	"q0", "q1", "q2", "q3",
				929	"q8", "q9", "q10", "q11",
				930	"q12", "q14"
				931	);
				932	}
				933
				934	template <>
					// NEON ProcessL() specialization: stereo (CHANNELS == 2), STRIDE == 8, with
					// 32-bit coefficients used as-is (this variant takes no lerpP and no second
					// coefficient set, so nothing is interpolated).
					//
					// Every loop pass takes 4 stereo frames from the positive side (walking sP
					// downwards) and 4 from the negative side (walking sN upwards), widens the
					// samples to 32 bits, multiplies by the coefficients and adds the products
					// into q0 (first channel) and q4 (second channel).
					// ASSEMBLY_ACCUMULATE_STEREO (defined earlier in this file; per its comment
					// it saves the q0/q4 accumulators as stereo out) finishes the job using
					// %[vLR] and %[out].
					//
					// count  - coefficients per side; it drops by 4 per pass and the loop only
					//          exits when it reaches exactly zero, so it must be a positive
					//          multiple of 4.
					// coefsP, coefsN - read with a :128 alignment qualifier, so each must be
					//          16-byte aligned.
				935	inline void ProcessL<2, 8>(int32_t* const out,
				936	int count,
				937	const int32_t* coefsP,
				938	const int32_t* coefsN,
				939	const int16_t* sP,
				940	const int16_t* sN,
				941	const int32_t* const volumeLR)
				942	{
				943	const int CHANNELS = 2; // template specialization does not preserve params
				944	const int STRIDE = 8;
					// Back up 3 frames so the first 4-frame load ends on the original sP frame.
				945	sP -= CHANNELS*((STRIDE>>1)-1);
				946	asm (
				947	"veor q0, q0, q0 \n"// result (first channel), initialize to 0
				948	"veor q4, q4, q4 \n"// result (second channel), initialize to 0
				949
				950	"1: \n"
				951
				952	"vld2.16 {d4, d5}, [%[sP]] \n"// load 4 stereo frames, de-interleaved into d4 / d5
				953	"vld2.16 {d6, d7}, [%[sN]]! \n"// load 4 stereo frames, de-interleaved into d6 / d7
				954	"vld1.32 {q8}, [%[coefsP0]:128]! \n"// load 4 32-bits coefs
				955	"vld1.32 {q10}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs
				956
				957	"vrev64.16 q2, q2 \n"// reverse the 4 positive-side samples in each of d4 and d5
				958
				959	"vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
				960	"vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits
				961
				962	"vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits
				963	"vshll.s16 q15, d7, #15 \n"// extend samples to 31 bits
				964
				965	"vqrdmulh.s32 q12, q12, q8 \n"// multiply samples by coef
				966	"vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by coef
				967	"vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by coef
				968	"vqrdmulh.s32 q15, q15, q10 \n"// multiply samples by coef
				969
				970	"vadd.s32 q0, q0, q12 \n"// accumulate result
				971	"vadd.s32 q4, q4, q13 \n"// accumulate result
				972	"vadd.s32 q0, q0, q14 \n"// accumulate result
				973	"vadd.s32 q4, q4, q15 \n"// accumulate result
				974
				975	"subs %[count], %[count], #4 \n"// update loop counter
				976	"sub %[sP], %[sP], #16 \n"// move pointer back 4 stereo frames (16 bytes)
				977
				978	"bne 1b \n"// loop
				979
				980	ASSEMBLY_ACCUMULATE_STEREO
				981
				982	: [out] "=Uv" (out[0]),
				983	[count] "+r" (count),
				984	[coefsP0] "+r" (coefsP),
				985	[coefsN0] "+r" (coefsN),
				986	[sP] "+r" (sP),
				987	[sN] "+r" (sN)
				988	: [vLR] "r" (volumeLR)
				989	: "cc", "memory",
				990	"q0", "q1", "q2", "q3", "q4",
				991	"q8", "q9", "q10", "q11",
				992	"q12", "q13", "q14", "q15"
				993	);
				994	}
				995
				996	template <>
					// NEON Process() specialization: mono (CHANNELS == 1), STRIDE == 8, with
					// 32-bit coefficients that are linearly interpolated between two sets.
					//
					// Every loop pass takes 4 samples from the positive side (walking sP
					// downwards) and 4 from the negative side (walking sN upwards), blends the
					// matching coefficient pairs by lerpP, widens the samples to 32 bits,
					// multiplies and adds the products into q0. ASSEMBLY_ACCUMULATE_MONO
					// (defined at the top of this file) then sums the partial sums, applies the
					// volume loaded from [%[vLR]:64] and accumulates into %[out].
					//
					// count    - coefficients per side; it drops by 4 per pass and the loop only
					//            exits when it reaches exactly zero, so it must be a positive
					//            multiple of 4.
					// coefsP, coefsP1, coefsN, coefsN1 - coefficient sets; each is read with a
					//            :128 alignment qualifier, so each must be 16-byte aligned.
					// lerpP    - interpolation factor, used as the 32-bit scalar d2[0].
					// volumeLR - read by the macro with a :64 qualifier, so must be 8-byte aligned.
				997	inline void Process<1, 8>(int32_t* const out,
				998	int count,
				999	const int32_t* coefsP,
				1000	const int32_t* coefsN,
				1001	const int32_t* coefsP1,
				1002	const int32_t* coefsN1,
				1003	const int16_t* sP,
				1004	const int16_t* sN,
				1005	uint32_t lerpP,
				1006	const int32_t* const volumeLR)
				1007	{
				1008	const int CHANNELS = 1; // template specialization does not preserve params
				1009	const int STRIDE = 8;
					// Back up 3 samples so the first 4-sample load ends on the original sP sample.
				1010	sP -= CHANNELS*((STRIDE>>1)-1);
				1011	asm (
				1012	"vmov.32 d2[0], %[lerpP] \n"// load the positive phase (lane 0 of d2 only)
				1013	"veor q0, q0, q0 \n"// result, initialize to 0
				1014
				1015	"1: \n"
				1016
				1017	"vld1.16 {d4}, [%[sP]] \n"// load 4 16-bits mono samples (positive side)
				1018	"vld1.16 {d6}, [%[sN]]! \n"// load 4 16-bits mono samples (negative side)
				1019	"vld1.32 {q8}, [%[coefsP0]:128]! \n"// load 4 32-bits coefs
				1020	"vld1.32 {q9}, [%[coefsP1]:128]! \n"// load 4 32-bits coefs for interpolation
					// Note the negative side starts from coefsN1 and interpolates toward coefsN0.
				1021	"vld1.32 {q10}, [%[coefsN1]:128]! \n"// load 4 32-bits coefs
				1022	"vld1.32 {q11}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs for interpolation
				1023
				1024	"vrev64.16 d4, d4 \n"// reverse the 4 positive-side samples
				1025
				1026	"vsub.s32 q9, q9, q8 \n"// interpolate (step1) 1st set of coefs
				1027	"vsub.s32 q11, q11, q10 \n"// interpolate (step1) 2nd set of coefs
				1028	"vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
				1029
				1030	"vqrdmulh.s32 q9, q9, d2[0] \n"// interpolate (step2) 1st set of coefs
				1031	"vqrdmulh.s32 q11, q11, d2[0] \n"// interpolate (step2) 2nd set of coefs
				1032	"vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits
				1033
				1034	"vadd.s32 q8, q8, q9 \n"// interpolate (step3) 1st set
				1035	"vadd.s32 q10, q10, q11 \n"// interpolate (step3) 2nd set
				1036
				1037	"vqrdmulh.s32 q12, q12, q8 \n"// multiply samples by interpolated coef
				1038	"vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
				1039
				1040	"vadd.s32 q0, q0, q12 \n"// accumulate result
				1041	"vadd.s32 q0, q0, q14 \n"// accumulate result
				1042
				1043	"subs %[count], %[count], #4 \n"// update loop counter
				1044	"sub %[sP], %[sP], #8 \n"// move pointer back 4 mono samples (8 bytes)
				1045
				1046	"bne 1b \n"// loop
				1047
				1048	ASSEMBLY_ACCUMULATE_MONO
				1049
					// NOTE(review): ASSEMBLY_ACCUMULATE_MONO loads %[out] before updating it, yet
					// the operand is declared write-only ("=Uv"); the "memory" clobber is
					// presumably what keeps this safe - confirm.
				1050	: [out] "=Uv" (out[0]),
				1051	[count] "+r" (count),
				1052	[coefsP0] "+r" (coefsP),
				1053	[coefsP1] "+r" (coefsP1),
				1054	[coefsN0] "+r" (coefsN),
				1055	[coefsN1] "+r" (coefsN1),
				1056	[sP] "+r" (sP),
				1057	[sN] "+r" (sN)
				1058	: [lerpP] "r" (lerpP),
				1059	[vLR] "r" (volumeLR)
				1060	: "cc", "memory",
				1061	"q0", "q1", "q2", "q3",
				1062	"q8", "q9", "q10", "q11",
				1063	"q12", "q14"
				1064	);
				1065	}
				1066
				1067	template <>
					// NEON Process() specialization: stereo (CHANNELS == 2), STRIDE == 8, with
					// 32-bit coefficients that are linearly interpolated between two sets.
					//
					// Every loop pass takes 4 stereo frames from the positive side (walking sP
					// downwards) and 4 from the negative side (walking sN upwards), blends the
					// matching coefficient pairs by lerpP, widens the samples to 32 bits,
					// multiplies and adds the products into q0 (first channel) and q4 (second
					// channel). ASSEMBLY_ACCUMULATE_STEREO (defined earlier in this file; per
					// its comment it saves the q0/q4 accumulators as stereo out) finishes the
					// job using %[vLR] and %[out].
					//
					// count  - coefficients per side; it drops by 4 per pass and the loop only
					//          exits when it reaches exactly zero, so it must be a positive
					//          multiple of 4.
					// coefsP, coefsP1, coefsN, coefsN1 - coefficient sets; each is read with a
					//          :128 alignment qualifier, so each must be 16-byte aligned.
					// lerpP  - interpolation factor, used as the 32-bit scalar d2[0] for BOTH
					//          coefficient sets.
				1068	inline
				1069	void Process<2, 8>(int32_t* const out,
				1070	int count,
				1071	const int32_t* coefsP,
				1072	const int32_t* coefsN,
				1073	const int32_t* coefsP1,
				1074	const int32_t* coefsN1,
				1075	const int16_t* sP,
				1076	const int16_t* sN,
				1077	uint32_t lerpP,
				1078	const int32_t* const volumeLR)
				1079	{
				1080	const int CHANNELS = 2; // template specialization does not preserve params
				1081	const int STRIDE = 8;
					// Back up 3 frames so the first 4-frame load ends on the original sP frame.
				1082	sP -= CHANNELS*((STRIDE>>1)-1);
				1083	asm (
				1084	"vmov.32 d2[0], %[lerpP] \n"// load the positive phase (lane 0 of d2 only)
				1085	"veor q0, q0, q0 \n"// result (first channel), initialize to 0
				1086	"veor q4, q4, q4 \n"// result (second channel), initialize to 0
				1087
				1088	"1: \n"
				1089	"vld2.16 {d4, d5}, [%[sP]] \n"// load 4 stereo frames, de-interleaved into d4 / d5
				1090	"vld2.16 {d6, d7}, [%[sN]]! \n"// load 4 stereo frames, de-interleaved into d6 / d7
				1091	"vld1.32 {q8}, [%[coefsP0]:128]! \n"// load 4 32-bits coefs
				1092	"vld1.32 {q9}, [%[coefsP1]:128]! \n"// load 4 32-bits coefs for interpolation
					// Note the negative side starts from coefsN1 and interpolates toward coefsN0.
				1093	"vld1.32 {q10}, [%[coefsN1]:128]! \n"// load 4 32-bits coefs
				1094	"vld1.32 {q11}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs for interpolation
				1095
				1096	"vrev64.16 q2, q2 \n"// reverse the 4 positive-side samples in each of d4 and d5
				1097
				1098	"vsub.s32 q9, q9, q8 \n"// interpolate (step1) 1st set of coefs
				1099	"vsub.s32 q11, q11, q10 \n"// interpolate (step1) 2nd set of coefs
				1100	"vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
				1101	"vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits
				1102
					// Both sets must be scaled by d2[0]: only lane 0 was loaded with lerpP by
					// the vmov.32 above, and d2[1] is never written in this asm block.
				1103	"vqrdmulh.s32 q9, q9, d2[0] \n"// interpolate (step2) 1st set of coefs
				1104	"vqrdmulh.s32 q11, q11, d2[0] \n"// interpolate (step2) 2nd set of coefs
				1105	"vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits
				1106	"vshll.s16 q15, d7, #15 \n"// extend samples to 31 bits
				1107
				1108	"vadd.s32 q8, q8, q9 \n"// interpolate (step3) 1st set
				1109	"vadd.s32 q10, q10, q11 \n"// interpolate (step3) 2nd set
				1110
				1111	"vqrdmulh.s32 q12, q12, q8 \n"// multiply samples by interpolated coef
				1112	"vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef
				1113	"vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
				1114	"vqrdmulh.s32 q15, q15, q10 \n"// multiply samples by interpolated coef
				1115
				1116	"vadd.s32 q0, q0, q12 \n"// accumulate result
				1117	"vadd.s32 q4, q4, q13 \n"// accumulate result
				1118	"vadd.s32 q0, q0, q14 \n"// accumulate result
				1119	"vadd.s32 q4, q4, q15 \n"// accumulate result
				1120
				1121	"subs %[count], %[count], #4 \n"// update loop counter
				1122	"sub %[sP], %[sP], #16 \n"// move pointer back 4 stereo frames (16 bytes)
				1123
				1124	"bne 1b \n"// loop
				1125
				1126	ASSEMBLY_ACCUMULATE_STEREO
				1127
				1128	: [out] "=Uv" (out[0]),
				1129	[count] "+r" (count),
				1130	[coefsP0] "+r" (coefsP),
				1131	[coefsP1] "+r" (coefsP1),
				1132	[coefsN0] "+r" (coefsN),
				1133	[coefsN1] "+r" (coefsN1),
				1134	[sP] "+r" (sP),
				1135	[sN] "+r" (sN)
				1136	: [lerpP] "r" (lerpP),
				1137	[vLR] "r" (volumeLR)
				1138	: "cc", "memory",
				1139	"q0", "q1", "q2", "q3", "q4",
				1140	"q8", "q9", "q10", "q11",
				1141	"q12", "q13", "q14", "q15"
				1142	);
				1143	}
				1144
				1145	#endif //USE_NEON
				1146
				1147	}; // namespace android
				1148
				1149	#endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H*/