Blame - media/libaudioprocessing/AudioResamplerFirProcessNeon.h - android_frameworks_av

blob: c335050d7850a79b3ecf625b093617cb221a8849 [file] [log] [blame]

Andy Hung	86eae0e	2013-12-09 12:12:46 -0800	[diff] [blame]	1	/*
				2	* Copyright (C) 2013 The Android Open Source Project
				3	*
				4	* Licensed under the Apache License, Version 2.0 (the "License");
				5	* you may not use this file except in compliance with the License.
				6	* You may obtain a copy of the License at
				7	*
				8	* http://www.apache.org/licenses/LICENSE-2.0
				9	*
				10	* Unless required by applicable law or agreed to in writing, software
				11	* distributed under the License is distributed on an "AS IS" BASIS,
				12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	* See the License for the specific language governing permissions and
				14	* limitations under the License.
				15	*/
				16
				17	#ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H
				18	#define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H
				19
				20	namespace android {
				21
				22	// depends on AudioResamplerFirOps.h, AudioResamplerFirProcess.h
				23
				24	#if USE_NEON
Andy Hung	6b667dd	2015-02-06 15:05:37 -0800	[diff] [blame]	25
				26	// use intrinsics if inline arm32 assembly is not possible
				27	#if !USE_INLINE_ASSEMBLY
				28	#define USE_INTRINSIC
				29	#endif
				30
				31	// following intrinsics available only on ARM 64 bit ACLE
				32	#ifndef __aarch64__
				33	#undef vld1q_f32_x2
				34	#undef vld1q_s32_x2
				35	#endif
				36
				37	#define TO_STRING2(x) #x
				38	#define TO_STRING(x) TO_STRING2(x)
				39	// uncomment to print GCC version, may be relevant for intrinsic optimizations
				40	/* #pragma message ("GCC version: " TO_STRING(__GNUC__) \
				41	"." TO_STRING(__GNUC_MINOR__) \
				42	"." TO_STRING(__GNUC_PATCHLEVEL__)) */
				43
Andy Hung	86eae0e	2013-12-09 12:12:46 -0800	[diff] [blame]	44	//
Andy Hung	6b667dd	2015-02-06 15:05:37 -0800	[diff] [blame]	45	// NEON specializations are enabled for Process() and ProcessL() in AudioResamplerFirProcess.h
				46	//
				47	// Two variants are presented here:
				48	// ARM NEON inline assembly which appears up to 10-15% faster than intrinsics (gcc 4.9) for arm32.
				49	// ARM NEON intrinsics which can also be used by arm64 and x86/64 with NEON header.
				50	//
Andy Hung	86eae0e	2013-12-09 12:12:46 -0800	[diff] [blame]	51
				52	// Macros to save a mono/stereo accumulator sample in q0 (and q4) as stereo out.
Andy Hung	6b667dd	2015-02-06 15:05:37 -0800	[diff] [blame]	53	// These are only used for inline assembly.
Andy Hung	86eae0e	2013-12-09 12:12:46 -0800	[diff] [blame]	54	#define ASSEMBLY_ACCUMULATE_MONO \
				55	"vld1.s32 {d2}, [%[vLR]:64] \n"/* (1) load volumes */\
				56	"vld1.s32 {d3}, %[out] \n"/* (2) unaligned load the output */\
				57	"vpadd.s32 d0, d0, d1 \n"/* (1) add all 4 partial sums */\
				58	"vpadd.s32 d0, d0, d0 \n"/* (1+4d) and replicate L/R */\
				59	"vqrdmulh.s32 d0, d0, d2 \n"/* (2+3d) apply volume */\
				60	"vqadd.s32 d3, d3, d0 \n"/* (1+4d) accumulate result (saturating) */\
				61	"vst1.s32 {d3}, %[out] \n"/* (2+2d) store result */
				62
				63	#define ASSEMBLY_ACCUMULATE_STEREO \
				64	"vld1.s32 {d2}, [%[vLR]:64] \n"/* (1) load volumes*/\
				65	"vld1.s32 {d3}, %[out] \n"/* (2) unaligned load the output*/\
				66	"vpadd.s32 d0, d0, d1 \n"/* (1) add all 4 partial sums from q0*/\
				67	"vpadd.s32 d8, d8, d9 \n"/* (1) add all 4 partial sums from q4*/\
				68	"vpadd.s32 d0, d0, d8 \n"/* (1+4d) combine into L/R*/\
				69	"vqrdmulh.s32 d0, d0, d2 \n"/* (2+3d) apply volume*/\
				70	"vqadd.s32 d3, d3, d0 \n"/* (1+4d) accumulate result (saturating)*/\
				71	"vst1.s32 {d3}, %[out] \n"/* (2+2d)store result*/
				72
Andy Hung	6b667dd	2015-02-06 15:05:37 -0800	[diff] [blame]	73	template <int CHANNELS, int STRIDE, bool FIXED>
				74	static inline void ProcessNeonIntrinsic(int32_t* out,
				75	int count,
				76	const int16_t* coefsP,
				77	const int16_t* coefsN,
				78	const int16_t* sP,
				79	const int16_t* sN,
				80	const int32_t* volumeLR,
				81	uint32_t lerpP,
				82	const int16_t* coefsP1,
				83	const int16_t* coefsN1)
				84	{
				85	ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
Glenn Kasten	91164e7	2016-03-15 15:55:01 -0700	[diff] [blame]	86	static_assert(CHANNELS == 1 \|\| CHANNELS == 2, "CHANNELS must be 1 or 2");
Andy Hung	6b667dd	2015-02-06 15:05:37 -0800	[diff] [blame]	87
				88	sP -= CHANNELS*((STRIDE>>1)-1);
				89	coefsP = (const int16_t*)__builtin_assume_aligned(coefsP, 16);
				90	coefsN = (const int16_t*)__builtin_assume_aligned(coefsN, 16);
				91
				92	int16x4_t interp;
				93	if (!FIXED) {
				94	interp = vdup_n_s16(lerpP);
				95	//interp = (int16x4_t)vset_lane_s32 ((int32x2_t)lerpP, interp, 0);
				96	coefsP1 = (const int16_t*)__builtin_assume_aligned(coefsP1, 16);
				97	coefsN1 = (const int16_t*)__builtin_assume_aligned(coefsN1, 16);
				98	}
				99	int32x4_t accum, accum2;
				100	// warning uninitialized if we use veorq_s32
				101	// (alternative to below) accum = veorq_s32(accum, accum);
				102	accum = vdupq_n_s32(0);
				103	if (CHANNELS == 2) {
				104	// (alternative to below) accum2 = veorq_s32(accum2, accum2);
				105	accum2 = vdupq_n_s32(0);
				106	}
				107	do {
				108	int16x8_t posCoef = vld1q_s16(coefsP);
				109	coefsP += 8;
				110	int16x8_t negCoef = vld1q_s16(coefsN);
				111	coefsN += 8;
				112	if (!FIXED) { // interpolate
				113	int16x8_t posCoef1 = vld1q_s16(coefsP1);
				114	coefsP1 += 8;
				115	int16x8_t negCoef1 = vld1q_s16(coefsN1);
				116	coefsN1 += 8;
				117
				118	posCoef1 = vsubq_s16(posCoef1, posCoef);
				119	negCoef = vsubq_s16(negCoef, negCoef1);
				120
				121	posCoef1 = vqrdmulhq_lane_s16(posCoef1, interp, 0);
				122	negCoef = vqrdmulhq_lane_s16(negCoef, interp, 0);
				123
				124	posCoef = vaddq_s16(posCoef, posCoef1);
				125	negCoef = vaddq_s16(negCoef, negCoef1);
				126	}
				127	switch (CHANNELS) {
				128	case 1: {
				129	int16x8_t posSamp = vld1q_s16(sP);
				130	int16x8_t negSamp = vld1q_s16(sN);
				131	sN += 8;
				132	posSamp = vrev64q_s16(posSamp);
				133
				134	// dot product
				135	accum = vmlal_s16(accum, vget_low_s16(posSamp), vget_high_s16(posCoef)); // reversed
				136	accum = vmlal_s16(accum, vget_high_s16(posSamp), vget_low_s16(posCoef)); // reversed
				137	accum = vmlal_s16(accum, vget_low_s16(negSamp), vget_low_s16(negCoef));
				138	accum = vmlal_s16(accum, vget_high_s16(negSamp), vget_high_s16(negCoef));
				139	sP -= 8;
				140	} break;
				141	case 2: {
				142	int16x8x2_t posSamp = vld2q_s16(sP);
				143	int16x8x2_t negSamp = vld2q_s16(sN);
				144	sN += 16;
				145	posSamp.val[0] = vrev64q_s16(posSamp.val[0]);
				146	posSamp.val[1] = vrev64q_s16(posSamp.val[1]);
				147
				148	// dot product
				149	accum = vmlal_s16(accum, vget_low_s16(posSamp.val[0]), vget_high_s16(posCoef)); // r
				150	accum = vmlal_s16(accum, vget_high_s16(posSamp.val[0]), vget_low_s16(posCoef)); // r
				151	accum2 = vmlal_s16(accum2, vget_low_s16(posSamp.val[1]), vget_high_s16(posCoef)); // r
				152	accum2 = vmlal_s16(accum2, vget_high_s16(posSamp.val[1]), vget_low_s16(posCoef)); // r
				153	accum = vmlal_s16(accum, vget_low_s16(negSamp.val[0]), vget_low_s16(negCoef));
				154	accum = vmlal_s16(accum, vget_high_s16(negSamp.val[0]), vget_high_s16(negCoef));
				155	accum2 = vmlal_s16(accum2, vget_low_s16(negSamp.val[1]), vget_low_s16(negCoef));
				156	accum2 = vmlal_s16(accum2, vget_high_s16(negSamp.val[1]), vget_high_s16(negCoef));
				157	sP -= 16;
Andy Hung	6b667dd	2015-02-06 15:05:37 -0800	[diff] [blame]	158	} break;
Andy Hung	51157ba	2017-01-05 16:59:20 -0800	[diff] [blame]	159	}
Andy Hung	6b667dd	2015-02-06 15:05:37 -0800	[diff] [blame]	160	} while (count -= 8);
				161
				162	// multiply by volume and save
				163	volumeLR = (const int32_t*)__builtin_assume_aligned(volumeLR, 8);
				164	int32x2_t vLR = vld1_s32(volumeLR);
				165	int32x2_t outSamp = vld1_s32(out);
				166	// combine and funnel down accumulator
				167	int32x2_t outAccum = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum));
				168	if (CHANNELS == 1) {
				169	// duplicate accum to both L and R
				170	outAccum = vpadd_s32(outAccum, outAccum);
				171	} else if (CHANNELS == 2) {
				172	// accum2 contains R, fold in
				173	int32x2_t outAccum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2));
				174	outAccum = vpadd_s32(outAccum, outAccum2);
				175	}
				176	outAccum = vqrdmulh_s32(outAccum, vLR);
				177	outSamp = vqadd_s32(outSamp, outAccum);
				178	vst1_s32(out, outSamp);
				179	}
				180
				181	template <int CHANNELS, int STRIDE, bool FIXED>
				182	static inline void ProcessNeonIntrinsic(int32_t* out,
				183	int count,
				184	const int32_t* coefsP,
				185	const int32_t* coefsN,
				186	const int16_t* sP,
				187	const int16_t* sN,
				188	const int32_t* volumeLR,
				189	uint32_t lerpP,
				190	const int32_t* coefsP1,
				191	const int32_t* coefsN1)
				192	{
				193	ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
Glenn Kasten	91164e7	2016-03-15 15:55:01 -0700	[diff] [blame]	194	static_assert(CHANNELS == 1 \|\| CHANNELS == 2, "CHANNELS must be 1 or 2");
Andy Hung	6b667dd	2015-02-06 15:05:37 -0800	[diff] [blame]	195
				196	sP -= CHANNELS*((STRIDE>>1)-1);
				197	coefsP = (const int32_t*)__builtin_assume_aligned(coefsP, 16);
				198	coefsN = (const int32_t*)__builtin_assume_aligned(coefsN, 16);
				199
				200	int32x2_t interp;
				201	if (!FIXED) {
				202	interp = vdup_n_s32(lerpP);
				203	coefsP1 = (const int32_t*)__builtin_assume_aligned(coefsP1, 16);
				204	coefsN1 = (const int32_t*)__builtin_assume_aligned(coefsN1, 16);
				205	}
				206	int32x4_t accum, accum2;
				207	// warning uninitialized if we use veorq_s32
				208	// (alternative to below) accum = veorq_s32(accum, accum);
				209	accum = vdupq_n_s32(0);
				210	if (CHANNELS == 2) {
				211	// (alternative to below) accum2 = veorq_s32(accum2, accum2);
				212	accum2 = vdupq_n_s32(0);
				213	}
				214	do {
				215	#ifdef vld1q_s32_x2
				216	int32x4x2_t posCoef = vld1q_s32_x2(coefsP);
				217	coefsP += 8;
				218	int32x4x2_t negCoef = vld1q_s32_x2(coefsN);
				219	coefsN += 8;
				220	#else
				221	int32x4x2_t posCoef;
				222	posCoef.val[0] = vld1q_s32(coefsP);
				223	coefsP += 4;
				224	posCoef.val[1] = vld1q_s32(coefsP);
				225	coefsP += 4;
				226	int32x4x2_t negCoef;
				227	negCoef.val[0] = vld1q_s32(coefsN);
				228	coefsN += 4;
				229	negCoef.val[1] = vld1q_s32(coefsN);
				230	coefsN += 4;
				231	#endif
				232	if (!FIXED) { // interpolate
				233	#ifdef vld1q_s32_x2
				234	int32x4x2_t posCoef1 = vld1q_s32_x2(coefsP1);
				235	coefsP1 += 8;
				236	int32x4x2_t negCoef1 = vld1q_s32_x2(coefsN1);
				237	coefsN1 += 8;
				238	#else
				239	int32x4x2_t posCoef1;
				240	posCoef1.val[0] = vld1q_s32(coefsP1);
				241	coefsP1 += 4;
				242	posCoef1.val[1] = vld1q_s32(coefsP1);
				243	coefsP1 += 4;
				244	int32x4x2_t negCoef1;
				245	negCoef1.val[0] = vld1q_s32(coefsN1);
				246	coefsN1 += 4;
				247	negCoef1.val[1] = vld1q_s32(coefsN1);
				248	coefsN1 += 4;
				249	#endif
				250
				251	posCoef1.val[0] = vsubq_s32(posCoef1.val[0], posCoef.val[0]);
				252	posCoef1.val[1] = vsubq_s32(posCoef1.val[1], posCoef.val[1]);
				253	negCoef.val[0] = vsubq_s32(negCoef.val[0], negCoef1.val[0]);
				254	negCoef.val[1] = vsubq_s32(negCoef.val[1], negCoef1.val[1]);
				255
				256	posCoef1.val[0] = vqrdmulhq_lane_s32(posCoef1.val[0], interp, 0);
				257	posCoef1.val[1] = vqrdmulhq_lane_s32(posCoef1.val[1], interp, 0);
				258	negCoef.val[0] = vqrdmulhq_lane_s32(negCoef.val[0], interp, 0);
				259	negCoef.val[1] = vqrdmulhq_lane_s32(negCoef.val[1], interp, 0);
				260
				261	posCoef.val[0] = vaddq_s32(posCoef.val[0], posCoef1.val[0]);
				262	posCoef.val[1] = vaddq_s32(posCoef.val[1], posCoef1.val[1]);
				263	negCoef.val[0] = vaddq_s32(negCoef.val[0], negCoef1.val[0]);
				264	negCoef.val[1] = vaddq_s32(negCoef.val[1], negCoef1.val[1]);
				265	}
				266	switch (CHANNELS) {
				267	case 1: {
				268	int16x8_t posSamp = vld1q_s16(sP);
				269	int16x8_t negSamp = vld1q_s16(sN);
				270	sN += 8;
				271	posSamp = vrev64q_s16(posSamp);
				272
				273	int32x4_t posSamp0 = vshll_n_s16(vget_low_s16(posSamp), 15);
				274	int32x4_t posSamp1 = vshll_n_s16(vget_high_s16(posSamp), 15);
				275	int32x4_t negSamp0 = vshll_n_s16(vget_low_s16(negSamp), 15);
				276	int32x4_t negSamp1 = vshll_n_s16(vget_high_s16(negSamp), 15);
				277
				278	// dot product
				279	posSamp0 = vqrdmulhq_s32(posSamp0, posCoef.val[1]); // reversed
				280	posSamp1 = vqrdmulhq_s32(posSamp1, posCoef.val[0]); // reversed
				281	negSamp0 = vqrdmulhq_s32(negSamp0, negCoef.val[0]);
				282	negSamp1 = vqrdmulhq_s32(negSamp1, negCoef.val[1]);
				283
				284	accum = vaddq_s32(accum, posSamp0);
				285	negSamp0 = vaddq_s32(negSamp0, negSamp1);
				286	accum = vaddq_s32(accum, posSamp1);
				287	accum = vaddq_s32(accum, negSamp0);
				288
				289	sP -= 8;
				290	} break;
				291	case 2: {
				292	int16x8x2_t posSamp = vld2q_s16(sP);
				293	int16x8x2_t negSamp = vld2q_s16(sN);
				294	sN += 16;
				295	posSamp.val[0] = vrev64q_s16(posSamp.val[0]);
				296	posSamp.val[1] = vrev64q_s16(posSamp.val[1]);
				297
				298	// left
				299	int32x4_t posSamp0 = vshll_n_s16(vget_low_s16(posSamp.val[0]), 15);
				300	int32x4_t posSamp1 = vshll_n_s16(vget_high_s16(posSamp.val[0]), 15);
				301	int32x4_t negSamp0 = vshll_n_s16(vget_low_s16(negSamp.val[0]), 15);
				302	int32x4_t negSamp1 = vshll_n_s16(vget_high_s16(negSamp.val[0]), 15);
				303
				304	// dot product
				305	posSamp0 = vqrdmulhq_s32(posSamp0, posCoef.val[1]); // reversed
				306	posSamp1 = vqrdmulhq_s32(posSamp1, posCoef.val[0]); // reversed
				307	negSamp0 = vqrdmulhq_s32(negSamp0, negCoef.val[0]);
				308	negSamp1 = vqrdmulhq_s32(negSamp1, negCoef.val[1]);
				309
				310	accum = vaddq_s32(accum, posSamp0);
				311	negSamp0 = vaddq_s32(negSamp0, negSamp1);
				312	accum = vaddq_s32(accum, posSamp1);
				313	accum = vaddq_s32(accum, negSamp0);
				314
				315	// right
				316	posSamp0 = vshll_n_s16(vget_low_s16(posSamp.val[1]), 15);
				317	posSamp1 = vshll_n_s16(vget_high_s16(posSamp.val[1]), 15);
				318	negSamp0 = vshll_n_s16(vget_low_s16(negSamp.val[1]), 15);
				319	negSamp1 = vshll_n_s16(vget_high_s16(negSamp.val[1]), 15);
				320
				321	// dot product
				322	posSamp0 = vqrdmulhq_s32(posSamp0, posCoef.val[1]); // reversed
				323	posSamp1 = vqrdmulhq_s32(posSamp1, posCoef.val[0]); // reversed
				324	negSamp0 = vqrdmulhq_s32(negSamp0, negCoef.val[0]);
				325	negSamp1 = vqrdmulhq_s32(negSamp1, negCoef.val[1]);
				326
				327	accum2 = vaddq_s32(accum2, posSamp0);
				328	negSamp0 = vaddq_s32(negSamp0, negSamp1);
				329	accum2 = vaddq_s32(accum2, posSamp1);
				330	accum2 = vaddq_s32(accum2, negSamp0);
				331
				332	sP -= 16;
				333	} break;
				334	}
				335	} while (count -= 8);
				336
				337	// multiply by volume and save
				338	volumeLR = (const int32_t*)__builtin_assume_aligned(volumeLR, 8);
				339	int32x2_t vLR = vld1_s32(volumeLR);
				340	int32x2_t outSamp = vld1_s32(out);
				341	// combine and funnel down accumulator
				342	int32x2_t outAccum = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum));
				343	if (CHANNELS == 1) {
				344	// duplicate accum to both L and R
				345	outAccum = vpadd_s32(outAccum, outAccum);
				346	} else if (CHANNELS == 2) {
				347	// accum2 contains R, fold in
				348	int32x2_t outAccum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2));
				349	outAccum = vpadd_s32(outAccum, outAccum2);
				350	}
				351	outAccum = vqrdmulh_s32(outAccum, vLR);
				352	outSamp = vqadd_s32(outSamp, outAccum);
				353	vst1_s32(out, outSamp);
				354	}
				355
				356	template <int CHANNELS, int STRIDE, bool FIXED>
				357	static inline void ProcessNeonIntrinsic(float* out,
				358	int count,
				359	const float* coefsP,
				360	const float* coefsN,
				361	const float* sP,
				362	const float* sN,
				363	const float* volumeLR,
				364	float lerpP,
				365	const float* coefsP1,
				366	const float* coefsN1)
				367	{
				368	ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
Glenn Kasten	91164e7	2016-03-15 15:55:01 -0700	[diff] [blame]	369	static_assert(CHANNELS == 1 \|\| CHANNELS == 2, "CHANNELS must be 1 or 2");
Andy Hung	6b667dd	2015-02-06 15:05:37 -0800	[diff] [blame]	370
				371	sP -= CHANNELS*((STRIDE>>1)-1);
				372	coefsP = (const float*)__builtin_assume_aligned(coefsP, 16);
				373	coefsN = (const float*)__builtin_assume_aligned(coefsN, 16);
				374
				375	float32x2_t interp;
				376	if (!FIXED) {
				377	interp = vdup_n_f32(lerpP);
				378	coefsP1 = (const float*)__builtin_assume_aligned(coefsP1, 16);
				379	coefsN1 = (const float*)__builtin_assume_aligned(coefsN1, 16);
				380	}
				381	float32x4_t accum, accum2;
				382	// warning uninitialized if we use veorq_s32
				383	// (alternative to below) accum = veorq_s32(accum, accum);
				384	accum = vdupq_n_f32(0);
				385	if (CHANNELS == 2) {
				386	// (alternative to below) accum2 = veorq_s32(accum2, accum2);
				387	accum2 = vdupq_n_f32(0);
				388	}
				389	do {
				390	#ifdef vld1q_f32_x2
				391	float32x4x2_t posCoef = vld1q_f32_x2(coefsP);
				392	coefsP += 8;
				393	float32x4x2_t negCoef = vld1q_f32_x2(coefsN);
				394	coefsN += 8;
				395	#else
				396	float32x4x2_t posCoef;
				397	posCoef.val[0] = vld1q_f32(coefsP);
				398	coefsP += 4;
				399	posCoef.val[1] = vld1q_f32(coefsP);
				400	coefsP += 4;
				401	float32x4x2_t negCoef;
				402	negCoef.val[0] = vld1q_f32(coefsN);
				403	coefsN += 4;
				404	negCoef.val[1] = vld1q_f32(coefsN);
				405	coefsN += 4;
				406	#endif
				407	if (!FIXED) { // interpolate
				408	#ifdef vld1q_f32_x2
				409	float32x4x2_t posCoef1 = vld1q_f32_x2(coefsP1);
				410	coefsP1 += 8;
				411	float32x4x2_t negCoef1 = vld1q_f32_x2(coefsN1);
				412	coefsN1 += 8;
				413	#else
				414	float32x4x2_t posCoef1;
				415	posCoef1.val[0] = vld1q_f32(coefsP1);
				416	coefsP1 += 4;
				417	posCoef1.val[1] = vld1q_f32(coefsP1);
				418	coefsP1 += 4;
				419	float32x4x2_t negCoef1;
				420	negCoef1.val[0] = vld1q_f32(coefsN1);
				421	coefsN1 += 4;
				422	negCoef1.val[1] = vld1q_f32(coefsN1);
				423	coefsN1 += 4;
				424	#endif
				425	posCoef1.val[0] = vsubq_f32(posCoef1.val[0], posCoef.val[0]);
				426	posCoef1.val[1] = vsubq_f32(posCoef1.val[1], posCoef.val[1]);
				427	negCoef.val[0] = vsubq_f32(negCoef.val[0], negCoef1.val[0]);
				428	negCoef.val[1] = vsubq_f32(negCoef.val[1], negCoef1.val[1]);
				429
				430	posCoef.val[0] = vmlaq_lane_f32(posCoef.val[0], posCoef1.val[0], interp, 0);
				431	posCoef.val[1] = vmlaq_lane_f32(posCoef.val[1], posCoef1.val[1], interp, 0);
				432	negCoef.val[0] = vmlaq_lane_f32(negCoef1.val[0], negCoef.val[0], interp, 0); // rev
				433	negCoef.val[1] = vmlaq_lane_f32(negCoef1.val[1], negCoef.val[1], interp, 0); // rev
				434	}
				435	switch (CHANNELS) {
				436	case 1: {
				437	#ifdef vld1q_f32_x2
				438	float32x4x2_t posSamp = vld1q_f32_x2(sP);
				439	float32x4x2_t negSamp = vld1q_f32_x2(sN);
				440	sN += 8;
				441	sP -= 8;
				442	#else
				443	float32x4x2_t posSamp;
				444	posSamp.val[0] = vld1q_f32(sP);
				445	sP += 4;
				446	posSamp.val[1] = vld1q_f32(sP);
				447	sP -= 12;
				448	float32x4x2_t negSamp;
				449	negSamp.val[0] = vld1q_f32(sN);
				450	sN += 4;
				451	negSamp.val[1] = vld1q_f32(sN);
				452	sN += 4;
				453	#endif
				454	// effectively we want a vrev128q_f32()
				455	posSamp.val[0] = vrev64q_f32(posSamp.val[0]);
				456	posSamp.val[1] = vrev64q_f32(posSamp.val[1]);
				457	posSamp.val[0] = vcombine_f32(
				458	vget_high_f32(posSamp.val[0]), vget_low_f32(posSamp.val[0]));
				459	posSamp.val[1] = vcombine_f32(
				460	vget_high_f32(posSamp.val[1]), vget_low_f32(posSamp.val[1]));
				461
				462	accum = vmlaq_f32(accum, posSamp.val[0], posCoef.val[1]);
				463	accum = vmlaq_f32(accum, posSamp.val[1], posCoef.val[0]);
				464	accum = vmlaq_f32(accum, negSamp.val[0], negCoef.val[0]);
				465	accum = vmlaq_f32(accum, negSamp.val[1], negCoef.val[1]);
				466	} break;
				467	case 2: {
				468	float32x4x2_t posSamp0 = vld2q_f32(sP);
				469	sP += 8;
				470	float32x4x2_t negSamp0 = vld2q_f32(sN);
				471	sN += 8;
				472	posSamp0.val[0] = vrev64q_f32(posSamp0.val[0]);
				473	posSamp0.val[1] = vrev64q_f32(posSamp0.val[1]);
				474	posSamp0.val[0] = vcombine_f32(
				475	vget_high_f32(posSamp0.val[0]), vget_low_f32(posSamp0.val[0]));
				476	posSamp0.val[1] = vcombine_f32(
				477	vget_high_f32(posSamp0.val[1]), vget_low_f32(posSamp0.val[1]));
				478
				479	float32x4x2_t posSamp1 = vld2q_f32(sP);
				480	sP -= 24;
				481	float32x4x2_t negSamp1 = vld2q_f32(sN);
				482	sN += 8;
				483	posSamp1.val[0] = vrev64q_f32(posSamp1.val[0]);
				484	posSamp1.val[1] = vrev64q_f32(posSamp1.val[1]);
				485	posSamp1.val[0] = vcombine_f32(
				486	vget_high_f32(posSamp1.val[0]), vget_low_f32(posSamp1.val[0]));
				487	posSamp1.val[1] = vcombine_f32(
				488	vget_high_f32(posSamp1.val[1]), vget_low_f32(posSamp1.val[1]));
				489
				490	// Note: speed is affected by accumulation order.
				491	// Also, speed appears slower using vmul/vadd instead of vmla for
				492	// stereo case, comparable for mono.
				493
				494	accum = vmlaq_f32(accum, negSamp0.val[0], negCoef.val[0]);
				495	accum = vmlaq_f32(accum, negSamp1.val[0], negCoef.val[1]);
				496	accum2 = vmlaq_f32(accum2, negSamp0.val[1], negCoef.val[0]);
				497	accum2 = vmlaq_f32(accum2, negSamp1.val[1], negCoef.val[1]);
				498
				499	accum = vmlaq_f32(accum, posSamp0.val[0], posCoef.val[1]); // reversed
				500	accum = vmlaq_f32(accum, posSamp1.val[0], posCoef.val[0]); // reversed
				501	accum2 = vmlaq_f32(accum2, posSamp0.val[1], posCoef.val[1]); // reversed
				502	accum2 = vmlaq_f32(accum2, posSamp1.val[1], posCoef.val[0]); // reversed
				503	} break;
				504	}
				505	} while (count -= 8);
				506
				507	// multiply by volume and save
				508	volumeLR = (const float*)__builtin_assume_aligned(volumeLR, 8);
				509	float32x2_t vLR = vld1_f32(volumeLR);
				510	float32x2_t outSamp = vld1_f32(out);
				511	// combine and funnel down accumulator
				512	float32x2_t outAccum = vpadd_f32(vget_low_f32(accum), vget_high_f32(accum));
				513	if (CHANNELS == 1) {
				514	// duplicate accum to both L and R
				515	outAccum = vpadd_f32(outAccum, outAccum);
				516	} else if (CHANNELS == 2) {
				517	// accum2 contains R, fold in
				518	float32x2_t outAccum2 = vpadd_f32(vget_low_f32(accum2), vget_high_f32(accum2));
				519	outAccum = vpadd_f32(outAccum, outAccum2);
				520	}
				521	outSamp = vmla_f32(outSamp, outAccum, vLR);
				522	vst1_f32(out, outSamp);
				523	}
				524
Andy Hung	86eae0e	2013-12-09 12:12:46 -0800	[diff] [blame]	525	template <>
				526	inline void ProcessL<1, 16>(int32_t* const out,
				527	int count,
				528	const int16_t* coefsP,
				529	const int16_t* coefsN,
				530	const int16_t* sP,
				531	const int16_t* sN,
				532	const int32_t* const volumeLR)
				533	{
Andy Hung	6b667dd	2015-02-06 15:05:37 -0800	[diff] [blame]	534	#ifdef USE_INTRINSIC
				535	ProcessNeonIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
				536	0 /lerpP/, NULL /coefsP1/, NULL /coefsN1/);
				537	#else
Andy Hung	86eae0e	2013-12-09 12:12:46 -0800	[diff] [blame]	538	const int CHANNELS = 1; // template specialization does not preserve params
				539	const int STRIDE = 16;
				540	sP -= CHANNELS*((STRIDE>>1)-1);
				541	asm (
				542	"veor q0, q0, q0 \n"// (0 - combines+) accumulator = 0
				543
				544	"1: \n"
				545
				546	"vld1.16 {q2}, [%[sP]] \n"// (2+0d) load 8 16-bits mono samples
				547	"vld1.16 {q3}, [%[sN]]! \n"// (2) load 8 16-bits mono samples
				548	"vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs
				549	"vld1.16 {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs
				550
				551	"vrev64.16 q2, q2 \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4
				552
				553	// reordering the vmal to do d6, d7 before d4, d5 is slower(?)
				554	"vmlal.s16 q0, d4, d17 \n"// (1+0d) multiply (reversed)samples by coef
				555	"vmlal.s16 q0, d5, d16 \n"// (1) multiply (reversed)samples by coef
				556	"vmlal.s16 q0, d6, d20 \n"// (1) multiply neg samples
				557	"vmlal.s16 q0, d7, d21 \n"// (1) multiply neg samples
				558
				559	// moving these ARM instructions before neon above seems to be slower
				560	"subs %[count], %[count], #8 \n"// (1) update loop counter
				561	"sub %[sP], %[sP], #16 \n"// (0) move pointer to next set of samples
				562
				563	// sP used after branch (warning)
				564	"bne 1b \n"// loop
				565
				566	ASSEMBLY_ACCUMULATE_MONO
				567
				568	: [out] "=Uv" (out[0]),
				569	[count] "+r" (count),
				570	[coefsP0] "+r" (coefsP),
				571	[coefsN0] "+r" (coefsN),
				572	[sP] "+r" (sP),
				573	[sN] "+r" (sN)
				574	: [vLR] "r" (volumeLR)
				575	: "cc", "memory",
				576	"q0", "q1", "q2", "q3",
				577	"q8", "q10"
				578	);
Andy Hung	6b667dd	2015-02-06 15:05:37 -0800	[diff] [blame]	579	#endif
Andy Hung	86eae0e	2013-12-09 12:12:46 -0800	[diff] [blame]	580	}
				581
				582	template <>
				583	inline void ProcessL<2, 16>(int32_t* const out,
				584	int count,
				585	const int16_t* coefsP,
				586	const int16_t* coefsN,
				587	const int16_t* sP,
				588	const int16_t* sN,
				589	const int32_t* const volumeLR)
				590	{
Andy Hung	6b667dd	2015-02-06 15:05:37 -0800	[diff] [blame]	591	#ifdef USE_INTRINSIC
				592	ProcessNeonIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
				593	0 /lerpP/, NULL /coefsP1/, NULL /coefsN1/);
				594	#else
Andy Hung	86eae0e	2013-12-09 12:12:46 -0800	[diff] [blame]	595	const int CHANNELS = 2; // template specialization does not preserve params
				596	const int STRIDE = 16;
				597	sP -= CHANNELS*((STRIDE>>1)-1);
				598	asm (
				599	"veor q0, q0, q0 \n"// (1) acc_L = 0
				600	"veor q4, q4, q4 \n"// (0 combines+) acc_R = 0
				601
				602	"1: \n"
				603
Andy Hung	d7a7715	2015-02-06 14:58:38 -0800	[diff] [blame]	604	"vld2.16 {q2, q3}, [%[sP]] \n"// (3+0d) load 8 16-bits stereo frames
				605	"vld2.16 {q5, q6}, [%[sN]]! \n"// (3) load 8 16-bits stereo frames
Andy Hung	86eae0e	2013-12-09 12:12:46 -0800	[diff] [blame]	606	"vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs
				607	"vld1.16 {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs
				608
Andy Hung	d7a7715	2015-02-06 14:58:38 -0800	[diff] [blame]	609	"vrev64.16 q2, q2 \n"// (1) reverse 8 samples of positive left
				610	"vrev64.16 q3, q3 \n"// (0 combines+) reverse positive right
Andy Hung	86eae0e	2013-12-09 12:12:46 -0800	[diff] [blame]	611
				612	"vmlal.s16 q0, d4, d17 \n"// (1) multiply (reversed) samples left
				613	"vmlal.s16 q0, d5, d16 \n"// (1) multiply (reversed) samples left
				614	"vmlal.s16 q4, d6, d17 \n"// (1) multiply (reversed) samples right
				615	"vmlal.s16 q4, d7, d16 \n"// (1) multiply (reversed) samples right
				616	"vmlal.s16 q0, d10, d20 \n"// (1) multiply samples left
				617	"vmlal.s16 q0, d11, d21 \n"// (1) multiply samples left
				618	"vmlal.s16 q4, d12, d20 \n"// (1) multiply samples right
				619	"vmlal.s16 q4, d13, d21 \n"// (1) multiply samples right
				620
				621	// moving these ARM before neon seems to be slower
				622	"subs %[count], %[count], #8 \n"// (1) update loop counter
				623	"sub %[sP], %[sP], #32 \n"// (0) move pointer to next set of samples
				624
				625	// sP used after branch (warning)
				626	"bne 1b \n"// loop
				627
				628	ASSEMBLY_ACCUMULATE_STEREO
				629
				630	: [out] "=Uv" (out[0]),
				631	[count] "+r" (count),
				632	[coefsP0] "+r" (coefsP),
				633	[coefsN0] "+r" (coefsN),
				634	[sP] "+r" (sP),
				635	[sN] "+r" (sN)
				636	: [vLR] "r" (volumeLR)
				637	: "cc", "memory",
				638	"q0", "q1", "q2", "q3",
				639	"q4", "q5", "q6",
				640	"q8", "q10"
				641	);
Andy Hung	6b667dd	2015-02-06 15:05:37 -0800	[diff] [blame]	642	#endif
Andy Hung	86eae0e	2013-12-09 12:12:46 -0800	[diff] [blame]	643	}
				644
				645	template <>
				646	inline void Process<1, 16>(int32_t* const out,
				647	int count,
				648	const int16_t* coefsP,
				649	const int16_t* coefsN,
				650	const int16_t* coefsP1,
				651	const int16_t* coefsN1,
				652	const int16_t* sP,
				653	const int16_t* sN,
				654	uint32_t lerpP,
				655	const int32_t* const volumeLR)
				656	{
Andy Hung	6b667dd	2015-02-06 15:05:37 -0800	[diff] [blame]	657	#ifdef USE_INTRINSIC
				658	ProcessNeonIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
				659	lerpP, coefsP1, coefsN1);
				660	#else
				661
Andy Hung	86eae0e	2013-12-09 12:12:46 -0800	[diff] [blame]	662	const int CHANNELS = 1; // template specialization does not preserve params
				663	const int STRIDE = 16;
				664	sP -= CHANNELS*((STRIDE>>1)-1);
				665	asm (
				666	"vmov.32 d2[0], %[lerpP] \n"// load the positive phase S32 Q15
				667	"veor q0, q0, q0 \n"// (0 - combines+) accumulator = 0
				668
				669	"1: \n"
				670
				671	"vld1.16 {q2}, [%[sP]] \n"// (2+0d) load 8 16-bits mono samples
				672	"vld1.16 {q3}, [%[sN]]! \n"// (2) load 8 16-bits mono samples
				673	"vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs
				674	"vld1.16 {q9}, [%[coefsP1]:128]! \n"// (1) load 8 16-bits coefs for interpolation
				675	"vld1.16 {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
				676	"vld1.16 {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation
				677
				678	"vsub.s16 q9, q9, q8 \n"// (1) interpolate (step1) 1st set of coefs
				679	"vsub.s16 q11, q11, q10 \n"// (1) interpolate (step1) 2nd set of coets
				680
				681	"vqrdmulh.s16 q9, q9, d2[0] \n"// (2) interpolate (step2) 1st set of coefs
				682	"vqrdmulh.s16 q11, q11, d2[0] \n"// (2) interpolate (step2) 2nd set of coefs
				683
				684	"vrev64.16 q2, q2 \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4
				685
				686	"vadd.s16 q8, q8, q9 \n"// (1+2d) interpolate (step3) 1st set
				687	"vadd.s16 q10, q10, q11 \n"// (1+1d) interpolate (step3) 2nd set
				688
				689	// reordering the vmal to do d6, d7 before d4, d5 is slower(?)
				690	"vmlal.s16 q0, d4, d17 \n"// (1+0d) multiply reversed samples by coef
				691	"vmlal.s16 q0, d5, d16 \n"// (1) multiply reversed samples by coef
				692	"vmlal.s16 q0, d6, d20 \n"// (1) multiply neg samples
				693	"vmlal.s16 q0, d7, d21 \n"// (1) multiply neg samples
				694
				695	// moving these ARM instructions before neon above seems to be slower
				696	"subs %[count], %[count], #8 \n"// (1) update loop counter
				697	"sub %[sP], %[sP], #16 \n"// (0) move pointer to next set of samples
				698
				699	// sP used after branch (warning)
				700	"bne 1b \n"// loop
				701
				702	ASSEMBLY_ACCUMULATE_MONO
				703
				704	: [out] "=Uv" (out[0]),
				705	[count] "+r" (count),
				706	[coefsP0] "+r" (coefsP),
				707	[coefsN0] "+r" (coefsN),
				708	[coefsP1] "+r" (coefsP1),
				709	[coefsN1] "+r" (coefsN1),
				710	[sP] "+r" (sP),
				711	[sN] "+r" (sN)
				712	: [lerpP] "r" (lerpP),
				713	[vLR] "r" (volumeLR)
				714	: "cc", "memory",
				715	"q0", "q1", "q2", "q3",
				716	"q8", "q9", "q10", "q11"
				717	);
Andy Hung	6b667dd	2015-02-06 15:05:37 -0800	[diff] [blame]	718	#endif
Andy Hung	86eae0e	2013-12-09 12:12:46 -0800	[diff] [blame]	719	}
				720
				721	template <>
				722	inline void Process<2, 16>(int32_t* const out,
				723	int count,
				724	const int16_t* coefsP,
				725	const int16_t* coefsN,
				726	const int16_t* coefsP1,
				727	const int16_t* coefsN1,
				728	const int16_t* sP,
				729	const int16_t* sN,
				730	uint32_t lerpP,
				731	const int32_t* const volumeLR)
				732	{
Andy Hung	6b667dd	2015-02-06 15:05:37 -0800	[diff] [blame]	733	#ifdef USE_INTRINSIC
				734	ProcessNeonIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
				735	lerpP, coefsP1, coefsN1);
				736	#else
Andy Hung	86eae0e	2013-12-09 12:12:46 -0800	[diff] [blame]	737	const int CHANNELS = 2; // template specialization does not preserve params
				738	const int STRIDE = 16;
				739	sP -= CHANNELS*((STRIDE>>1)-1);
				740	asm (
				741	"vmov.32 d2[0], %[lerpP] \n"// load the positive phase
				742	"veor q0, q0, q0 \n"// (1) acc_L = 0
				743	"veor q4, q4, q4 \n"// (0 combines+) acc_R = 0
				744
				745	"1: \n"
				746
Andy Hung	d7a7715	2015-02-06 14:58:38 -0800	[diff] [blame]	747	"vld2.16 {q2, q3}, [%[sP]] \n"// (3+0d) load 8 16-bits stereo frames
				748	"vld2.16 {q5, q6}, [%[sN]]! \n"// (3) load 8 16-bits stereo frames
Andy Hung	86eae0e	2013-12-09 12:12:46 -0800	[diff] [blame]	749	"vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs
				750	"vld1.16 {q9}, [%[coefsP1]:128]! \n"// (1) load 8 16-bits coefs for interpolation
				751	"vld1.16 {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
				752	"vld1.16 {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation
				753
				754	"vsub.s16 q9, q9, q8 \n"// (1) interpolate (step1) 1st set of coefs
				755	"vsub.s16 q11, q11, q10 \n"// (1) interpolate (step1) 2nd set of coets
				756
				757	"vqrdmulh.s16 q9, q9, d2[0] \n"// (2) interpolate (step2) 1st set of coefs
				758	"vqrdmulh.s16 q11, q11, d2[0] \n"// (2) interpolate (step2) 2nd set of coefs
				759
Andy Hung	d7a7715	2015-02-06 14:58:38 -0800	[diff] [blame]	760	"vrev64.16 q2, q2 \n"// (1) reverse 8 samples of positive left
				761	"vrev64.16 q3, q3 \n"// (1) reverse 8 samples of positive right
Andy Hung	86eae0e	2013-12-09 12:12:46 -0800	[diff] [blame]	762
				763	"vadd.s16 q8, q8, q9 \n"// (1+1d) interpolate (step3) 1st set
				764	"vadd.s16 q10, q10, q11 \n"// (1+1d) interpolate (step3) 2nd set
				765
				766	"vmlal.s16 q0, d4, d17 \n"// (1) multiply reversed samples left
				767	"vmlal.s16 q0, d5, d16 \n"// (1) multiply reversed samples left
				768	"vmlal.s16 q4, d6, d17 \n"// (1) multiply reversed samples right
				769	"vmlal.s16 q4, d7, d16 \n"// (1) multiply reversed samples right
				770	"vmlal.s16 q0, d10, d20 \n"// (1) multiply samples left
				771	"vmlal.s16 q0, d11, d21 \n"// (1) multiply samples left
				772	"vmlal.s16 q4, d12, d20 \n"// (1) multiply samples right
				773	"vmlal.s16 q4, d13, d21 \n"// (1) multiply samples right
				774
				775	// moving these ARM before neon seems to be slower
				776	"subs %[count], %[count], #8 \n"// (1) update loop counter
				777	"sub %[sP], %[sP], #32 \n"// (0) move pointer to next set of samples
				778
				779	// sP used after branch (warning)
				780	"bne 1b \n"// loop
				781
				782	ASSEMBLY_ACCUMULATE_STEREO
				783
				784	: [out] "=Uv" (out[0]),
				785	[count] "+r" (count),
				786	[coefsP0] "+r" (coefsP),
				787	[coefsN0] "+r" (coefsN),
				788	[coefsP1] "+r" (coefsP1),
				789	[coefsN1] "+r" (coefsN1),
				790	[sP] "+r" (sP),
				791	[sN] "+r" (sN)
				792	: [lerpP] "r" (lerpP),
				793	[vLR] "r" (volumeLR)
				794	: "cc", "memory",
				795	"q0", "q1", "q2", "q3",
				796	"q4", "q5", "q6",
				797	"q8", "q9", "q10", "q11"
				798	);
Andy Hung	6b667dd	2015-02-06 15:05:37 -0800	[diff] [blame]	799	#endif
Andy Hung	86eae0e	2013-12-09 12:12:46 -0800	[diff] [blame]	800	}
				801
				802	template <>
				803	inline void ProcessL<1, 16>(int32_t* const out,
				804	int count,
				805	const int32_t* coefsP,
				806	const int32_t* coefsN,
				807	const int16_t* sP,
				808	const int16_t* sN,
				809	const int32_t* const volumeLR)
				810	{
Andy Hung	6b667dd	2015-02-06 15:05:37 -0800	[diff] [blame]	811	#ifdef USE_INTRINSIC
				812	ProcessNeonIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
				813	0 /lerpP/, NULL /coefsP1/, NULL /coefsN1/);
				814	#else
Andy Hung	86eae0e	2013-12-09 12:12:46 -0800	[diff] [blame]	815	const int CHANNELS = 1; // template specialization does not preserve params
				816	const int STRIDE = 16;
				817	sP -= CHANNELS*((STRIDE>>1)-1);
				818	asm (
				819	"veor q0, q0, q0 \n"// result, initialize to 0
				820
				821	"1: \n"
				822
				823	"vld1.16 {q2}, [%[sP]] \n"// load 8 16-bits mono samples
				824	"vld1.16 {q3}, [%[sN]]! \n"// load 8 16-bits mono samples
				825	"vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs
				826	"vld1.32 {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs
				827
Andy Hung	d7a7715	2015-02-06 14:58:38 -0800	[diff] [blame]	828	"vrev64.16 q2, q2 \n"// reverse 8 samples of the positive side
Andy Hung	86eae0e	2013-12-09 12:12:46 -0800	[diff] [blame]	829
				830	"vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
				831	"vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits
				832
				833	"vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits
				834	"vshll.s16 q15, d7, #15 \n"// extend samples to 31 bits
				835
Andy Hung	d7a7715	2015-02-06 14:58:38 -0800	[diff] [blame]	836	"vqrdmulh.s32 q12, q12, q9 \n"// multiply samples
				837	"vqrdmulh.s32 q13, q13, q8 \n"// multiply samples
				838	"vqrdmulh.s32 q14, q14, q10 \n"// multiply samples
				839	"vqrdmulh.s32 q15, q15, q11 \n"// multiply samples
Andy Hung	86eae0e	2013-12-09 12:12:46 -0800	[diff] [blame]	840
				841	"vadd.s32 q0, q0, q12 \n"// accumulate result
				842	"vadd.s32 q13, q13, q14 \n"// accumulate result
				843	"vadd.s32 q0, q0, q15 \n"// accumulate result
				844	"vadd.s32 q0, q0, q13 \n"// accumulate result
				845
				846	"sub %[sP], %[sP], #16 \n"// move pointer to next set of samples
				847	"subs %[count], %[count], #8 \n"// update loop counter
				848
				849	"bne 1b \n"// loop
				850
				851	ASSEMBLY_ACCUMULATE_MONO
				852
				853	: [out] "=Uv" (out[0]),
				854	[count] "+r" (count),
				855	[coefsP0] "+r" (coefsP),
				856	[coefsN0] "+r" (coefsN),
				857	[sP] "+r" (sP),
				858	[sN] "+r" (sN)
				859	: [vLR] "r" (volumeLR)
				860	: "cc", "memory",
				861	"q0", "q1", "q2", "q3",
				862	"q8", "q9", "q10", "q11",
				863	"q12", "q13", "q14", "q15"
				864	);
Andy Hung	6b667dd	2015-02-06 15:05:37 -0800	[diff] [blame]	865	#endif
Andy Hung	86eae0e	2013-12-09 12:12:46 -0800	[diff] [blame]	866	}
				867
				868	template <>
				869	inline void ProcessL<2, 16>(int32_t* const out,
				870	int count,
				871	const int32_t* coefsP,
				872	const int32_t* coefsN,
				873	const int16_t* sP,
				874	const int16_t* sN,
				875	const int32_t* const volumeLR)
				876	{
Andy Hung	6b667dd	2015-02-06 15:05:37 -0800	[diff] [blame]	877	#ifdef USE_INTRINSIC
				878	ProcessNeonIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
				879	0 /lerpP/, NULL /coefsP1/, NULL /coefsN1/);
				880	#else
Andy Hung	86eae0e	2013-12-09 12:12:46 -0800	[diff] [blame]	881	const int CHANNELS = 2; // template specialization does not preserve params
				882	const int STRIDE = 16;
				883	sP -= CHANNELS*((STRIDE>>1)-1);
				884	asm (
				885	"veor q0, q0, q0 \n"// result, initialize to 0
				886	"veor q4, q4, q4 \n"// result, initialize to 0
				887
				888	"1: \n"
				889
Andy Hung	d7a7715	2015-02-06 14:58:38 -0800	[diff] [blame]	890	"vld2.16 {q2, q3}, [%[sP]] \n"// load 8 16-bits stereo frames
				891	"vld2.16 {q5, q6}, [%[sN]]! \n"// load 8 16-bits stereo frames
				892	"vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs
				893	"vld1.32 {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs
Andy Hung	86eae0e	2013-12-09 12:12:46 -0800	[diff] [blame]	894
Andy Hung	d7a7715	2015-02-06 14:58:38 -0800	[diff] [blame]	895	"vrev64.16 q2, q2 \n"// reverse 8 samples of positive left
				896	"vrev64.16 q3, q3 \n"// reverse 8 samples of positive right
Andy Hung	86eae0e	2013-12-09 12:12:46 -0800	[diff] [blame]	897
				898	"vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
				899	"vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits
				900
				901	"vshll.s16 q14, d10, #15 \n"// extend samples to 31 bits
				902	"vshll.s16 q15, d11, #15 \n"// extend samples to 31 bits
				903
Andy Hung	d7a7715	2015-02-06 14:58:38 -0800	[diff] [blame]	904	"vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by coef
				905	"vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by coef
				906	"vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by coef
				907	"vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by coef
Andy Hung	86eae0e	2013-12-09 12:12:46 -0800	[diff] [blame]	908
				909	"vadd.s32 q0, q0, q12 \n"// accumulate result
				910	"vadd.s32 q13, q13, q14 \n"// accumulate result
Andy Hung	d7a7715	2015-02-06 14:58:38 -0800	[diff] [blame]	911	"vadd.s32 q0, q0, q15 \n"// accumulate result
				912	"vadd.s32 q0, q0, q13 \n"// accumulate result
Andy Hung	86eae0e	2013-12-09 12:12:46 -0800	[diff] [blame]	913
				914	"vshll.s16 q12, d6, #15 \n"// extend samples to 31 bits
				915	"vshll.s16 q13, d7, #15 \n"// extend samples to 31 bits
				916
				917	"vshll.s16 q14, d12, #15 \n"// extend samples to 31 bits
				918	"vshll.s16 q15, d13, #15 \n"// extend samples to 31 bits
				919
Andy Hung	d7a7715	2015-02-06 14:58:38 -0800	[diff] [blame]	920	"vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by coef
				921	"vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by coef
				922	"vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by coef
				923	"vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by coef
Andy Hung	86eae0e	2013-12-09 12:12:46 -0800	[diff] [blame]	924
				925	"vadd.s32 q4, q4, q12 \n"// accumulate result
				926	"vadd.s32 q13, q13, q14 \n"// accumulate result
Andy Hung	d7a7715	2015-02-06 14:58:38 -0800	[diff] [blame]	927	"vadd.s32 q4, q4, q15 \n"// accumulate result
				928	"vadd.s32 q4, q4, q13 \n"// accumulate result
Andy Hung	86eae0e	2013-12-09 12:12:46 -0800	[diff] [blame]	929
				930	"subs %[count], %[count], #8 \n"// update loop counter
				931	"sub %[sP], %[sP], #32 \n"// move pointer to next set of samples
				932
				933	"bne 1b \n"// loop
				934
				935	ASSEMBLY_ACCUMULATE_STEREO
				936
				937	: [out] "=Uv" (out[0]),
				938	[count] "+r" (count),
				939	[coefsP0] "+r" (coefsP),
				940	[coefsN0] "+r" (coefsN),
				941	[sP] "+r" (sP),
				942	[sN] "+r" (sN)
				943	: [vLR] "r" (volumeLR)
				944	: "cc", "memory",
				945	"q0", "q1", "q2", "q3",
				946	"q4", "q5", "q6",
				947	"q8", "q9", "q10", "q11",
				948	"q12", "q13", "q14", "q15"
				949	);
Andy Hung	6b667dd	2015-02-06 15:05:37 -0800	[diff] [blame]	950	#endif
Andy Hung	86eae0e	2013-12-09 12:12:46 -0800	[diff] [blame]	951	}
				952
				953	template <>
				954	inline void Process<1, 16>(int32_t* const out,
				955	int count,
				956	const int32_t* coefsP,
				957	const int32_t* coefsN,
				958	const int32_t* coefsP1,
				959	const int32_t* coefsN1,
				960	const int16_t* sP,
				961	const int16_t* sN,
				962	uint32_t lerpP,
				963	const int32_t* const volumeLR)
				964	{
Andy Hung	6b667dd	2015-02-06 15:05:37 -0800	[diff] [blame]	965	#ifdef USE_INTRINSIC
				966	ProcessNeonIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
				967	lerpP, coefsP1, coefsN1);
				968	#else
Andy Hung	86eae0e	2013-12-09 12:12:46 -0800	[diff] [blame]	969	const int CHANNELS = 1; // template specialization does not preserve params
				970	const int STRIDE = 16;
				971	sP -= CHANNELS*((STRIDE>>1)-1);
				972	asm (
				973	"vmov.32 d2[0], %[lerpP] \n"// load the positive phase
				974	"veor q0, q0, q0 \n"// result, initialize to 0
				975
				976	"1: \n"
				977
				978	"vld1.16 {q2}, [%[sP]] \n"// load 8 16-bits mono samples
				979	"vld1.16 {q3}, [%[sN]]! \n"// load 8 16-bits mono samples
				980	"vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs
				981	"vld1.32 {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs
				982	"vld1.32 {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
				983	"vld1.32 {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs
				984
				985	"vsub.s32 q12, q12, q8 \n"// interpolate (step1)
				986	"vsub.s32 q13, q13, q9 \n"// interpolate (step1)
				987	"vsub.s32 q14, q14, q10 \n"// interpolate (step1)
				988	"vsub.s32 q15, q15, q11 \n"// interpolate (step1)
				989
				990	"vqrdmulh.s32 q12, q12, d2[0] \n"// interpolate (step2)
				991	"vqrdmulh.s32 q13, q13, d2[0] \n"// interpolate (step2)
				992	"vqrdmulh.s32 q14, q14, d2[0] \n"// interpolate (step2)
				993	"vqrdmulh.s32 q15, q15, d2[0] \n"// interpolate (step2)
				994
				995	"vadd.s32 q8, q8, q12 \n"// interpolate (step3)
				996	"vadd.s32 q9, q9, q13 \n"// interpolate (step3)
				997	"vadd.s32 q10, q10, q14 \n"// interpolate (step3)
				998	"vadd.s32 q11, q11, q15 \n"// interpolate (step3)
				999
Andy Hung	d7a7715	2015-02-06 14:58:38 -0800	[diff] [blame]	1000	"vrev64.16 q2, q2 \n"// reverse 8 samples of the positive side
Andy Hung	86eae0e	2013-12-09 12:12:46 -0800	[diff] [blame]	1001
				1002	"vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
				1003	"vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits
				1004
				1005	"vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits
				1006	"vshll.s16 q15, d7, #15 \n"// extend samples to 31 bits
				1007
				1008	"vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef
				1009	"vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef
				1010	"vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
				1011	"vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef
				1012
				1013	"vadd.s32 q0, q0, q12 \n"// accumulate result
				1014	"vadd.s32 q13, q13, q14 \n"// accumulate result
				1015	"vadd.s32 q0, q0, q15 \n"// accumulate result
				1016	"vadd.s32 q0, q0, q13 \n"// accumulate result
				1017
				1018	"sub %[sP], %[sP], #16 \n"// move pointer to next set of samples
				1019	"subs %[count], %[count], #8 \n"// update loop counter
				1020
				1021	"bne 1b \n"// loop
				1022
				1023	ASSEMBLY_ACCUMULATE_MONO
				1024
				1025	: [out] "=Uv" (out[0]),
				1026	[count] "+r" (count),
				1027	[coefsP0] "+r" (coefsP),
				1028	[coefsN0] "+r" (coefsN),
				1029	[coefsP1] "+r" (coefsP1),
				1030	[coefsN1] "+r" (coefsN1),
				1031	[sP] "+r" (sP),
				1032	[sN] "+r" (sN)
				1033	: [lerpP] "r" (lerpP),
				1034	[vLR] "r" (volumeLR)
				1035	: "cc", "memory",
				1036	"q0", "q1", "q2", "q3",
				1037	"q8", "q9", "q10", "q11",
				1038	"q12", "q13", "q14", "q15"
				1039	);
Andy Hung	6b667dd	2015-02-06 15:05:37 -0800	[diff] [blame]	1040	#endif
Andy Hung	86eae0e	2013-12-09 12:12:46 -0800	[diff] [blame]	1041	}
				1042
				1043	template <>
				1044	inline void Process<2, 16>(int32_t* const out,
				1045	int count,
				1046	const int32_t* coefsP,
				1047	const int32_t* coefsN,
				1048	const int32_t* coefsP1,
				1049	const int32_t* coefsN1,
				1050	const int16_t* sP,
				1051	const int16_t* sN,
				1052	uint32_t lerpP,
				1053	const int32_t* const volumeLR)
				1054	{
Andy Hung	6b667dd	2015-02-06 15:05:37 -0800	[diff] [blame]	1055	#ifdef USE_INTRINSIC
				1056	ProcessNeonIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
				1057	lerpP, coefsP1, coefsN1);
				1058	#else
Andy Hung	86eae0e	2013-12-09 12:12:46 -0800	[diff] [blame]	1059	const int CHANNELS = 2; // template specialization does not preserve params
				1060	const int STRIDE = 16;
				1061	sP -= CHANNELS*((STRIDE>>1)-1);
				1062	asm (
				1063	"vmov.32 d2[0], %[lerpP] \n"// load the positive phase
				1064	"veor q0, q0, q0 \n"// result, initialize to 0
				1065	"veor q4, q4, q4 \n"// result, initialize to 0
				1066
				1067	"1: \n"
				1068
Andy Hung	d7a7715	2015-02-06 14:58:38 -0800	[diff] [blame]	1069	"vld2.16 {q2, q3}, [%[sP]] \n"// load 8 16-bits stereo frames
				1070	"vld2.16 {q5, q6}, [%[sN]]! \n"// load 8 16-bits stereo frames
Andy Hung	86eae0e	2013-12-09 12:12:46 -0800	[diff] [blame]	1071	"vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs
				1072	"vld1.32 {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs
				1073	"vld1.32 {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
				1074	"vld1.32 {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs
				1075
				1076	"vsub.s32 q12, q12, q8 \n"// interpolate (step1)
				1077	"vsub.s32 q13, q13, q9 \n"// interpolate (step1)
				1078	"vsub.s32 q14, q14, q10 \n"// interpolate (step1)
				1079	"vsub.s32 q15, q15, q11 \n"// interpolate (step1)
				1080
				1081	"vqrdmulh.s32 q12, q12, d2[0] \n"// interpolate (step2)
				1082	"vqrdmulh.s32 q13, q13, d2[0] \n"// interpolate (step2)
				1083	"vqrdmulh.s32 q14, q14, d2[0] \n"// interpolate (step2)
				1084	"vqrdmulh.s32 q15, q15, d2[0] \n"// interpolate (step2)
				1085
				1086	"vadd.s32 q8, q8, q12 \n"// interpolate (step3)
				1087	"vadd.s32 q9, q9, q13 \n"// interpolate (step3)
				1088	"vadd.s32 q10, q10, q14 \n"// interpolate (step3)
				1089	"vadd.s32 q11, q11, q15 \n"// interpolate (step3)
				1090
Andy Hung	d7a7715	2015-02-06 14:58:38 -0800	[diff] [blame]	1091	"vrev64.16 q2, q2 \n"// reverse 8 samples of positive left
				1092	"vrev64.16 q3, q3 \n"// reverse 8 samples of positive right
Andy Hung	86eae0e	2013-12-09 12:12:46 -0800	[diff] [blame]	1093
				1094	"vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
				1095	"vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits
				1096
				1097	"vshll.s16 q14, d10, #15 \n"// extend samples to 31 bits
				1098	"vshll.s16 q15, d11, #15 \n"// extend samples to 31 bits
				1099
				1100	"vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef
				1101	"vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef
				1102	"vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
				1103	"vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef
				1104
				1105	"vadd.s32 q0, q0, q12 \n"// accumulate result
				1106	"vadd.s32 q13, q13, q14 \n"// accumulate result
Andy Hung	d7a7715	2015-02-06 14:58:38 -0800	[diff] [blame]	1107	"vadd.s32 q0, q0, q15 \n"// accumulate result
				1108	"vadd.s32 q0, q0, q13 \n"// accumulate result
Andy Hung	86eae0e	2013-12-09 12:12:46 -0800	[diff] [blame]	1109
				1110	"vshll.s16 q12, d6, #15 \n"// extend samples to 31 bits
				1111	"vshll.s16 q13, d7, #15 \n"// extend samples to 31 bits
				1112
				1113	"vshll.s16 q14, d12, #15 \n"// extend samples to 31 bits
				1114	"vshll.s16 q15, d13, #15 \n"// extend samples to 31 bits
				1115
				1116	"vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef
				1117	"vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef
				1118	"vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
				1119	"vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef
				1120
				1121	"vadd.s32 q4, q4, q12 \n"// accumulate result
				1122	"vadd.s32 q13, q13, q14 \n"// accumulate result
Andy Hung	d7a7715	2015-02-06 14:58:38 -0800	[diff] [blame]	1123	"vadd.s32 q4, q4, q15 \n"// accumulate result
				1124	"vadd.s32 q4, q4, q13 \n"// accumulate result
Andy Hung	86eae0e	2013-12-09 12:12:46 -0800	[diff] [blame]	1125
				1126	"subs %[count], %[count], #8 \n"// update loop counter
				1127	"sub %[sP], %[sP], #32 \n"// move pointer to next set of samples
				1128
				1129	"bne 1b \n"// loop
				1130
				1131	ASSEMBLY_ACCUMULATE_STEREO
				1132
				1133	: [out] "=Uv" (out[0]),
				1134	[count] "+r" (count),
				1135	[coefsP0] "+r" (coefsP),
				1136	[coefsN0] "+r" (coefsN),
				1137	[coefsP1] "+r" (coefsP1),
				1138	[coefsN1] "+r" (coefsN1),
				1139	[sP] "+r" (sP),
				1140	[sN] "+r" (sN)
				1141	: [lerpP] "r" (lerpP),
				1142	[vLR] "r" (volumeLR)
				1143	: "cc", "memory",
				1144	"q0", "q1", "q2", "q3",
				1145	"q4", "q5", "q6",
				1146	"q8", "q9", "q10", "q11",
				1147	"q12", "q13", "q14", "q15"
				1148	);
Andy Hung	6b667dd	2015-02-06 15:05:37 -0800	[diff] [blame]	1149	#endif
				1150	}
				1151
				1152	template<>
				1153	inline void ProcessL<1, 16>(float* const out,
				1154	int count,
				1155	const float* coefsP,
				1156	const float* coefsN,
				1157	const float* sP,
				1158	const float* sN,
				1159	const float* const volumeLR)
				1160	{
				1161	ProcessNeonIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
				1162	0 /lerpP/, NULL /coefsP1/, NULL /coefsN1/);
				1163	}
				1164
				1165	template<>
				1166	inline void ProcessL<2, 16>(float* const out,
				1167	int count,
				1168	const float* coefsP,
				1169	const float* coefsN,
				1170	const float* sP,
				1171	const float* sN,
				1172	const float* const volumeLR)
				1173	{
				1174	ProcessNeonIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
				1175	0 /lerpP/, NULL /coefsP1/, NULL /coefsN1/);
				1176	}
				1177
				1178	template<>
				1179	inline void Process<1, 16>(float* const out,
				1180	int count,
				1181	const float* coefsP,
				1182	const float* coefsN,
				1183	const float* coefsP1,
				1184	const float* coefsN1,
				1185	const float* sP,
				1186	const float* sN,
				1187	float lerpP,
				1188	const float* const volumeLR)
				1189	{
				1190	ProcessNeonIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
				1191	lerpP, coefsP1, coefsN1);
				1192	}
				1193
				1194	template<>
				1195	inline void Process<2, 16>(float* const out,
				1196	int count,
				1197	const float* coefsP,
				1198	const float* coefsN,
				1199	const float* coefsP1,
				1200	const float* coefsN1,
				1201	const float* sP,
				1202	const float* sN,
				1203	float lerpP,
				1204	const float* const volumeLR)
				1205	{
				1206	ProcessNeonIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
				1207	lerpP, coefsP1, coefsN1);
Andy Hung	86eae0e	2013-12-09 12:12:46 -0800	[diff] [blame]	1208	}
				1209
Andy Hung	86eae0e	2013-12-09 12:12:46 -0800	[diff] [blame]	1210	#endif //USE_NEON
				1211
Glenn Kasten	63238ef	2015-03-02 15:50:29 -0800	[diff] [blame]	1212	} // namespace android
Andy Hung	86eae0e	2013-12-09 12:12:46 -0800	[diff] [blame]	1213
#endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H*/