Blame - libc/arch-arm/bionic/memmove.S - android_bionic

blob: 937d14bfe73fd93a77557a8909085d566b6bac50 [file] [log] [blame]

Brent DeGraaf	a8c0221	2012-05-30 22:50:19 -0400	[diff] [blame^]	1	/***************************************************************************
				2	Copyright (c) 2009-2012 Code Aurora Forum. All rights reserved.
				3
				4	Redistribution and use in source and binary forms, with or without
				5	modification, are permitted provided that the following conditions are met:
				6	* Redistributions of source code must retain the above copyright
				7	notice, this list of conditions and the following disclaimer.
				8	* Redistributions in binary form must reproduce the above copyright
				9	notice, this list of conditions and the following disclaimer in the
				10	documentation and/or other materials provided with the distribution.
				11	* Neither the name of Code Aurora nor the names of its contributors may
				12	be used to endorse or promote products derived from this software
				13	without specific prior written permission.
				14
				15	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
				16	AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
				17	IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
				18	ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
				19	LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
				20	CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
				21	SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
				22	INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
				23	CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
				24	ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
				25	POSSIBILITY OF SUCH DAMAGE.
				26	***************************************************************************/
				27
				28	/***************************************************************************
				29	* Neon memmove: Performs a memmove using Neon registers where possible.
				30	* Inputs:
				31	* dest: The destination buffer
				32	* src: The source buffer
				33	* n: The number of bytes to transfer
				34	* Outputs:
				35	* The original dest pointer, returned in r0
				36	***************************************************************************/
				37
				38	#include <machine/cpu-features.h>
				39
				40	#if defined(KRAIT_NEON_OPTIMIZATION) || defined(SPARROW_NEON_OPTIMIZATION)
				41	/*
				42	* These can be overridden in:
				43	* device/<vendor>/<board>/BoardConfig.mk
				44	* by setting the following:
				45	* TARGET_USE_KRAIT_BIONIC_OPTIMIZATION := true
				46	* TARGET_USE_KRAIT_PLD_SET := true
				47	* TARGET_KRAIT_BIONIC_PLDOFFS := <pldoffset>
				48	* TARGET_KRAIT_BIONIC_PLDSIZE := <pldsize>
				49	* TARGET_KRAIT_BIONIC_PLDTHRESH := <pldthreshold>
				50	*/
					/* PLDOFFS: prefetch distance used by the pld instructions, in units of PLDSIZE bytes. */
				51	#ifndef PLDOFFS
				52	#define PLDOFFS (10)
				53	#endif
					/* PLDTHRESH: the prefetching 64-byte loop is used only when the 64-byte block count exceeds this. */
				54	#ifndef PLDTHRESH
				55	#define PLDTHRESH (PLDOFFS)
				56	#endif
					/* The prefetch prologue issues pld at offsets (PLDOFFS-5) through (PLDOFFS-1) units. */
				57	#if (PLDOFFS < 5)
				58	#error Routine does not support offsets less than 5
				59	#endif
					/* The prefetching loop runs (blocks - PLDOFFS) times, so that count must stay positive. */
				60	#if (PLDTHRESH < PLDOFFS)
				61	#error PLD threshold must be greater than or equal to the PLD offset
				62	#endif
					/* PLDSIZE: byte size of one prefetch unit; multiplied by PLDOFFS to form pld offsets. */
				63	#ifndef PLDSIZE
				64	#define PLDSIZE (64)
				65	#endif
					/* Fill word passed to .balignl when padding code up to an alignment boundary. */
				66	#define NOP_OPCODE (0xe320f000)
				67
				68	.code 32 /* assemble as 32-bit ARM (not Thumb) instructions */
				69	.align 5 /* on ARM this is a power of two: 32-byte alignment */
				70	.global memmove
				71	.type memmove, %function
				72
				73	.global _memmove_words /* exported alias; labels the same address as memmove */
				74	.type _memmove_words, %function
				75
				76	.global bcopy
				77	.type bcopy, %function
				78
					/*
					 * bcopy: swaps its first two arguments (r0 <-> r1, using r12 as
					 * scratch) and then falls through into memmove, which follows
					 * immediately after the optional padding below. r2 is untouched.
					 */
				79	bcopy:
				80	mov r12, r0
				81	mov r0, r1
				82	mov r1, r12
				83	.balignl 64, NOP_OPCODE, 4*2 /* pad to a 64-byte boundary with NOP words, only if at most 8 bytes are needed */
				84	memmove:
					/*
					 * Entry point (Krait/Sparrow variant).
					 * In: r0 = dest, r1 = src, r2 = n (byte count).
					 * Returns immediately when dest == src, tail-calls memcpy when the
					 * check below allows it, and otherwise saves dest on the stack and
					 * falls through to the back-to-front copy.
					 * Scratch: r12, flags.
					 */
				85	_memmove_words:
				86	.Lneon_memmove_cmf:
				87	subs r12, r0, r1 /* r12 = dest - src; flags reflect the unsigned comparison */
				88	bxeq lr /* dest == src: nothing to copy, r0 still holds dest */
				89	cmphi r2, r12 /* only when dest > src (unsigned): compare n with the distance */
				90	bls memcpy /* Use memcpy for non-overlapping areas */
					/*
					 * The branch above is also taken whenever dest < src (the flags then
					 * still come from the subs), regardless of overlap.
					 * NOTE(review): that relies on memcpy copying front-to-back - confirm.
					 */
				91
				92	push {r0} /* keep the original dest; it is popped as the return value */
				93
				93
				94	.Lneon_back_to_front_copy:
					/*
					 * Back-to-front copy setup and size dispatch.
					 * Both pointers are advanced to one past the end of their buffers;
					 * every copy below pre-decrements before accessing memory.
					 * Copies of 4 bytes or fewer are done byte by byte right here.
					 */
				95	add r0, r0, r2
				96	add r1, r1, r2
				97	cmp r2, #4
				98	bgt .Lneon_b2f_gt4
				99	cmp r2, #0 /* sets Z for the first pass through the loop test below */
				100	.Lneon_b2f_smallcopy_loop:
				101	beq .Lneon_memmove_done /* Z comes from the cmp above or from the subs below */
				102	ldrb r12, [r1, #-1]!
				103	subs r2, r2, #1
				104	strb r12, [r0, #-1]! /* strb leaves the flags alone, so Z still reflects the subs */
				105	b .Lneon_b2f_smallcopy_loop
				106	.Lneon_b2f_gt4:
				107	sub r3, r0, r1 /* r3 = dest - src, the distance between the two buffers */
				108	cmp r2, r3
				109	movle r12, r2 /* r12 = min(n, distance), using a signed comparison */
				110	movgt r12, r3
					/* Choose the widest copy loop whose chunk size does not exceed r12. */
				111	cmp r12, #64
				112	bge .Lneon_b2f_copy_64
				113	cmp r12, #32
				114	bge .Lneon_b2f_copy_32
				115	cmp r12, #8
				116	bge .Lneon_b2f_copy_8
				117	cmp r12, #4
				118	bge .Lneon_b2f_copy_4
				119	b .Lneon_b2f_copy_1
				120	.Lneon_b2f_copy_64:
					/*
					 * Back-to-front copy loops, widest first. On entry r0/r1 point one
					 * past the bytes still to be copied and r2 holds the remaining
					 * byte count. Each stage copies as many whole chunks as r2
					 * contains, masks r2 down to the remainder and falls through to
					 * the next narrower stage. Within a chunk all loads are issued
					 * before the stores.
					 */
				121	sub r1, r1, #64 /* Predecrement */
				122	sub r0, r0, #64
				123	movs r12, r2, lsr #6 /* r12 = number of whole 64-byte blocks */
				124	cmp r12, #PLDTHRESH
				125	ble .Lneon_b2f_copy_64_loop_nopld /* not enough blocks: skip the prefetching loop */
				126	sub r12, #PLDOFFS /* the final PLDOFFS blocks are copied by the no-prefetch loop */
					/* Prime the prefetcher for the blocks just below the current position. */
				127	pld [r1, #-(PLDOFFS-5)*PLDSIZE]
				128	pld [r1, #-(PLDOFFS-4)*PLDSIZE]
				129	pld [r1, #-(PLDOFFS-3)*PLDSIZE]
				130	pld [r1, #-(PLDOFFS-2)*PLDSIZE]
				131	pld [r1, #-(PLDOFFS-1)*PLDSIZE]
				132	.balignl 64, NOP_OPCODE, 4*2
				133	.Lneon_b2f_copy_64_loop_outer:
				134	pld [r1, #-(PLDOFFS)*PLDSIZE] /* prefetch PLDOFFS units below the block being copied */
				135	vld1.32 {q0, q1}, [r1]! /* first 32 bytes of the block; r1 += 32 */
				136	vld1.32 {q2, q3}, [r1] /* second 32 bytes; no writeback */
				137	subs r12, r12, #1
				138	vst1.32 {q0, q1}, [r0]!
				139	sub r1, r1, #96 /* Post-fixup and predecrement */
				140	vst1.32 {q2, q3}, [r0]
				141	sub r0, r0, #96 /* net effect per iteration: r0 and r1 each move down by 64 */
				142	bne .Lneon_b2f_copy_64_loop_outer /* flags still come from the subs above */
				143	mov r12, #PLDOFFS /* copy the remaining PLDOFFS blocks without prefetching */
				144	.balignl 64, NOP_OPCODE, 4*2
				145	.Lneon_b2f_copy_64_loop_nopld:
				146	vld1.32 {q8, q9}, [r1]!
				147	vld1.32 {q10, q11}, [r1]
				148	subs r12, r12, #1
				149	vst1.32 {q8, q9}, [r0]!
				150	sub r1, r1, #96 /* Post-fixup and predecrement */
				151	vst1.32 {q10, q11}, [r0]
				152	sub r0, r0, #96
				153	bne .Lneon_b2f_copy_64_loop_nopld
				154	ands r2, r2, #0x3f /* r2 = bytes left over after the 64-byte blocks */
				155	beq .Lneon_memmove_done
				156	add r1, r1, #64 /* Post-fixup */
				157	add r0, r0, #64 /* undo the last predecrement: back to one past the remaining bytes */
				158	cmp r2, #32
				159	blt .Lneon_b2f_copy_finish
				160	.Lneon_b2f_copy_32:
				161	mov r12, r2, lsr #5 /* r12 = number of whole 32-byte chunks */
				162	.Lneon_b2f_copy_32_loop:
				163	sub r1, r1, #32 /* Predecrement */
				164	sub r0, r0, #32
				165	vld1.32 {q0,q1}, [r1]
				166	subs r12, r12, #1
				167	vst1.32 {q0,q1}, [r0]
				168	bne .Lneon_b2f_copy_32_loop
				169	ands r2, r2, #0x1f /* r2 = bytes left over after the 32-byte chunks */
				170	beq .Lneon_memmove_done
				171	.Lneon_b2f_copy_finish:
				172	.Lneon_b2f_copy_8:
				173	movs r12, r2, lsr #0x3 /* r12 = number of whole 8-byte chunks */
				174	beq .Lneon_b2f_copy_4 /* none: try 4-byte words */
				175	.balignl 64, NOP_OPCODE, 4*2
				176	.Lneon_b2f_copy_8_loop:
				177	sub r1, r1, #8 /* Predecrement */
				178	sub r0, r0, #8
				179	vld1.32 {d0}, [r1]
				180	subs r12, r12, #1
				181	vst1.32 {d0}, [r0]
				182	bne .Lneon_b2f_copy_8_loop
				183	ands r2, r2, #0x7 /* r2 = bytes left over after the 8-byte chunks */
				184	beq .Lneon_memmove_done
				185	.Lneon_b2f_copy_4:
				186	movs r12, r2, lsr #0x2 /* r12 = number of whole 4-byte words */
				187	beq .Lneon_b2f_copy_1 /* none: finish with single bytes */
				188	.Lneon_b2f_copy_4_loop:
				189	ldr r3, [r1, #-4]!
				190	subs r12, r12, #1
				191	str r3, [r0, #-4]!
				192	bne .Lneon_b2f_copy_4_loop
				193	ands r2, r2, #0x3 /* r2 = trailing bytes (0-3) after the word loop */
				194	.Lneon_b2f_copy_1:
				195	cmp r2, #0
				196	beq .Lneon_memmove_done
				197	.balignl 64, NOP_OPCODE, 4*2
				198	.Lneon_b2f_copy_1_loop:
				199	ldrb r12, [r1, #-1]!
				200	subs r2, r2, #1
				201	strb r12, [r0, #-1]!
				202	bne .Lneon_b2f_copy_1_loop
				203
				204	.Lneon_memmove_done:
				205	pop {r0} /* restore the original dest as the return value */
				206	bx lr
				207
				208	.end
				209
				210	#elif defined(SCORPION_NEON_OPTIMIZATION)
				211	/*
				212	* These can be overridden in:
				213	* device/<vendor>/<board>/BoardConfig.mk
				214	* by setting the following:
				215	* TARGET_USE_SCORPION_BIONIC_OPTIMIZATION := true
				216	* TARGET_USE_SCORPION_PLD_SET := true
				217	* TARGET_SCORPION_BIONIC_PLDOFFS := <pldoffset>
				218	* TARGET_SCORPION_BIONIC_PLDSIZE := <pldsize>
				219	*/
					/* PLDOFFS: prefetch distance used by the pld instructions, in units of PLDSIZE bytes. */
				220	#ifndef PLDOFFS
				221	#define PLDOFFS (6)
				222	#endif
				223	#ifndef PLDSIZE
				224	#define PLDSIZE (128) /* L2 cache line size */
				225	#endif
				226
				227	.code 32 /* assemble as 32-bit ARM (not Thumb) instructions */
				228	.align 5 /* on ARM this is a power of two: 32-byte alignment */
				229	.global memmove
				230	.type memmove, %function
				231
				232	.global bcopy
				233	.type bcopy, %function
				234
					/*
					 * bcopy: swaps its first two arguments (r0 <-> r1, using r12 as
					 * scratch) and then falls through into memmove, which starts on
					 * the very next line. r2 is untouched.
					 */
				235	bcopy:
				236	mov r12, r0
				237	mov r0, r1
				238	mov r1, r12
				239	memmove:
					/*
					 * Entry point (Scorpion variant).
					 * In: r0 = dest, r1 = src, r2 = n (byte count).
					 * Out: r0 = the original dest (saved here, popped at
					 * .Lneon_memmove_done).
					 */
				240	push {r0}
				241
				242	/*
				243	* The requirements for memmove state that the function should
				244	* operate as if data were being copied from the source to a
				245	* buffer, then to the destination. This is to allow a user
				246	* to copy data from a source and target that overlap.
				247	*
				248	* We can't just do byte copies front-to-back automatically, since
				249	* there's a good chance we may have an overlap (why else would someone
				250	* intentionally use memmove then?).
				251	*
				252	* We'll break this into two parts. Front-to-back, or back-to-front
				253	* copies.
				254	*/
					/*
					 * Addresses are unsigned, so the direction is chosen with the
					 * unsigned conditions LO/HI. Signed conditions would pick the
					 * wrong direction for overlapping buffers that straddle
					 * 0x80000000.
					 */
				255	.Lneon_memmove_cmf:
				256	cmp r0, r1
				257	blo .Lneon_front_to_back_copy /* dest below src: copy ascending */
				258	bhi .Lneon_back_to_front_copy /* dest above src: copy descending */
				259	b .Lneon_memmove_done /* dest == src: nothing to copy */
				260
				261	/* #############################################################
				262	* Front to Back copy
				263	*/
				264	.Lneon_front_to_back_copy:
				265	/*
				266	* For small copies, just do a quick memcpy. We can do this for
				267	* front-to-back copies, aligned or unaligned, since we're only
				268	* doing 1 byte at a time...
				269	*/
					/*
					 * Byte counts and the window are unsigned quantities, so every
					 * size comparison in this dispatch uses unsigned conditions
					 * (HI/LS/HS). With signed conditions a count or distance of
					 * 2 GiB or more looks negative and is sent to the byte loop.
					 */
				270	cmp r2, #4
				271	bhi .Lneon_f2b_gt4
				272	cmp r2, #0 /* sets Z for the first pass through the loop test below */
				273	.Lneon_f2b_smallcopy_loop:
				274	beq .Lneon_memmove_done /* Z comes from the cmp above or from the subs below */
				275	ldrb r12, [r1], #1
				276	subs r2, r2, #1
				277	strb r12, [r0], #1 /* strb leaves the flags alone, so Z still reflects the subs */
				278	b .Lneon_f2b_smallcopy_loop
				279	.Lneon_f2b_gt4:
				280	/* The window size is in r3. */
				281	sub r3, r1, r0
				282	/* #############################################################
				283	* Front to Back copy
				284	*/
				285	/*
				286	* Note that we can't just route based on the size in r2. If that's
				287	* larger than the overlap window in r3, we could potentially
				288	* (and likely!) destroy data we're copying.
				289	*/
				290	cmp r2, r3
				291	movls r12, r2 /* r12 = min(n, window), unsigned */
				292	movhi r12, r3
					/* Route to a copy loop according to the size of r12. */
				293	cmp r12, #256
				294	bhs .Lneon_f2b_copy_128
				295	cmp r12, #64
				296	bhs .Lneon_f2b_copy_32
				297	cmp r12, #16
				298	bhs .Lneon_f2b_copy_16
				299	cmp r12, #8
				300	bhs .Lneon_f2b_copy_8
				301	cmp r12, #4
				302	bhs .Lneon_f2b_copy_4
				303	b .Lneon_f2b_copy_1
				304	nop /* never executed: it follows an unconditional branch */
				305	.Lneon_f2b_copy_128:
					/*
					 * Front-to-back copy loops, widest first. r0/r1 point at the next
					 * byte to write/read and are post-incremented by every access; r2
					 * holds the remaining byte count. Each stage copies as many whole
					 * chunks as r2 contains, masks r2 down to the remainder and falls
					 * through to the next narrower stage. Within a chunk all loads
					 * are issued before the stores.
					 */
				306	mov r12, r2, lsr #7 /* r12 = number of whole 128-byte blocks */
				307	cmp r12, #PLDOFFS
				308	ble .Lneon_f2b_copy_128_loop_nopld /* not enough blocks: skip the prefetching loop */
				309	sub r12, #PLDOFFS /* the final PLDOFFS blocks are copied by the no-prefetch loop */
				310	pld [r1, #(PLDOFFS-1)*PLDSIZE]
				311	.Lneon_f2b_copy_128_loop_outer:
				312	pld [r1, #(PLDOFFS*PLDSIZE)] /* prefetch PLDOFFS units ahead of the block being copied */
				313	vld1.32 {q0,q1}, [r1]!
				314	vld1.32 {q2,q3}, [r1]!
				315	vld1.32 {q8,q9}, [r1]!
				316	vld1.32 {q10,q11}, [r1]!
				317	subs r12, r12, #1
				318	vst1.32 {q0,q1}, [r0]!
				319	vst1.32 {q2,q3}, [r0]!
				320	vst1.32 {q8,q9}, [r0]!
				321	vst1.32 {q10,q11}, [r0]!
				322	bne .Lneon_f2b_copy_128_loop_outer /* flags still come from the subs above */
				323	mov r12, #PLDOFFS /* copy the remaining PLDOFFS blocks without prefetching */
				324	.Lneon_f2b_copy_128_loop_nopld:
				325	vld1.32 {q0,q1}, [r1]!
				326	vld1.32 {q2,q3}, [r1]!
				327	vld1.32 {q8,q9}, [r1]!
				328	vld1.32 {q10,q11}, [r1]!
				329	subs r12, r12, #1
				330	vst1.32 {q0,q1}, [r0]!
				331	vst1.32 {q2,q3}, [r0]!
				332	vst1.32 {q8,q9}, [r0]!
				333	vst1.32 {q10,q11}, [r0]!
				334	bne .Lneon_f2b_copy_128_loop_nopld
				335	ands r2, r2, #0x7f /* r2 = bytes left over after the 128-byte blocks */
				336	beq .Lneon_memmove_done
				337	cmp r2, #32
				338	bge .Lneon_f2b_copy_32
				339	b .Lneon_f2b_copy_finish
				340	.Lneon_f2b_copy_32:
				341	mov r12, r2, lsr #5 /* r12 = number of whole 32-byte chunks */
				342	.Lneon_f2b_copy_32_loop:
				343	vld1.32 {q0,q1}, [r1]!
				344	subs r12, r12, #1
				345	vst1.32 {q0,q1}, [r0]!
				346	bne .Lneon_f2b_copy_32_loop
				347	ands r2, r2, #0x1f /* r2 = bytes left over after the 32-byte chunks */
				348	beq .Lneon_memmove_done
				349	.Lneon_f2b_copy_finish:
				350	.Lneon_f2b_copy_16:
				351	movs r12, r2, lsr #4 /* r12 = number of whole 16-byte chunks */
				352	beq .Lneon_f2b_copy_8 /* none: try 8-byte chunks */
				353	.Lneon_f2b_copy_16_loop:
				354	vld1.32 {q0}, [r1]!
				355	subs r12, r12, #1
				356	vst1.32 {q0}, [r0]!
				357	bne .Lneon_f2b_copy_16_loop
				358	ands r2, r2, #0xf /* r2 = bytes left over after the 16-byte chunks */
				359	beq .Lneon_memmove_done
				360	.Lneon_f2b_copy_8:
				361	movs r12, r2, lsr #3 /* r12 = number of whole 8-byte chunks */
				362	beq .Lneon_f2b_copy_4 /* none: try 4-byte words */
				363	.Lneon_f2b_copy_8_loop:
				364	vld1.32 {d0}, [r1]!
				365	subs r12, r12, #1
				366	vst1.32 {d0}, [r0]!
				367	bne .Lneon_f2b_copy_8_loop
				368	ands r2, r2, #0x7 /* r2 = bytes left over after the 8-byte chunks */
				369	beq .Lneon_memmove_done
				370	.Lneon_f2b_copy_4:
				371	movs r12, r2, lsr #2 /* r12 = number of whole 4-byte words */
				372	beq .Lneon_f2b_copy_1 /* none: finish with single bytes */
				373	.Lneon_f2b_copy_4_loop:
				374	ldr r3, [r1], #4
				375	subs r12, r12, #1
				376	str r3, [r0], #4
				377	bne .Lneon_f2b_copy_4_loop
				378	ands r2, r2, #0x3 /* r2 = trailing bytes (0-3) after the word loop */
				379	nop
				380	.Lneon_f2b_copy_1:
				381	cmp r2, #0
				382	beq .Lneon_memmove_done
				383	.Lneon_f2b_copy_1_loop:
				384	ldrb r12, [r1], #1
				385	subs r2, r2, #1
				386	strb r12, [r0], #1
				387	bne .Lneon_f2b_copy_1_loop
				388	.Lneon_f2b_finish:
				389	b .Lneon_memmove_done /* jump over the back-to-front code to the shared epilogue */
				390
				391	/* #############################################################
				392	* Back to Front copy
				393	*/
				394	.Lneon_back_to_front_copy:
				395	/*
				396	* Here, we'll want to shift to the end of the buffers. This
				397	* actually points us one past where we need to go, but since
				398	* we'll pre-decrement throughout, this will be fine.
				399	*/
				400	add r0, r0, r2
				401	add r1, r1, r2
					/*
					 * Byte counts and the window are unsigned quantities, so every
					 * size comparison in this dispatch uses unsigned conditions
					 * (HI/LS/HS). With signed conditions a count or distance of
					 * 2 GiB or more looks negative and is sent to the byte loop.
					 */
				402	cmp r2, #4
				403	bhi .Lneon_b2f_gt4
				404	cmp r2, #0 /* sets Z for the first pass through the loop test below */
				405	.Lneon_b2f_smallcopy_loop:
				406	beq .Lneon_memmove_done /* Z comes from the cmp above or from the subs below */
				407	ldrb r12, [r1, #-1]!
				408	subs r2, r2, #1
				409	strb r12, [r0, #-1]! /* strb leaves the flags alone, so Z still reflects the subs */
				410	b .Lneon_b2f_smallcopy_loop
				411	.Lneon_b2f_gt4:
				412	/*
				413	* The overlap window size (dest - src) is in r3. The minimum of
				414	* the window size and the copy size is then computed into r12.
				415	*/
				416	sub r3, r0, r1
				417	/*
				418	* #############################################################
				419	* Back to Front copy -
				420	*/
				421	cmp r2, r3
				422	movls r12, r2 /* r12 = min(n, window), unsigned */
				423	movhi r12, r3
					/* Route to a copy loop according to the size of r12. */
				424	cmp r12, #256
				425	bhs .Lneon_b2f_copy_128
				426	cmp r12, #64
				427	bhs .Lneon_b2f_copy_32
				428	cmp r12, #8
				429	bhs .Lneon_b2f_copy_8
				430	cmp r12, #4
				431	bhs .Lneon_b2f_copy_4
				432	b .Lneon_b2f_copy_1
				433	nop /* never executed: it follows an unconditional branch */
				434	.Lneon_b2f_copy_128:
					/*
					 * Back-to-front copy loops, widest first. On entry r0/r1 point one
					 * past the bytes still to be copied and r2 holds the remaining
					 * byte count. Each stage copies as many whole chunks as r2
					 * contains, masks r2 down to the remainder and falls through to
					 * the next narrower stage. Within a chunk all loads are issued
					 * before the stores.
					 */
				435	movs r12, r2, lsr #7 /* r12 = number of whole 128-byte blocks */
				436	cmp r12, #PLDOFFS
				437	ble .Lneon_b2f_copy_128_loop_nopld /* not enough blocks: skip the prefetching loop */
				438	sub r12, #PLDOFFS /* the final PLDOFFS blocks are copied by the no-prefetch loop */
				439	pld [r1, #-(PLDOFFS-1)*PLDSIZE]
				440	.Lneon_b2f_copy_128_loop_outer:
				441	pld [r1, #-(PLDOFFS*PLDSIZE)] /* prefetch PLDOFFS units below the current position */
				442	sub r1, r1, #128 /* step down to the start of the block to copy */
				443	sub r0, r0, #128
				444	vld1.32 {q0, q1}, [r1]!
				445	vld1.32 {q2, q3}, [r1]!
				446	vld1.32 {q8, q9}, [r1]!
				447	vld1.32 {q10, q11}, [r1]!
				448	subs r12, r12, #1
				449	vst1.32 {q0, q1}, [r0]!
				450	vst1.32 {q2, q3}, [r0]!
				451	vst1.32 {q8, q9}, [r0]!
				452	vst1.32 {q10, q11}, [r0]!
				453	sub r1, r1, #128 /* undo the 128 bytes of post-increment from the loads */
				454	sub r0, r0, #128 /* likewise for the stores: net -128 per iteration */
				455	bne .Lneon_b2f_copy_128_loop_outer /* flags still come from the subs above */
				456	mov r12, #PLDOFFS /* copy the remaining PLDOFFS blocks without prefetching */
				457	.Lneon_b2f_copy_128_loop_nopld:
				458	sub r1, r1, #128
				459	sub r0, r0, #128
				460	vld1.32 {q0, q1}, [r1]!
				461	vld1.32 {q2, q3}, [r1]!
				462	vld1.32 {q8, q9}, [r1]!
				463	vld1.32 {q10, q11}, [r1]!
				464	subs r12, r12, #1
				465	vst1.32 {q0, q1}, [r0]!
				466	vst1.32 {q2, q3}, [r0]!
				467	vst1.32 {q8, q9}, [r0]!
				468	vst1.32 {q10, q11}, [r0]!
				469	sub r1, r1, #128
				470	sub r0, r0, #128
				471	bne .Lneon_b2f_copy_128_loop_nopld
				472	ands r2, r2, #0x7f /* r2 = bytes left over after the 128-byte blocks */
				473	beq .Lneon_memmove_done
				474	cmp r2, #32
				475	bge .Lneon_b2f_copy_32
				476	b .Lneon_b2f_copy_finish
				477	.Lneon_b2f_copy_32:
				478	mov r12, r2, lsr #5 /* r12 = number of whole 32-byte chunks */
				479	.Lneon_b2f_copy_32_loop:
				480	sub r1, r1, #32
				481	sub r0, r0, #32
				482	vld1.32 {q0,q1}, [r1]
				483	subs r12, r12, #1
				484	vst1.32 {q0,q1}, [r0]
				485	bne .Lneon_b2f_copy_32_loop
				486	ands r2, r2, #0x1f /* r2 = bytes left over after the 32-byte chunks */
				487	beq .Lneon_memmove_done
				488	.Lneon_b2f_copy_finish:
				489	.Lneon_b2f_copy_8:
				490	movs r12, r2, lsr #0x3 /* r12 = number of whole 8-byte chunks */
				491	beq .Lneon_b2f_copy_4 /* none: try 4-byte words */
				492	.Lneon_b2f_copy_8_loop:
				493	sub r1, r1, #8
				494	sub r0, r0, #8
				495	vld1.32 {d0}, [r1]
				496	subs r12, r12, #1
				497	vst1.32 {d0}, [r0]
				498	bne .Lneon_b2f_copy_8_loop
				499	ands r2, r2, #0x7 /* r2 = bytes left over after the 8-byte chunks */
				500	beq .Lneon_memmove_done
				501	.Lneon_b2f_copy_4:
				502	movs r12, r2, lsr #0x2 /* r12 = number of whole 4-byte words */
				503	beq .Lneon_b2f_copy_1 /* none: finish with single bytes */
				504	.Lneon_b2f_copy_4_loop:
				505	ldr r3, [r1, #-4]!
				506	subs r12, r12, #1
				507	str r3, [r0, #-4]!
				508	bne .Lneon_b2f_copy_4_loop
				509	ands r2, r2, #0x3 /* r2 = trailing bytes (0-3) after the word loop */
				510	nop
				511	.Lneon_b2f_copy_1:
				512	cmp r2, #0
				513	beq .Lneon_memmove_done
				514	.Lneon_b2f_copy_1_loop:
				515	ldrb r12, [r1, #-1]!
				516	subs r2, r2, #1
				517	strb r12, [r0, #-1]!
				518	bne .Lneon_b2f_copy_1_loop
				519
				520	.Lneon_memmove_done:
				521	pop {r0} /* restore the original dest as the return value */
				522	bx lr
				523
				524	.end
				525	#endif /* SCORPION_NEON_OPTIMIZATION */
				526