/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/cpu-features.h>
#include <machine/asm.h>

#if defined(__ARM_NEON__) && !defined(ARCH_ARM_USE_NON_NEON_MEMCPY)
#if defined(KRAIT_NEON_OPTIMIZATION)
        /*
         * These can be overridden in:
         *   device/<vendor>/<board>/BoardConfig.mk
         * by setting the following:
         *   TARGET_USE_KRAIT_BIONIC_OPTIMIZATION := true
         *   TARGET_USE_KRAIT_PLD_SET := true
         *   TARGET_KRAIT_BIONIC_PLDOFFS := <pldoffset>
         *   TARGET_KRAIT_BIONIC_PLDSIZE := <pldsize>
         *   TARGET_KRAIT_BIONIC_PLDTHRESH := <pldthreshold>
         *   TARGET_KRAIT_BIONIC_BBTHRESH := <bbthreshold>
         */
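        /*
         * For illustration only: a BoardConfig.mk that enables this path and
         * simply restates the in-file defaults below (placeholder values, not
         * tuned recommendations) could contain:
         *
         *   TARGET_USE_KRAIT_BIONIC_OPTIMIZATION := true
         *   TARGET_USE_KRAIT_PLD_SET := true
         *   TARGET_KRAIT_BIONIC_PLDOFFS := 10
         *   TARGET_KRAIT_BIONIC_PLDSIZE := 64
         *   TARGET_KRAIT_BIONIC_PLDTHRESH := 10
         *   TARGET_KRAIT_BIONIC_BBTHRESH := 64
         */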
#ifndef PLDOFFS
#define PLDOFFS (10)
#endif
#ifndef PLDTHRESH
#define PLDTHRESH (PLDOFFS)
#endif
#ifndef BBTHRESH
#define BBTHRESH (4096/64)
#endif
#if (PLDOFFS < 1)
#error Routine does not support offsets less than 1
#endif
#if (PLDTHRESH < PLDOFFS)
#error PLD threshold must be greater than or equal to the PLD offset
#endif
#ifndef PLDSIZE
#define PLDSIZE (64)
#endif
#define NOP_OPCODE (0xe320f000)
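        /*
         * Note added for clarity: NOP_OPCODE is the ARM encoding of NOP and is
         * used as the fill word for the ".balignl 64, NOP_OPCODE, 4*2"
         * directives below, which pad to the next 64-byte boundary only when
         * at most 8 bytes of padding are required.
         */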

        .text
        .fpu    neon
        .global memcpy
        .type   memcpy, %function
        .align  5
memcpy:
        stmfd   sp!, {r0, r9, r10, lr}
        cmp     r2, #4
        blt     .Lneon_lt4
        cmp     r2, #16
        blt     .Lneon_lt16
        cmp     r2, #32
        blt     .Lneon_16
        cmp     r2, #64
        blt     .Lneon_copy_32_a

        mov     r12, r2, lsr #6
        cmp     r12, #PLDTHRESH
        ble     .Lneon_copy_64_loop_nopld

        cmp     r12, #BBTHRESH
        ble     .Lneon_prime_pump

        add     lr, r0, #0x400
        add     r9, r1, #(PLDOFFS*PLDSIZE)
        sub     lr, lr, r9
        lsl     lr, lr, #21
        lsr     lr, lr, #21
        add     lr, lr, #(PLDOFFS*PLDSIZE)
        cmp     r12, lr, lsr #6
        movle   lr, #(PLDOFFS*PLDSIZE)

        movgt   r9, #(PLDOFFS)
        rsbgts  r9, r9, lr, lsr #6
        ble     .Lneon_prime_pump

        add     r10, r1, lr
        bic     r10, #0x3F

        sub     r12, lr, lsr #6
        cmp     r9, r12
        suble   r12, r12, r9
        movgt   r9, r12
        movgt   r12, #0

        pld     [r1, #((PLDOFFS-1)*PLDSIZE)]
        .balignl 64, NOP_OPCODE, 4*2
.Lneon_copy_64_loop_outer_doublepld:
        pld     [r1, #((PLDOFFS)*PLDSIZE)]
        vld1.32 {q0, q1}, [r1]!
        vld1.32 {q2, q3}, [r1]!
        ldr     r3, [r10]
        subs    r9, r9, #1
        vst1.32 {q0, q1}, [r0]!
        vst1.32 {q2, q3}, [r0]!
        add     r10, #64
        bne     .Lneon_copy_64_loop_outer_doublepld
        cmp     r12, #0
        bne     .Lneon_copy_64_loop_outer
        mov     r12, lr, lsr #6
        b       .Lneon_copy_64_loop_nopld
        .balignl 64, NOP_OPCODE, 4*2
.Lneon_prime_pump:
        mov     lr, #(PLDOFFS*PLDSIZE)
        add     r10, r1, #(PLDOFFS*PLDSIZE)
        bic     r10, #0x3F
        sub     r12, r12, #PLDOFFS
        pld     [r10, #(-1*PLDSIZE)]
        .balignl 64, NOP_OPCODE, 4*2
.Lneon_copy_64_loop_outer:
        vld1.32 {q0, q1}, [r1]!
        vld1.32 {q2, q3}, [r1]!
        ldr     r3, [r10]
        subs    r12, r12, #1
        vst1.32 {q0, q1}, [r0]!
        vst1.32 {q2, q3}, [r0]!
        add     r10, #64
        bne     .Lneon_copy_64_loop_outer
        mov     r12, lr, lsr #6
        .balignl 64, NOP_OPCODE, 4*2
.Lneon_copy_64_loop_nopld:
        vld1.32 {q8, q9}, [r1]!
        vld1.32 {q10, q11}, [r1]!
        subs    r12, r12, #1
        vst1.32 {q8, q9}, [r0]!
        vst1.32 {q10, q11}, [r0]!
        bne     .Lneon_copy_64_loop_nopld
        ands    r2, r2, #0x3f
        beq     .Lneon_exit
        .balignl 64, NOP_OPCODE, 4*2
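        /*
         * Note added for clarity: the tails below dispatch on the remaining
         * length by shifting it into the flags.  "movs rX, r2, lsl #27" puts
         * bit 5 of r2 (a 32-byte chunk) into C and bit 4 (a 16-byte chunk)
         * into N; the later shifts by #29 and #31 do the same for the
         * 8/4-byte and 2/1-byte chunks, so each conditional load/store copies
         * exactly one power-of-two piece of the remainder.
         */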
.Lneon_copy_32_a:
        movs    r12, r2, lsl #27
        bcc     .Lneon_16
        vld1.32 {q0, q1}, [r1]!
        vst1.32 {q0, q1}, [r0]!
        .balignl 64, NOP_OPCODE, 4*2
.Lneon_16:
        bpl     .Lneon_lt16
        vld1.32 {q8}, [r1]!
        vst1.32 {q8}, [r0]!
        ands    r2, r2, #0x0f
        beq     .Lneon_exit
        .balignl 64, NOP_OPCODE, 4*2
.Lneon_lt16:
        movs    r12, r2, lsl #29
        ldrcs   r3, [r1], #4
        ldrcs   r12, [r1], #4
        strcs   r3, [r0], #4
        strcs   r12, [r0], #4
        ldrmi   r3, [r1], #4
        strmi   r3, [r0], #4
        .balignl 64, NOP_OPCODE, 4*2
.Lneon_lt4:
        movs    r2, r2, lsl #31
        ldrcsh  r3, [r1], #2
        strcsh  r3, [r0], #2
        ldrmib  r12, [r1]
        strmib  r12, [r0]
        .balignl 64, NOP_OPCODE, 4*2
.Lneon_exit:
        ldmfd   sp!, {r0, r9, r10, lr}
        bx      lr
        .end
#elif defined(SCORPION_NEON_OPTIMIZATION)
        /*
         * These can be overridden in:
         *   device/<vendor>/<board>/BoardConfig.mk
         * by setting the following:
         *   TARGET_USE_SCORPION_BIONIC_OPTIMIZATION := true
         *   TARGET_USE_SCORPION_PLD_SET := true
         *   TARGET_SCORPION_BIONIC_PLDOFFS := <pldoffset>
         *   TARGET_SCORPION_BIONIC_PLDSIZE := <pldsize>
         */
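        /*
         * For illustration only: a BoardConfig.mk enabling this path with the
         * in-file defaults below (placeholder values, not tuned
         * recommendations) could contain:
         *
         *   TARGET_USE_SCORPION_BIONIC_OPTIMIZATION := true
         *   TARGET_USE_SCORPION_PLD_SET := true
         *   TARGET_SCORPION_BIONIC_PLDOFFS := 6
         *   TARGET_SCORPION_BIONIC_PLDSIZE := 128
         */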
#ifndef PLDOFFS
#define PLDOFFS (6)
#endif
#ifndef PLDSIZE
#define PLDSIZE (128)   /* L2 cache line size */
#endif
        .code 32
        .align 5
        .globl memcpy
        .func
memcpy:
        push    {r0}
        cmp     r2, #4
        blt     .Lneon_lt4
        cmp     r2, #16
        blt     .Lneon_lt16
        cmp     r2, #32
        blt     .Lneon_16
        cmp     r2, #128
        blt     .Lneon_copy_32_a
        /* Copy blocks of 128 bytes (word-aligned) at a time */
        /* Code below is optimized for PLDSIZE=128 only */
        mov     r12, r2, lsr #7
        cmp     r12, #PLDOFFS
        ble     .Lneon_copy_128_loop_nopld
        sub     r12, #PLDOFFS
        pld     [r1, #(PLDOFFS-1)*PLDSIZE]
.Lneon_copy_128_loop_outer:
        pld     [r1, #(PLDOFFS*PLDSIZE)]
        vld1.32 {q0, q1}, [r1]!
        vld1.32 {q2, q3}, [r1]!
        vld1.32 {q8, q9}, [r1]!
        vld1.32 {q10, q11}, [r1]!
        subs    r12, r12, #1
        vst1.32 {q0, q1}, [r0]!
        vst1.32 {q2, q3}, [r0]!
        vst1.32 {q8, q9}, [r0]!
        vst1.32 {q10, q11}, [r0]!
        bne     .Lneon_copy_128_loop_outer
        mov     r12, #PLDOFFS
.Lneon_copy_128_loop_nopld:
        vld1.32 {q0, q1}, [r1]!
        vld1.32 {q2, q3}, [r1]!
        vld1.32 {q8, q9}, [r1]!
        vld1.32 {q10, q11}, [r1]!
        subs    r12, r12, #1
        vst1.32 {q0, q1}, [r0]!
        vst1.32 {q2, q3}, [r0]!
        vst1.32 {q8, q9}, [r0]!
        vst1.32 {q10, q11}, [r0]!
        bne     .Lneon_copy_128_loop_nopld
        ands    r2, r2, #0x7f
        beq     .Lneon_exit
        cmp     r2, #32
        blt     .Lneon_16
        nop
        /* Copy blocks of 32 bytes (word-aligned) at a time */
.Lneon_copy_32_a:
        mov     r12, r2, lsr #5
.Lneon_copy_32_loop_a:
        vld1.32 {q0, q1}, [r1]!
        subs    r12, r12, #1
        vst1.32 {q0, q1}, [r0]!
        bne     .Lneon_copy_32_loop_a
        ands    r2, r2, #0x1f
        beq     .Lneon_exit
.Lneon_16:
        subs    r2, r2, #16
        blt     .Lneon_lt16
        vld1.32 {q8}, [r1]!
        vst1.32 {q8}, [r0]!
        beq     .Lneon_exit
.Lneon_lt16:
        movs    r12, r2, lsl #29
        bcc     .Lneon_skip8
        ldr     r3, [r1], #4
        ldr     r12, [r1], #4
        str     r3, [r0], #4
        str     r12, [r0], #4
.Lneon_skip8:
        bpl     .Lneon_lt4
        ldr     r3, [r1], #4
        str     r3, [r0], #4
.Lneon_lt4:
        movs    r2, r2, lsl #31
        bcc     .Lneon_lt2
        ldrh    r3, [r1], #2
        strh    r3, [r0], #2
.Lneon_lt2:
        bpl     .Lneon_exit
        ldrb    r12, [r1]
        strb    r12, [r0]
.Lneon_exit:
        pop     {r0}
        bx      lr
        .endfunc
        .end
#else   /* !SCORPION_NEON_OPTIMIZATION */
#if defined(CORTEX_CACHE_LINE_32)
        /*
         * This can be enabled by setting flag
         * TARGET_CORTEX_CACHE_LINE_32 in
         * device/<vendor>/<board>/BoardConfig.mk
         */
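        /*
         * For illustration only (assumed BoardConfig.mk syntax, matching the
         * flags documented above):
         *
         *   TARGET_CORTEX_CACHE_LINE_32 := true
         */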
        .text
        .fpu    neon

        .global memcpy
        .type   memcpy, %function
        .align  4

/* a prefetch distance of 4 cache-lines works best experimentally */
#define CACHE_LINE_SIZE     32
memcpy:
        .fnstart
        .save   {r0, lr}
        stmfd   sp!, {r0, lr}

        /* start preloading as early as possible */
        pld     [r1, #(CACHE_LINE_SIZE*0)]
        pld     [r1, #(CACHE_LINE_SIZE*1)]

        /* do we have at least 16-bytes to copy (needed for alignment below) */
        cmp     r2, #16
        blo     5f

        /* align destination to half cache-line for the write-buffer */
        rsb     r3, r0, #0
        ands    r3, r3, #0xF
        beq     0f

        /* copy up to 15-bytes (count in r3) */
        sub     r2, r2, r3
        movs    ip, r3, lsl #31
        ldrmib  lr, [r1], #1
        strmib  lr, [r0], #1
        ldrcsb  ip, [r1], #1
        ldrcsb  lr, [r1], #1
        strcsb  ip, [r0], #1
        strcsb  lr, [r0], #1
        movs    ip, r3, lsl #29
        bge     1f
        // copies 4 bytes, destination 32-bits aligned
        vld4.8  {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8  {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
1:      bcc     2f
        // copies 8 bytes, destination 64-bits aligned
        vld1.8  {d0}, [r1]!
        vst1.8  {d0}, [r0, :64]!
2:

0:      /* preload immediately the next cache line, which we may need */
        pld     [r1, #(CACHE_LINE_SIZE*0)]
        pld     [r1, #(CACHE_LINE_SIZE*1)]

        /* make sure we have at least 128 bytes to copy */
        subs    r2, r2, #128
        blo     2f

        /* preload all the cache lines we need.
         * NOTE: the number of pld below depends on PREFETCH_DISTANCE;
         * ideally we would increase the distance in the main loop to
         * avoid the goofy code below. In practice this doesn't seem to make
         * a big difference.
         */
        pld     [r1, #(CACHE_LINE_SIZE*2)]
        pld     [r1, #(CACHE_LINE_SIZE*3)]
        pld     [r1, #(CACHE_LINE_SIZE*4)]

        .align 3
1:      /* The main loop copies 128 bytes at a time */
        subs    r2, r2, #128
        vld1.8  {d0  - d3},  [r1]!
        vld1.8  {d4  - d7},  [r1]!
        pld     [r1, #(CACHE_LINE_SIZE*1)]
        pld     [r1, #(CACHE_LINE_SIZE*2)]
        vld1.8  {d16 - d19}, [r1]!
        vld1.8  {d20 - d23}, [r1]!
        pld     [r1, #(CACHE_LINE_SIZE*1)]
        pld     [r1, #(CACHE_LINE_SIZE*2)]
        vst1.8  {d0  - d3},  [r0, :128]!
        vst1.8  {d4  - d7},  [r0, :128]!
        vst1.8  {d16 - d19}, [r0, :128]!
        vst1.8  {d20 - d23}, [r0, :128]!
        bhs     1b

2:      /* fix-up the remaining count and make sure we have >= 32 bytes left */
        add     r2, r2, #128
        subs    r2, r2, #32
        blo     4f

3:      /* 32 bytes at a time. These cache lines were already preloaded */
        vld1.8  {d0 - d3}, [r1]!
        subs    r2, r2, #32
        vst1.8  {d0 - d3}, [r0, :128]!
        bhs     3b

4:      /* less than 32 left */
        add     r2, r2, #32
        tst     r2, #0x10
        beq     5f
        // copies 16 bytes, 128-bits aligned
        vld1.8  {d0, d1}, [r1]!
        vst1.8  {d0, d1}, [r0, :128]!

5:      /* copy up to 15-bytes (count in r2) */
        movs    ip, r2, lsl #29
        bcc     1f
        vld1.8  {d0}, [r1]!
        vst1.8  {d0}, [r0]!
1:      bge     2f
        vld4.8  {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8  {d0[0], d1[0], d2[0], d3[0]}, [r0]!
2:      movs    ip, r2, lsl #31
        ldrmib  r3, [r1], #1
        ldrcsb  ip, [r1], #1
        ldrcsb  lr, [r1], #1
        strmib  r3, [r0], #1
        strcsb  ip, [r0], #1
        strcsb  lr, [r0], #1

        ldmfd   sp!, {r0, lr}
        bx      lr
        .fnend
#else   /* !CORTEX_CACHE_LINE_32 */

        .text
        .fpu    neon

#ifdef HAVE_32_BYTE_CACHE_LINE
/* a prefetch distance of 2 cache-lines */
#define CACHE_LINE_SIZE     32
#define PREFETCH_DISTANCE   (CACHE_LINE_SIZE*2)
#else
/* a prefetch distance of 4 cache-lines works best experimentally */
#define CACHE_LINE_SIZE     64
#define PREFETCH_DISTANCE   (CACHE_LINE_SIZE*4)
#endif
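/*
 * Note added for clarity: with the default 64-byte cache line above,
 * PREFETCH_DISTANCE works out to 256 bytes (64*4); with
 * HAVE_32_BYTE_CACHE_LINE it is 64 bytes (32*2).
 */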

ENTRY(memcpy)
        .save   {r0, lr}
        /* start preloading as early as possible */
        pld     [r1, #(CACHE_LINE_SIZE*0)]
        stmfd   sp!, {r0, lr}
        pld     [r1, #(CACHE_LINE_SIZE*1)]

        /* do we have at least 16-bytes to copy (needed for alignment below) */
        cmp     r2, #16
        blo     5f

        /* align destination to cache-line for the write-buffer */
        rsb     r3, r0, #0
        ands    r3, r3, #0xF
        beq     0f

        /* copy up to 15-bytes (count in r3) */
        sub     r2, r2, r3
        movs    ip, r3, lsl #31
        ldrmib  lr, [r1], #1
        strmib  lr, [r0], #1
        ldrcsb  ip, [r1], #1
        ldrcsb  lr, [r1], #1
        strcsb  ip, [r0], #1
        strcsb  lr, [r0], #1
        movs    ip, r3, lsl #29
        bge     1f
        // copies 4 bytes, destination 32-bits aligned
        vld4.8  {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8  {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
1:      bcc     2f
        // copies 8 bytes, destination 64-bits aligned
        vld1.8  {d0}, [r1]!
        vst1.8  {d0}, [r0, :64]!
2:

0:      /* preload immediately the next cache line, which we may need */
        pld     [r1, #(CACHE_LINE_SIZE*0)]
        pld     [r1, #(CACHE_LINE_SIZE*1)]

#ifdef HAVE_32_BYTE_CACHE_LINE
        /* make sure we have at least 32 bytes to copy */
        subs    r2, r2, #32
        blo     4f

        /* preload all the cache lines we need.
         * NOTE: the number of pld below depends on PREFETCH_DISTANCE;
         * ideally we would increase the distance in the main loop to
         * avoid the goofy code below. In practice this doesn't seem to make
         * a big difference.
         */
        pld     [r1, #(PREFETCH_DISTANCE)]

1:      /* The main loop copies 32 bytes at a time */
        vld1.8  {d0 - d3}, [r1]!
        pld     [r1, #(PREFETCH_DISTANCE)]
        subs    r2, r2, #32
        vst1.8  {d0 - d3}, [r0, :128]!
        bhs     1b
#else
        /* make sure we have at least 64 bytes to copy */
        subs    r2, r2, #64
        blo     2f

        /* preload all the cache lines we need.
         * NOTE: the number of pld below depends on PREFETCH_DISTANCE;
         * ideally we would increase the distance in the main loop to
         * avoid the goofy code below. In practice this doesn't seem to make
         * a big difference.
         */
        pld     [r1, #(CACHE_LINE_SIZE*2)]
        pld     [r1, #(CACHE_LINE_SIZE*3)]
        pld     [r1, #(PREFETCH_DISTANCE)]

1:      /* The main loop copies 64 bytes at a time */
        vld1.8  {d0 - d3}, [r1]!
        vld1.8  {d4 - d7}, [r1]!
        pld     [r1, #(PREFETCH_DISTANCE)]
        subs    r2, r2, #64
        vst1.8  {d0 - d3}, [r0, :128]!
        vst1.8  {d4 - d7}, [r0, :128]!
        bhs     1b

2:      /* fix-up the remaining count and make sure we have >= 32 bytes left */
        add     r2, r2, #64
        subs    r2, r2, #32
        blo     4f

3:      /* 32 bytes at a time. These cache lines were already preloaded */
        vld1.8  {d0 - d3}, [r1]!
        subs    r2, r2, #32
        vst1.8  {d0 - d3}, [r0, :128]!
        bhs     3b
#endif
4:      /* less than 32 left */
        add     r2, r2, #32
        tst     r2, #0x10
        beq     5f
        // copies 16 bytes, 128-bits aligned
        vld1.8  {d0, d1}, [r1]!
        vst1.8  {d0, d1}, [r0, :128]!

5:      /* copy up to 15-bytes (count in r2) */
        movs    ip, r2, lsl #29
        bcc     1f
        vld1.8  {d0}, [r1]!
        vst1.8  {d0}, [r0]!
1:      bge     2f
        vld4.8  {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8  {d0[0], d1[0], d2[0], d3[0]}, [r0]!
2:      movs    ip, r2, lsl #31
        ldrmib  r3, [r1], #1
        ldrcsb  ip, [r1], #1
        ldrcsb  lr, [r1], #1
        strmib  r3, [r0], #1
        strcsb  ip, [r0], #1
        strcsb  lr, [r0], #1

        ldmfd   sp!, {r0, lr}
        bx      lr
END(memcpy)

#endif  /* !CORTEX_CACHE_LINE_32 */
#endif  /* SCORPION_NEON_OPTIMIZATION */
#else   /* __ARM_ARCH__ < 7 */

        /*
         * Optimized memcpy() for ARM.
         *
         * note that memcpy() always returns the destination pointer,
         * so we have to preserve R0.
         */

ENTRY(memcpy)
        /* The stack must always be 64-bit aligned to be compliant with the
         * ARM ABI. Since we have to save R0, we might as well save R4,
         * which we can use for better pipelining of the reads below.
         */
        .save   {r0, r4, lr}
        stmfd   sp!, {r0, r4, lr}
        /* Making room for r5-r11 which will be spilled later */
        .pad    #28
        sub     sp, sp, #28

        // preload the destination because we'll align it to a cache line
        // with small writes. Also start the source "pump".
        PLD     (r0, #0)
        PLD     (r1, #0)
        PLD     (r1, #32)

        /* it simplifies things to take care of len<4 early */
        cmp     r2, #4
        blo     copy_last_3_and_return

        /* compute the offset to align the source
         * offset = (4-(src&3))&3 = -src & 3
         */
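        /* worked example (added for clarity): if src & 3 == 3, then
         * offset = (4-3)&3 = 1 and -src & 3 = 1, so one byte is copied
         * before the word-aligned path; if src is already word-aligned the
         * offset is 0 and the small copy below is skipped entirely.
         */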
        rsb     r3, r1, #0
        ands    r3, r3, #3
        beq     src_aligned

        /* align source to 32 bits. We need to insert 2 instructions between
         * a ldr[b|h] and str[b|h] because byte and half-word instructions
         * stall 2 cycles.
         */
        movs    r12, r3, lsl #31
        sub     r2, r2, r3              /* we know that r3 <= r2 because r2 >= 4 */
        ldrmib  r3, [r1], #1
        ldrcsb  r4, [r1], #1
        ldrcsb  r12, [r1], #1
        strmib  r3, [r0], #1
        strcsb  r4, [r0], #1
        strcsb  r12, [r0], #1

src_aligned:

        /* see if src and dst are aligned together (congruent) */
        eor     r12, r0, r1
        tst     r12, #3
        bne     non_congruent

        /* Use post-increment mode for stm to spill r5-r11 to the reserved
         * stack frame. Don't update sp.
         */
        stmea   sp, {r5-r11}

        /* align the destination to a cache-line */
        rsb     r3, r0, #0
        ands    r3, r3, #0x1C
        beq     congruent_aligned32
        cmp     r3, r2
        andhi   r3, r2, #0x1C

        /* conditionally copies 0 to 7 words (length in r3) */
        movs    r12, r3, lsl #28
        ldmcsia r1!, {r4, r5, r6, r7}   /* 16 bytes */
        ldmmiia r1!, {r8, r9}           /*  8 bytes */
        stmcsia r0!, {r4, r5, r6, r7}
        stmmiia r0!, {r8, r9}
        tst     r3, #0x4
        ldrne   r10, [r1], #4           /*  4 bytes */
        strne   r10, [r0], #4
        sub     r2, r2, r3

congruent_aligned32:
        /*
         * here source is aligned to 32 bytes.
         */

cached_aligned32:
        subs    r2, r2, #32
        blo     less_than_32_left

        /*
         * We preload a cache-line up to 64 bytes ahead. On the 926, this will
         * stall only until the requested word is fetched, but the linefill
         * continues in the background.
         * While the linefill is going, we write our previous cache-line
         * into the write-buffer (which should have some free space).
         * When the linefill is done, the writebuffer will
         * start dumping its content into memory.
         *
         * While all this is going on, we then load a full cache line into
         * 8 registers; this cache line should be in the cache by now
         * (or partly in the cache).
         *
         * This code should work well regardless of the source/dest alignment.
         *
         */

        // Align the preload register to a cache-line because the cpu does
        // "critical word first" (the first word requested is loaded first).
        bic     r12, r1, #0x1F
        add     r12, r12, #64

1:      ldmia   r1!, { r4-r11 }
        PLD     (r12, #64)
        subs    r2, r2, #32

        // NOTE: if r12 is more than 64 ahead of r1, the following ldrhi
        // for the ARM9 preload will not be safely guarded by the preceding subs.
        // When it is safely guarded, the only way to get a SIGSEGV here
        // is if the caller overstates the length.
        ldrhi   r3, [r12], #32          /* cheap ARM9 preload */
        stmia   r0!, { r4-r11 }
        bhs     1b

        add     r2, r2, #32

less_than_32_left:
        /*
         * less than 32 bytes left at this point (length in r2)
         */

        /* skip all this if there is nothing to do, which should
         * be a common case (if not executed the code below takes
         * about 16 cycles)
         */
        tst     r2, #0x1F
        beq     1f

        /* conditionally copies 0 to 31 bytes */
        movs    r12, r2, lsl #28
        ldmcsia r1!, {r4, r5, r6, r7}   /* 16 bytes */
        ldmmiia r1!, {r8, r9}           /*  8 bytes */
        stmcsia r0!, {r4, r5, r6, r7}
        stmmiia r0!, {r8, r9}
        movs    r12, r2, lsl #30
        ldrcs   r3, [r1], #4            /*  4 bytes */
        ldrmih  r4, [r1], #2            /*  2 bytes */
        strcs   r3, [r0], #4
        strmih  r4, [r0], #2
        tst     r2, #0x1
        ldrneb  r3, [r1]                /* last byte */
        strneb  r3, [r0]

        /* we're done! restore everything and return */
1:      ldmfd   sp!, {r5-r11}
        ldmfd   sp!, {r0, r4, lr}
        bx      lr

        /********************************************************************/

non_congruent:
        /*
         * here source is aligned to 4 bytes
         * but destination is not.
         *
         * in the code below r2 is the number of bytes read
         * (the number of bytes written is always smaller, because we have
         * partial words in the shift queue)
         */
        cmp     r2, #4
        blo     copy_last_3_and_return

        /* Use post-increment mode for stm to spill r5-r11 to the reserved
         * stack frame. Don't update sp.
         */
        stmea   sp, {r5-r11}

        /* compute shifts needed to align src to dest */
        rsb     r5, r0, #0
        and     r5, r5, #3              /* r5 = # bytes in partial words */
        mov     r12, r5, lsl #3         /* r12 = right */
        rsb     lr, r12, #32            /* lr = left */
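        /* worked example (added for clarity): if the destination needs 2 bytes
         * to reach word alignment, r5 = 2, r12 = 16 and lr = 16, and the
         * dispatch below selects loop16.  Each output word is then assembled
         * as "leftover | (next source word, lsl lr)" while the new leftover
         * becomes "next source word, lsr r12", exactly as in the alignment
         * loop that follows.
         */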

        /* read the first word */
        ldr     r3, [r1], #4
        sub     r2, r2, #4

        /* write a partial word (0 to 3 bytes), such that destination
         * becomes aligned to 32 bits (r5 = number of bytes to copy for alignment)
         */
        movs    r5, r5, lsl #31
        strmib  r3, [r0], #1
        movmi   r3, r3, lsr #8
        strcsb  r3, [r0], #1
        movcs   r3, r3, lsr #8
        strcsb  r3, [r0], #1
        movcs   r3, r3, lsr #8

        cmp     r2, #4
        blo     partial_word_tail

        /* Align destination to 32 bytes (cache line boundary) */
1:      tst     r0, #0x1c
        beq     2f
        ldr     r5, [r1], #4
        sub     r2, r2, #4
        orr     r4, r3, r5, lsl lr
        mov     r3, r5, lsr r12
        str     r4, [r0], #4
        cmp     r2, #4
        bhs     1b
        blo     partial_word_tail

        /* copy 32 bytes at a time */
2:      subs    r2, r2, #32
        blo     less_than_thirtytwo

        /* Use immediate mode for the shifts, because there is an extra cycle
         * for register shifts, which could account for up to 50% of
         * performance hit.
         */

        cmp     r12, #24
        beq     loop24
        cmp     r12, #8
        beq     loop8

loop16:
        ldr     r12, [r1], #4
1:      mov     r4, r12
        ldmia   r1!, { r5, r6, r7, r8, r9, r10, r11 }
        PLD     (r1, #64)
        subs    r2, r2, #32
        ldrhs   r12, [r1], #4
        orr     r3, r3, r4, lsl #16
        mov     r4, r4, lsr #16
        orr     r4, r4, r5, lsl #16
        mov     r5, r5, lsr #16
        orr     r5, r5, r6, lsl #16
        mov     r6, r6, lsr #16
        orr     r6, r6, r7, lsl #16
        mov     r7, r7, lsr #16
        orr     r7, r7, r8, lsl #16
        mov     r8, r8, lsr #16
        orr     r8, r8, r9, lsl #16
        mov     r9, r9, lsr #16
        orr     r9, r9, r10, lsl #16
        mov     r10, r10, lsr #16
        orr     r10, r10, r11, lsl #16
        stmia   r0!, {r3, r4, r5, r6, r7, r8, r9, r10}
        mov     r3, r11, lsr #16
        bhs     1b
        b       less_than_thirtytwo

loop8:
        ldr     r12, [r1], #4
1:      mov     r4, r12
        ldmia   r1!, { r5, r6, r7, r8, r9, r10, r11 }
        PLD     (r1, #64)
        subs    r2, r2, #32
        ldrhs   r12, [r1], #4
        orr     r3, r3, r4, lsl #24
        mov     r4, r4, lsr #8
        orr     r4, r4, r5, lsl #24
        mov     r5, r5, lsr #8
        orr     r5, r5, r6, lsl #24
        mov     r6, r6, lsr #8
        orr     r6, r6, r7, lsl #24
        mov     r7, r7, lsr #8
        orr     r7, r7, r8, lsl #24
        mov     r8, r8, lsr #8
        orr     r8, r8, r9, lsl #24
        mov     r9, r9, lsr #8
        orr     r9, r9, r10, lsl #24
        mov     r10, r10, lsr #8
        orr     r10, r10, r11, lsl #24
        stmia   r0!, {r3, r4, r5, r6, r7, r8, r9, r10}
        mov     r3, r11, lsr #8
        bhs     1b
        b       less_than_thirtytwo

loop24:
        ldr     r12, [r1], #4
1:      mov     r4, r12
        ldmia   r1!, { r5, r6, r7, r8, r9, r10, r11 }
        PLD     (r1, #64)
        subs    r2, r2, #32
        ldrhs   r12, [r1], #4
        orr     r3, r3, r4, lsl #8
        mov     r4, r4, lsr #24
        orr     r4, r4, r5, lsl #8
        mov     r5, r5, lsr #24
        orr     r5, r5, r6, lsl #8
        mov     r6, r6, lsr #24
        orr     r6, r6, r7, lsl #8
        mov     r7, r7, lsr #24
        orr     r7, r7, r8, lsl #8
        mov     r8, r8, lsr #24
        orr     r8, r8, r9, lsl #8
        mov     r9, r9, lsr #24
        orr     r9, r9, r10, lsl #8
        mov     r10, r10, lsr #24
        orr     r10, r10, r11, lsl #8
        stmia   r0!, {r3, r4, r5, r6, r7, r8, r9, r10}
        mov     r3, r11, lsr #24
        bhs     1b


less_than_thirtytwo:
        /* copy the last 0 to 31 bytes of the source */
        rsb     r12, lr, #32            /* we corrupted r12, recompute it */
        add     r2, r2, #32
        cmp     r2, #4
        blo     partial_word_tail

1:      ldr     r5, [r1], #4
        sub     r2, r2, #4
        orr     r4, r3, r5, lsl lr
        mov     r3, r5, lsr r12
        str     r4, [r0], #4
        cmp     r2, #4
        bhs     1b

partial_word_tail:
        /* we have a partial word in the input buffer */
        movs    r5, lr, lsl #(31-3)
        strmib  r3, [r0], #1
        movmi   r3, r3, lsr #8
        strcsb  r3, [r0], #1
        movcs   r3, r3, lsr #8
        strcsb  r3, [r0], #1

        /* Refill spilled registers from the stack. Don't update sp. */
        ldmfd   sp, {r5-r11}

copy_last_3_and_return:
        movs    r2, r2, lsl #31         /* copy remaining 0, 1, 2 or 3 bytes */
        ldrmib  r2, [r1], #1
        ldrcsb  r3, [r1], #1
        ldrcsb  r12, [r1]
        strmib  r2, [r0], #1
        strcsb  r3, [r0], #1
        strcsb  r12, [r0]

        /* we're done! restore sp and spilled registers and return */
        add     sp, sp, #28
        ldmfd   sp!, {r0, r4, lr}
        bx      lr
END(memcpy)


#endif  /* __ARM_ARCH__ < 7 */