/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Copyright (c) 2009-2011, Code Aurora Forum. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/cpu-features.h>
#include <machine/asm.h>

#if defined(__ARM_NEON__)
#if defined(SCORPION_NEON_OPTIMIZATION)
        /*
         * These can be overridden in:
         *   device/<vendor>/<board>/BoardConfig.mk
         * by setting the following:
         *   TARGET_USE_SCORPION_BIONIC_OPTIMIZATION := true
         *   TARGET_USE_SCORPION_PLD_SET := true
         *   TARGET_SCORPION_BIONIC_PLDOFFS := <pldoffset>
         *   TARGET_SCORPION_BIONIC_PLDSIZE := <pldsize>
         */
#ifndef PLDOFFS
#define PLDOFFS (6)
#endif
#ifndef PLDSIZE
#define PLDSIZE (128)   /* L2 cache line size */
#endif
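        /* With the defaults above, the main copy loop below issues its PLD
         * PLDOFFS*PLDSIZE = 6*128 = 768 bytes ahead of the current source
         * pointer, and the final PLDOFFS 128-byte blocks are copied without
         * prefetching. */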
        .code 32
        .align 5
        .globl memcpy
        .func
memcpy:
        push    {r0}
        cmp     r2, #4
        blt     .Lneon_lt4
        cmp     r2, #16
        blt     .Lneon_lt16
        cmp     r2, #32
        blt     .Lneon_16
        cmp     r2, #128
        blt     .Lneon_copy_32_a
        /* Copy blocks of 128 bytes (word-aligned) at a time */
        /* Code below is optimized for PLDSIZE=128 only */
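        /*
         * Rough C sketch of the 128-byte copy below (illustration only;
         * prefetch() and copy128() are placeholders, not helpers in this
         * file). The last PLDOFFS blocks run without PLD so that we never
         * prefetch past the end of the source:
         *
         *     size_t blocks = len >> 7;                   // 128-byte blocks
         *     size_t pld_blocks = blocks > PLDOFFS ? blocks - PLDOFFS : 0;
         *     for (size_t i = 0; i < pld_blocks; i++) {   // prefetching loop
         *         prefetch(src + PLDOFFS * PLDSIZE);
         *         copy128(&dst, &src);
         *     }
         *     for (size_t i = 0; i < blocks - pld_blocks; i++)  // no-PLD tail
         *         copy128(&dst, &src);
         */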
        mov     r12, r2, lsr #7
        cmp     r12, #PLDOFFS
        ble     .Lneon_copy_128_loop_nopld
        sub     r12, #PLDOFFS
        pld     [r1, #(PLDOFFS-1)*PLDSIZE]
.Lneon_copy_128_loop_outer:
        pld     [r1, #(PLDOFFS*PLDSIZE)]
        vld1.32 {q0, q1}, [r1]!
        vld1.32 {q2, q3}, [r1]!
        vld1.32 {q8, q9}, [r1]!
        vld1.32 {q10, q11}, [r1]!
        subs    r12, r12, #1
        vst1.32 {q0, q1}, [r0]!
        vst1.32 {q2, q3}, [r0]!
        vst1.32 {q8, q9}, [r0]!
        vst1.32 {q10, q11}, [r0]!
        bne     .Lneon_copy_128_loop_outer
        mov     r12, #PLDOFFS
.Lneon_copy_128_loop_nopld:
        vld1.32 {q0, q1}, [r1]!
        vld1.32 {q2, q3}, [r1]!
        vld1.32 {q8, q9}, [r1]!
        vld1.32 {q10, q11}, [r1]!
        subs    r12, r12, #1
        vst1.32 {q0, q1}, [r0]!
        vst1.32 {q2, q3}, [r0]!
        vst1.32 {q8, q9}, [r0]!
        vst1.32 {q10, q11}, [r0]!
        bne     .Lneon_copy_128_loop_nopld
        ands    r2, r2, #0x7f
        beq     .Lneon_exit
        cmp     r2, #32
        blt     .Lneon_16
        nop
        /* Copy blocks of 32 bytes (word-aligned) at a time */
.Lneon_copy_32_a:
        mov     r12, r2, lsr #5
.Lneon_copy_32_loop_a:
        vld1.32 {q0, q1}, [r1]!
        subs    r12, r12, #1
        vst1.32 {q0, q1}, [r0]!
        bne     .Lneon_copy_32_loop_a
        ands    r2, r2, #0x1f
        beq     .Lneon_exit
.Lneon_16:
        subs    r2, r2, #16
        blt     .Lneon_lt16
        vld1.32 {q8}, [r1]!
        vst1.32 {q8}, [r0]!
        beq     .Lneon_exit
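        /* The tail below tests the shifted length to set the flags:
         * "movs r12, r2, lsl #29" puts bit 3 of the remaining count into C
         * and bit 2 into N, and "movs r2, r2, lsl #31" does the same for
         * bits 1 and 0, so the conditional copies handle 8, 4, 2 and then
         * 1 byte. */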
.Lneon_lt16:
        movs    r12, r2, lsl #29
        bcc     .Lneon_skip8
        ldr     r3, [r1], #4
        ldr     r12, [r1], #4
        str     r3, [r0], #4
        str     r12, [r0], #4
.Lneon_skip8:
        bpl     .Lneon_lt4
        ldr     r3, [r1], #4
        str     r3, [r0], #4
.Lneon_lt4:
        movs    r2, r2, lsl #31
        bcc     .Lneon_lt2
        ldrh    r3, [r1], #2
        strh    r3, [r0], #2
.Lneon_lt2:
        bpl     .Lneon_exit
        ldrb    r12, [r1]
        strb    r12, [r0]
.Lneon_exit:
        pop     {r0}
        bx      lr
        .endfunc
        .end
#else /* !SCORPION_NEON_OPTIMIZATION */
#if defined(CORTEX_CACHE_LINE_32)
        /*
         * This path can be enabled by setting the flag
         * TARGET_CORTEX_CACHE_LINE_32 in
         * device/<vendor>/<board>/BoardConfig.mk
         */
        .text
        .fpu    neon

        .global memcpy
        .type memcpy, %function
        .align 4

/* a prefetch distance of 4 cache-lines works best experimentally */
#define CACHE_LINE_SIZE     32
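/* with 32-byte lines, a prefetch distance of 4 cache-lines is
 * 4*32 = 128 bytes, i.e. one iteration of the main copy loop below */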
memcpy:
        .fnstart
        .save   {r0, lr}
        stmfd   sp!, {r0, lr}

        /* start preloading as early as possible */
        pld     [r1, #(CACHE_LINE_SIZE*0)]
        pld     [r1, #(CACHE_LINE_SIZE*1)]

        /* do we have at least 16 bytes to copy (needed for alignment below) */
        cmp     r2, #16
        blo     5f

        /* align destination to half cache-line for the write-buffer */
        rsb     r3, r0, #0
        ands    r3, r3, #0xF
        beq     0f

        /* copy up to 15 bytes (count in r3) */
        sub     r2, r2, r3
        movs    ip, r3, lsl #31
        ldrmib  lr, [r1], #1
        strmib  lr, [r0], #1
        ldrcsb  ip, [r1], #1
        ldrcsb  lr, [r1], #1
        strcsb  ip, [r0], #1
        strcsb  lr, [r0], #1
        movs    ip, r3, lsl #29
        bge     1f
        // copies 4 bytes, destination 32-bits aligned
        vld4.8  {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8  {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
1:      bcc     2f
        // copies 8 bytes, destination 64-bits aligned
        vld1.8  {d0}, [r1]!
        vst1.8  {d0}, [r0, :64]!
2:

0:      /* preload immediately the next cache line, which we may need */
        pld     [r1, #(CACHE_LINE_SIZE*0)]
        pld     [r1, #(CACHE_LINE_SIZE*1)]

        /* make sure we have at least 128 bytes to copy */
        subs    r2, r2, #128
        blo     2f

        /* preload all the cache lines we need.
         * NOTE: the number of pld below depends on the prefetch distance;
         * ideally we would increase the distance in the main loop to
         * avoid the goofy code below. In practice this doesn't seem to make
         * a big difference.
         */
        pld     [r1, #(CACHE_LINE_SIZE*2)]
        pld     [r1, #(CACHE_LINE_SIZE*3)]
        pld     [r1, #(CACHE_LINE_SIZE*4)]

        .align 3
1:      /* The main loop copies 128 bytes at a time */
        subs    r2, r2, #128
        vld1.8  {d0 - d3}, [r1]!
        vld1.8  {d4 - d7}, [r1]!
        pld     [r1, #(CACHE_LINE_SIZE*1)]
        pld     [r1, #(CACHE_LINE_SIZE*2)]
        vld1.8  {d16 - d19}, [r1]!
        vld1.8  {d20 - d23}, [r1]!
        pld     [r1, #(CACHE_LINE_SIZE*1)]
        pld     [r1, #(CACHE_LINE_SIZE*2)]
        vst1.8  {d0 - d3}, [r0, :128]!
        vst1.8  {d4 - d7}, [r0, :128]!
        vst1.8  {d16 - d19}, [r0, :128]!
        vst1.8  {d20 - d23}, [r0, :128]!
        bhs     1b

2:      /* fix-up the remaining count and make sure we have >= 32 bytes left */
        add     r2, r2, #128
        subs    r2, r2, #32
        blo     4f

3:      /* 32 bytes at a time. These cache lines were already preloaded */
        vld1.8  {d0 - d3}, [r1]!
        subs    r2, r2, #32
        vst1.8  {d0 - d3}, [r0, :128]!
        bhs     3b

4:      /* less than 32 left */
        add     r2, r2, #32
        tst     r2, #0x10
        beq     5f
        // copies 16 bytes, 128-bits aligned
        vld1.8  {d0, d1}, [r1]!
        vst1.8  {d0, d1}, [r0, :128]!

5:      /* copy up to 15 bytes (count in r2) */
        movs    ip, r2, lsl #29
        bcc     1f
        vld1.8  {d0}, [r1]!
        vst1.8  {d0}, [r0]!
1:      bge     2f
        vld4.8  {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8  {d0[0], d1[0], d2[0], d3[0]}, [r0]!
2:      movs    ip, r2, lsl #31
        ldrmib  r3, [r1], #1
        ldrcsb  ip, [r1], #1
        ldrcsb  lr, [r1], #1
        strmib  r3, [r0], #1
        strcsb  ip, [r0], #1
        strcsb  lr, [r0], #1

        ldmfd   sp!, {r0, lr}
        bx      lr
        .fnend
#else /* !CORTEX_CACHE_LINE_32 */

        .text
        .fpu    neon

/* a prefetch distance of 4 cache-lines works best experimentally */
#define CACHE_LINE_SIZE     64
#define PREFETCH_DISTANCE   (CACHE_LINE_SIZE*4)
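/* i.e. prefetch 4*64 = 256 bytes, four iterations of the 64-byte main
 * loop, ahead of the current read pointer */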

ENTRY(memcpy)
        .save   {r0, lr}
        stmfd   sp!, {r0, lr}

        /* start preloading as early as possible */
        pld     [r1, #(CACHE_LINE_SIZE*0)]
        pld     [r1, #(CACHE_LINE_SIZE*1)]

        /* do we have at least 16 bytes to copy (needed for alignment below) */
        cmp     r2, #16
        blo     5f

        /* align destination to half cache-line for the write-buffer */
        rsb     r3, r0, #0
        ands    r3, r3, #0xF
        beq     0f

        /* copy up to 15 bytes (count in r3) */
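        /*
         * Rough C sketch of this head-alignment step (illustration only;
         * copy2/copy4/copy8 are placeholders, not helpers in this file):
         *
         *     size_t head = -(uintptr_t)dst & 0xF;  // bytes to a 16-byte boundary
         *     len -= head;
         *     if (head & 1) *dst++ = *src++;        // lsl #31: N = bit 0
         *     if (head & 2) copy2(&dst, &src);      // lsl #31: C = bit 1
         *     if (head & 4) copy4(&dst, &src);      // lsl #29: N = bit 2
         *     if (head & 8) copy8(&dst, &src);      // lsl #29: C = bit 3
         */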
        sub     r2, r2, r3
        movs    ip, r3, lsl #31
        ldrmib  lr, [r1], #1
        strmib  lr, [r0], #1
        ldrcsb  ip, [r1], #1
        ldrcsb  lr, [r1], #1
        strcsb  ip, [r0], #1
        strcsb  lr, [r0], #1
        movs    ip, r3, lsl #29
        bge     1f
        // copies 4 bytes, destination 32-bits aligned
        vld4.8  {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8  {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
1:      bcc     2f
        // copies 8 bytes, destination 64-bits aligned
        vld1.8  {d0}, [r1]!
        vst1.8  {d0}, [r0, :64]!
2:

0:      /* preload immediately the next cache line, which we may need */
        pld     [r1, #(CACHE_LINE_SIZE*0)]
        pld     [r1, #(CACHE_LINE_SIZE*1)]

        /* make sure we have at least 64 bytes to copy */
        subs    r2, r2, #64
        blo     2f

        /* preload all the cache lines we need.
         * NOTE: the number of pld below depends on PREFETCH_DISTANCE;
         * ideally we would increase the distance in the main loop to
         * avoid the goofy code below. In practice this doesn't seem to make
         * a big difference.
         */
        pld     [r1, #(CACHE_LINE_SIZE*2)]
        pld     [r1, #(CACHE_LINE_SIZE*3)]
        pld     [r1, #(PREFETCH_DISTANCE)]

1:      /* The main loop copies 64 bytes at a time */
        vld1.8  {d0 - d3}, [r1]!
        vld1.8  {d4 - d7}, [r1]!
        pld     [r1, #(PREFETCH_DISTANCE)]
        subs    r2, r2, #64
        vst1.8  {d0 - d3}, [r0, :128]!
        vst1.8  {d4 - d7}, [r0, :128]!
        bhs     1b

2:      /* fix-up the remaining count and make sure we have >= 32 bytes left */
        add     r2, r2, #64
        subs    r2, r2, #32
        blo     4f

3:      /* 32 bytes at a time. These cache lines were already preloaded */
        vld1.8  {d0 - d3}, [r1]!
        subs    r2, r2, #32
        vst1.8  {d0 - d3}, [r0, :128]!
        bhs     3b

4:      /* less than 32 left */
        add     r2, r2, #32
        tst     r2, #0x10
        beq     5f
        // copies 16 bytes, 128-bits aligned
        vld1.8  {d0, d1}, [r1]!
        vst1.8  {d0, d1}, [r0, :128]!

5:      /* copy up to 15 bytes (count in r2) */
        movs    ip, r2, lsl #29
        bcc     1f
        vld1.8  {d0}, [r1]!
        vst1.8  {d0}, [r0]!
1:      bge     2f
        vld4.8  {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8  {d0[0], d1[0], d2[0], d3[0]}, [r0]!
2:      movs    ip, r2, lsl #31
        ldrmib  r3, [r1], #1
        ldrcsb  ip, [r1], #1
        ldrcsb  lr, [r1], #1
        strmib  r3, [r0], #1
        strcsb  ip, [r0], #1
        strcsb  lr, [r0], #1

        ldmfd   sp!, {r0, lr}
        bx      lr
END(memcpy)
#endif /* CORTEX_CACHE_LINE_32 */
#endif /* !SCORPION_NEON_OPTIMIZATION */
#else /* __ARM_ARCH__ < 7 */

        /*
         * Optimized memcpy() for ARM.
         *
         * note that memcpy() always returns the destination pointer,
         * so we have to preserve R0.
         */

ENTRY(memcpy)
        /* The stack must always be 64-bits aligned to be compliant with the
         * ARM ABI. Since we have to save R0, we might as well save R4
         * which we can use for better pipelining of the reads below
         */
        .save   {r0, r4, lr}
        stmfd   sp!, {r0, r4, lr}
        /* Making room for r5-r11 which will be spilled later */
        .pad    #28
        sub     sp, sp, #28

        // preload the destination because we'll align it to a cache line
        // with small writes. Also start the source "pump".
        PLD     (r0, #0)
        PLD     (r1, #0)
        PLD     (r1, #32)

        /* it simplifies things to take care of len<4 early */
        cmp     r2, #4
        blo     copy_last_3_and_return

        /* compute the offset to align the source
         * offset = (4-(src&3))&3 = -src & 3
         */
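        /* e.g. src&3 == 1 -> offset 3, src&3 == 2 -> offset 2,
         * src&3 == 3 -> offset 1, src&3 == 0 -> offset 0 (already aligned)
         */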
        rsb     r3, r1, #0
        ands    r3, r3, #3
        beq     src_aligned

        /* align source to 32 bits. We need to insert 2 instructions between
         * a ldr[b|h] and str[b|h] because byte and half-word instructions
         * stall 2 cycles.
         */
        movs    r12, r3, lsl #31
        sub     r2, r2, r3              /* we know that r3 <= r2 because r2 >= 4 */
        ldrmib  r3, [r1], #1
        ldrcsb  r4, [r1], #1
        ldrcsb  r12,[r1], #1
        strmib  r3, [r0], #1
        strcsb  r4, [r0], #1
        strcsb  r12,[r0], #1

src_aligned:

        /* see if src and dst are aligned together (congruent) */
        eor     r12, r0, r1
        tst     r12, #3
        bne     non_congruent

        /* Use post-increment mode for stm to spill r5-r11 to the reserved stack
         * frame. Don't update sp.
         */
        stmea   sp, {r5-r11}

        /* align the destination to a cache-line */
        rsb     r3, r0, #0
        ands    r3, r3, #0x1C
        beq     congruent_aligned32
        cmp     r3, r2
        andhi   r3, r2, #0x1C

        /* conditionally copies 0 to 7 words (length in r3) */
Mathias Agopianee223d02009-09-27 17:46:43 -0700453 movs r12, r3, lsl #28
The Android Open Source Project1dc9e472009-03-03 19:28:35 -0800454 ldmcsia r1!, {r4, r5, r6, r7} /* 16 bytes */
455 ldmmiia r1!, {r8, r9} /* 8 bytes */
456 stmcsia r0!, {r4, r5, r6, r7}
457 stmmiia r0!, {r8, r9}
458 tst r3, #0x4
459 ldrne r10,[r1], #4 /* 4 bytes */
460 strne r10,[r0], #4
461 sub r2, r2, r3
462
463congruent_aligned32:
464 /*
465 * here source is aligned to 32 bytes.
466 */
467
468cached_aligned32:
469 subs r2, r2, #32
470 blo less_than_32_left
471
472 /*
473 * We preload a cache-line up to 64 bytes ahead. On the 926, this will
Mathias Agopianee223d02009-09-27 17:46:43 -0700474 * stall only until the requested world is fetched, but the linefill
The Android Open Source Project1dc9e472009-03-03 19:28:35 -0800475 * continues in the the background.
476 * While the linefill is going, we write our previous cache-line
477 * into the write-buffer (which should have some free space).
478 * When the linefill is done, the writebuffer will
479 * start dumping its content into memory
480 *
481 * While all this is going, we then load a full cache line into
482 * 8 registers, this cache line should be in the cache by now
483 * (or partly in the cache).
484 *
485 * This code should work well regardless of the source/dest alignment.
486 *
487 */
488
489 // Align the preload register to a cache-line because the cpu does
490 // "critical word first" (the first word requested is loaded first).
491 bic r12, r1, #0x1F
492 add r12, r12, #64
493
4941: ldmia r1!, { r4-r11 }
495 PLD (r12, #64)
496 subs r2, r2, #32
497
498 // NOTE: if r12 is more than 64 ahead of r1, the following ldrhi
499 // for ARM9 preload will not be safely guarded by the preceding subs.
Mathias Agopianee223d02009-09-27 17:46:43 -0700500 // When it is safely guarded the only possibility to have SIGSEGV here
The Android Open Source Project1dc9e472009-03-03 19:28:35 -0800501 // is because the caller overstates the length.
502 ldrhi r3, [r12], #32 /* cheap ARM9 preload */
503 stmia r0!, { r4-r11 }
504 bhs 1b
Mathias Agopianee223d02009-09-27 17:46:43 -0700505
The Android Open Source Project1dc9e472009-03-03 19:28:35 -0800506 add r2, r2, #32
507
508
509
510
511less_than_32_left:
Mathias Agopianee223d02009-09-27 17:46:43 -0700512 /*
The Android Open Source Project1dc9e472009-03-03 19:28:35 -0800513 * less than 32 bytes left at this point (length in r2)
514 */
515
516 /* skip all this if there is nothing to do, which should
517 * be a common case (if not executed the code below takes
518 * about 16 cycles)
519 */
520 tst r2, #0x1F
521 beq 1f
522
        /* conditionally copies 0 to 31 bytes */
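        /* "movs r12, r2, lsl #28" moves bit 4 of the count into C and bit 3
         * into N (16- and 8-byte chunks); "movs r12, r2, lsl #30" does the
         * same for bits 2 and 1, and "tst r2, #0x1" covers the last byte.
         */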
        movs    r12, r2, lsl #28
        ldmcsia r1!, {r4, r5, r6, r7}   /* 16 bytes */
        ldmmiia r1!, {r8, r9}           /*  8 bytes */
        stmcsia r0!, {r4, r5, r6, r7}
        stmmiia r0!, {r8, r9}
        movs    r12, r2, lsl #30
        ldrcs   r3, [r1], #4            /*  4 bytes */
        ldrmih  r4, [r1], #2            /*  2 bytes */
        strcs   r3, [r0], #4
        strmih  r4, [r0], #2
        tst     r2, #0x1
        ldrneb  r3, [r1]                /* last byte  */
        strneb  r3, [r0]

        /* we're done! restore everything and return */
1:      ldmfd   sp!, {r5-r11}
        ldmfd   sp!, {r0, r4, lr}
        bx      lr

        /********************************************************************/

non_congruent:
        /*
         * here source is aligned to 4 bytes
         * but destination is not.
         *
         * in the code below r2 is the number of bytes read
         * (the number of bytes written is always smaller, because we have
         * partial words in the shift queue)
         */
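        /*
         * Rough C sketch of the merge ("shift queue") used below; this is an
         * illustration only (little-endian), with `right` = r12, `left` = lr
         * (= 32 - right, and right is 8, 16 or 24 here) and `carry` = r3:
         *
         *     uint32_t carry = first_word >> right;  // bytes left over after
         *                                            // the dest-aligning stores
         *     while (len >= 4) {
         *         uint32_t w = *src++;               // aligned 32-bit load
         *         *dst++ = carry | (w << left);      // emit one full word
         *         carry  = w >> right;               // keep the leftover bytes
         *         len   -= 4;
         *     }
         */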
        cmp     r2, #4
        blo     copy_last_3_and_return

        /* Use post-increment mode for stm to spill r5-r11 to the reserved stack
         * frame. Don't update sp.
         */
        stmea   sp, {r5-r11}

        /* compute shifts needed to align src to dest */
        rsb     r5, r0, #0
        and     r5, r5, #3              /* r5 = # bytes in partial words */
        mov     r12, r5, lsl #3         /* r12 = right */
        rsb     lr, r12, #32            /* lr = left  */

        /* read the first word */
        ldr     r3, [r1], #4
        sub     r2, r2, #4

        /* write a partial word (0 to 3 bytes), such that destination
         * becomes aligned to 32 bits (r5 = number of bytes to copy for alignment)
         */
        movs    r5, r5, lsl #31
        strmib  r3, [r0], #1
        movmi   r3, r3, lsr #8
        strcsb  r3, [r0], #1
        movcs   r3, r3, lsr #8
        strcsb  r3, [r0], #1
        movcs   r3, r3, lsr #8

        cmp     r2, #4
        blo     partial_word_tail
Mathias Agopianee223d02009-09-27 17:46:43 -0700585
The Android Open Source Project1dc9e472009-03-03 19:28:35 -0800586 /* Align destination to 32 bytes (cache line boundary) */
5871: tst r0, #0x1c
588 beq 2f
589 ldr r5, [r1], #4
590 sub r2, r2, #4
591 orr r4, r3, r5, lsl lr
592 mov r3, r5, lsr r12
593 str r4, [r0], #4
594 cmp r2, #4
595 bhs 1b
596 blo partial_word_tail
597
598 /* copy 32 bytes at a time */
5992: subs r2, r2, #32
600 blo less_than_thirtytwo
601
602 /* Use immediate mode for the shifts, because there is an extra cycle
603 * for register shifts, which could account for up to 50% of
604 * performance hit.
605 */
606
607 cmp r12, #24
608 beq loop24
609 cmp r12, #8
610 beq loop8
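        /* loop16, loop8 and loop24 are the same merge loop specialized for
         * right-shift amounts of 16, 8 and 24 bits respectively, so the
         * shifts can use immediates.
         */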

loop16:
        ldr     r12, [r1], #4
1:      mov     r4, r12
        ldmia   r1!, { r5,r6,r7, r8,r9,r10,r11}
        PLD     (r1, #64)
        subs    r2, r2, #32
        ldrhs   r12, [r1], #4
        orr     r3, r3, r4, lsl #16
        mov     r4, r4, lsr #16
        orr     r4, r4, r5, lsl #16
        mov     r5, r5, lsr #16
        orr     r5, r5, r6, lsl #16
        mov     r6, r6, lsr #16
        orr     r6, r6, r7, lsl #16
        mov     r7, r7, lsr #16
        orr     r7, r7, r8, lsl #16
        mov     r8, r8, lsr #16
        orr     r8, r8, r9, lsl #16
        mov     r9, r9, lsr #16
        orr     r9, r9, r10, lsl #16
        mov     r10, r10, lsr #16
        orr     r10, r10, r11, lsl #16
        stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
        mov     r3, r11, lsr #16
        bhs     1b
        b       less_than_thirtytwo

loop8:
        ldr     r12, [r1], #4
1:      mov     r4, r12
        ldmia   r1!, { r5,r6,r7, r8,r9,r10,r11}
        PLD     (r1, #64)
        subs    r2, r2, #32
        ldrhs   r12, [r1], #4
        orr     r3, r3, r4, lsl #24
        mov     r4, r4, lsr #8
        orr     r4, r4, r5, lsl #24
        mov     r5, r5, lsr #8
        orr     r5, r5, r6, lsl #24
        mov     r6, r6, lsr #8
        orr     r6, r6, r7, lsl #24
        mov     r7, r7, lsr #8
        orr     r7, r7, r8, lsl #24
        mov     r8, r8, lsr #8
        orr     r8, r8, r9, lsl #24
        mov     r9, r9, lsr #8
        orr     r9, r9, r10, lsl #24
        mov     r10, r10, lsr #8
        orr     r10, r10, r11, lsl #24
        stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
        mov     r3, r11, lsr #8
        bhs     1b
        b       less_than_thirtytwo

loop24:
        ldr     r12, [r1], #4
1:      mov     r4, r12
        ldmia   r1!, { r5,r6,r7, r8,r9,r10,r11}
        PLD     (r1, #64)
        subs    r2, r2, #32
        ldrhs   r12, [r1], #4
        orr     r3, r3, r4, lsl #8
        mov     r4, r4, lsr #24
        orr     r4, r4, r5, lsl #8
        mov     r5, r5, lsr #24
        orr     r5, r5, r6, lsl #8
        mov     r6, r6, lsr #24
        orr     r6, r6, r7, lsl #8
        mov     r7, r7, lsr #24
        orr     r7, r7, r8, lsl #8
        mov     r8, r8, lsr #24
        orr     r8, r8, r9, lsl #8
        mov     r9, r9, lsr #24
        orr     r9, r9, r10, lsl #8
        mov     r10, r10, lsr #24
        orr     r10, r10, r11, lsl #8
        stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
        mov     r3, r11, lsr #24
        bhs     1b

less_than_thirtytwo:
        /* copy the last 0 to 31 bytes of the source */
        rsb     r12, lr, #32            /* we corrupted r12, recompute it  */
        add     r2, r2, #32
        cmp     r2, #4
        blo     partial_word_tail

1:      ldr     r5, [r1], #4
        sub     r2, r2, #4
        orr     r4, r3, r5, lsl lr
        mov     r3, r5, lsr r12
        str     r4, [r0], #4
        cmp     r2, #4
        bhs     1b

partial_word_tail:
        /* we have a partial word in the input buffer */
        movs    r5, lr, lsl #(31-3)
        strmib  r3, [r0], #1
        movmi   r3, r3, lsr #8
        strcsb  r3, [r0], #1
        movcs   r3, r3, lsr #8
        strcsb  r3, [r0], #1

        /* Refill spilled registers from the stack. Don't update sp. */
        ldmfd   sp, {r5-r11}

copy_last_3_and_return:
        movs    r2, r2, lsl #31         /* copy remaining 0, 1, 2 or 3 bytes */
        ldrmib  r2, [r1], #1
        ldrcsb  r3, [r1], #1
        ldrcsb  r12,[r1]
        strmib  r2, [r0], #1
        strcsb  r3, [r0], #1
        strcsb  r12,[r0]

        /* we're done! restore sp and spilled registers and return */
        add     sp, sp, #28
        ldmfd   sp!, {r0, r4, lr}
        bx      lr
END(memcpy)


#endif /* __ARM_ARCH__ < 7 */