Blame - arch/xtensa/lib/memcopy.S - android_kernel_htc_msm8960

blob: ea59dcd038666da7af3004c68da1eb7d9964d9dd [file] [log] [blame]

Chris Zankel	249ac17	2005-06-23 22:01:20 -0700	[diff] [blame]	1	/*
				2	* arch/xtensa/lib/hal/memcopy.S -- Core HAL library functions
				3	* xthal_memcpy and xthal_bcopy
				4	*
				5	* This file is subject to the terms and conditions of the GNU General Public
				6	* License. See the file "COPYING" in the main directory of this archive
				7	* for more details.
				8	*
				9	* Copyright (C) 2002 - 2005 Tensilica Inc.
				10	*/
				11
Chris Zankel	367b811	2008-11-06 06:40:46 -0800	[diff] [blame]	12	#include <variant/core.h>
Chris Zankel	249ac17	2005-06-23 22:01:20 -0700	[diff] [blame]	13
				14	.macro src_b r, w0, w1
				15	#ifdef __XTENSA_EB__
				16	src \r, \w0, \w1
				17	#else
				18	src \r, \w1, \w0
				19	#endif
				20	.endm
				21
				22	.macro ssa8 r
				23	#ifdef __XTENSA_EB__
				24	ssa8b \r
				25	#else
				26	ssa8l \r
				27	#endif
				28	.endm
				29
				30
				31	/*
				32	* void *memcpy(void *dst, const void *src, size_t len);
				33	* void *memmove(void *dst, const void *src, size_t len);
				34	* void *bcopy(const void *src, void *dst, size_t len);
				35	*
				36	* This function is intended to do the same thing as the standard
				37	* library function memcpy() (or bcopy()) for most cases.
				38	* However, where the source and/or destination references
				39	* an instruction RAM or ROM or a data RAM or ROM, that
				40	* source and/or destination will always be accessed with
				41	* 32-bit load and store instructions (as required for these
				42	* types of devices).
				43	*
				44	* !!!!!!! XTFIXME:
				45	* !!!!!!! Handling of IRAM/IROM has not yet
				46	* !!!!!!! been implemented.
				47	*
				48	* The bcopy version is provided here to avoid the overhead
				49	* of an extra call, for callers that require this convention.
				50	*
				51	* The (general case) algorithm is as follows:
				52	* If destination is unaligned, align it by conditionally
				53	* copying 1 and 2 bytes.
				54	* If source is aligned,
				55	* do 16 bytes with a loop, and then finish up with
				56	* 8, 4, 2, and 1 byte copies conditional on the length;
				57	* else (if source is unaligned),
				58	* do the same, but use SRC to align the source data.
				59	* This code tries to use fall-through branches for the common
				60	* case of aligned source and destination and multiple
				61	* of 4 (or 8) length.
				62	*
				63	* Register use:
				64	* a0/ return address
				65	* a1/ stack pointer
				66	* a2/ return value
				67	* a3/ src
				68	* a4/ length
				69	* a5/ dst
				70	* a6/ tmp
				71	* a7/ tmp
				72	* a8/ tmp
				73	* a9/ tmp
				74	* a10/ tmp
				75	* a11/ tmp
				76	*/
				77
				78	.text
				79	.align 4
				80	.global bcopy
				81	.type bcopy,@function
					/*
					 * bcopy(src, dst, len)
					 *
					 * In:  a2 = src, a3 = dst, a4 = len (note: src/dst swapped relative
					 *      to memcpy).
					 * Out: a2 = dst.
					 * Uses a3, a5, a6 here; the common copy code uses further temporaries.
					 *
					 * The arguments are shuffled into the layout expected at .Lcommon
					 * (a2 = dst, a3 = src, a4 = len, a5 = working dst). Because the
					 * common code copies strictly forward, buffers that overlap with
					 * src < dst < src + len are handled right here with a backward
					 * byte-by-byte copy instead; every other case goes to .Lcommon.
					 */
				82	bcopy:
				83	entry sp, 16 # minimal stack frame
				84	# a2=src, a3=dst, a4=len
				85	mov a5, a3 # a5 = dst (working destination pointer)
				86	mov a3, a2 # a3 = src
				87	mov a2, a5 # a2 = dst, left in the return-value register
					sub a6, a5, a3 # a6 = dst - src as an unsigned distance
					bltu a6, a4, .Lbcopyback # dst lies inside [src, src+len): copy backward
				88	j .Lcommon # go to common code for memcpy+bcopy
					/*
					 * Backward copy for overlapping buffers. a4 is non-zero here,
					 * because the unsigned test above can never succeed for len == 0.
					 */
					.Lbcopyback:
					add a3, a3, a4 # a3 = one past the last source byte
					add a5, a5, a4 # a5 = one past the last destination byte
					#if XCHAL_HAVE_LOOPS
					loopnez a4, .Lbcopybackdone
					#endif /* XCHAL_HAVE_LOOPS */
					.Lbcopybacknext:
					addi a3, a3, -1
					l8ui a6, a3, 0
					addi a5, a5, -1
					s8i a6, a5, 0
					#if !XCHAL_HAVE_LOOPS
					bne a5, a2, .Lbcopybacknext # until a5 is back at the start of dst
					#endif /* !XCHAL_HAVE_LOOPS */
					.Lbcopybackdone:
					retw
				89
				90
				91	/*
				92	* Byte by byte copy
					*
					* In:  a3 = src, a5 = dst, a4 = number of bytes (may be zero).
					* Uses a6 as the data temporary and, without hardware loops, a7 as
					* the source end address. Returns to the caller with retw.
					*
					* The software loop terminates on pointer equality (bne). a3 moves
					* in steps of one and therefore always reaches a7 exactly, whereas a
					* signed comparison of two addresses gives the wrong answer when the
					* copied range crosses the 0x7fffffff/0x80000000 boundary.
				93	*/
				94	.align 4
				95	.byte 0 # 1 mod 4 alignment for LOOPNEZ
				96	# (0 mod 4 alignment for LBEG)
				97	.Lbytecopy:
				98	#if XCHAL_HAVE_LOOPS
				99	loopnez a4, .Lbytecopydone
				100	#else /* !XCHAL_HAVE_LOOPS */
				101	beqz a4, .Lbytecopydone
				102	add a7, a3, a4 # a7 = end address for source
				103	#endif /* !XCHAL_HAVE_LOOPS */
				104	.Lnextbyte:
				105	l8ui a6, a3, 0
				106	addi a3, a3, 1
				107	s8i a6, a5, 0
				108	addi a5, a5, 1
				109	#if !XCHAL_HAVE_LOOPS
				110	bne a3, a7, .Lnextbyte # continue until src reaches the end address
				111	#endif /* !XCHAL_HAVE_LOOPS */
				112	.Lbytecopydone:
				113	retw
				114
				115	/*
				116	* Destination is unaligned
					*
					* Entered from .Lcommon when bit 0 (.Ldst1mod2) or bit 1 (.Ldst2mod4)
					* of the destination address is set. On entry a3 = src, a5 = dst,
					* a4 = remaining length. One and/or two bytes are copied, with a3, a5
					* and a4 updated to match, and control then continues at .Ldstaligned.
					* Copies shorter than the thresholds tested below are handed to
					* .Lbytecopy, which also performs the return.
				117	*/
				118
				119	.align 4
				120	.Ldst1mod2: # dst is only byte aligned
				121	_bltui a4, 7, .Lbytecopy # do short copies byte by byte
				122
				123	# copy 1 byte
				124	l8ui a6, a3, 0 # a6 = byte at src
				125	addi a3, a3, 1
				126	addi a4, a4, -1 # one byte less to copy
				127	s8i a6, a5, 0 # store it at dst
				128	addi a5, a5, 1
				129	_bbci.l a5, 1, .Ldstaligned # if dst is now aligned, then
				130	# return to main algorithm
					/* otherwise bit 1 of dst is set: fall through and copy two more bytes */
				131	.Ldst2mod4: # dst 16-bit aligned
				132	# copy 2 bytes
				133	_bltui a4, 6, .Lbytecopy # do short copies byte by byte
				134	l8ui a6, a3, 0 # both bytes are loaded with byte loads
				135	l8ui a7, a3, 1
				136	addi a3, a3, 2
				137	addi a4, a4, -2 # two bytes less to copy
				138	s8i a6, a5, 0
				139	s8i a7, a5, 1
				140	addi a5, a5, 2
				141	j .Ldstaligned # dst is now aligned, return to main algorithm
				142
				143	.align 4
					/*
					 * memmove(dst, src, len)
					 *
					 * In:  a2 = dst, a3 = src, a4 = len.
					 * Out: a2 = dst.
					 *
					 * The shared copy code at .Lcommon walks both buffers forward,
					 * which overwrites not-yet-read source bytes when
					 * src < dst < src + len. That case is detected with one unsigned
					 * compare of (dst - src) against len and handled by a backward
					 * byte-by-byte copy. All other cases, including len == 0, use
					 * the forward code.
					 *
					 * This entry point is placed ahead of memcpy so that the code
					 * between .Lcommon and .Lsrcunaligned does not grow.
					 */
					.global memmove
					.type memmove,@function
					memmove:
					entry sp, 16 # minimal stack frame
					# a2/ dst, a3/ src, a4/ len
					mov a5, a2 # copy dst so that a2 is return value
					sub a6, a2, a3 # a6 = dst - src as an unsigned distance
					bgeu a6, a4, .Lcommon # forward copy cannot clobber unread source
					/* overlapping with dst above src: a4 is non-zero on this path */
					add a3, a3, a4 # a3 = one past the last source byte
					add a5, a5, a4 # a5 = one past the last destination byte
					#if XCHAL_HAVE_LOOPS
					loopnez a4, .Lmovebackdone
					#endif /* XCHAL_HAVE_LOOPS */
					.Lmovebacknext:
					addi a3, a3, -1
					l8ui a6, a3, 0
					addi a5, a5, -1
					s8i a6, a5, 0
					#if !XCHAL_HAVE_LOOPS
					bne a5, a2, .Lmovebacknext # until a5 is back at the start of dst
					#endif /* !XCHAL_HAVE_LOOPS */
					.Lmovebackdone:
					retw

					/*
					 * memcpy(dst, src, len)
					 *
					 * In:  a2 = dst, a3 = src, a4 = len.
					 * Out: a2 = dst.
					 * Copies forward only.
					 *
					 * .Lcommon is the shared entry used by bcopy and memmove as well;
					 * it expects a2 = dst, a3 = src, a4 = len and a5 = working copy
					 * of dst.
					 */
					.align 4
				144	.global memcpy
				145	.type memcpy,@function
				146	memcpy:
				150
				151	entry sp, 16 # minimal stack frame
				152	# a2/ dst, a3/ src, a4/ len
				153	mov a5, a2 # copy dst so that a2 is return value
				154	.Lcommon:
				155	_bbsi.l a2, 0, .Ldst1mod2 # if dst is 1 mod 2
				156	_bbsi.l a2, 1, .Ldst2mod4 # if dst is 2 mod 4
				157	.Ldstaligned: # return here from .Ldst?mod? once dst is aligned
				158	srli a7, a4, 4 # number of loop iterations with 16B
				159	# per iteration
				160	movi a8, 3 # if source is not aligned,
				161	_bany a3, a8, .Lsrcunaligned # then use shifting copy
				162	/*
				163	* Destination and source are word-aligned, use word copy.
				164	*/
				165	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
				166	#if XCHAL_HAVE_LOOPS
				167	loopnez a7, .Loop1done
				168	#else /* !XCHAL_HAVE_LOOPS */
				169	beqz a7, .Loop1done
				170	slli a8, a7, 4
				171	add a8, a8, a3 # a8 = end of last 16B source chunk
				172	#endif /* !XCHAL_HAVE_LOOPS */
				173	.Loop1:
				174	l32i a6, a3, 0
				175	l32i a7, a3, 4
				176	s32i a6, a5, 0
				177	l32i a6, a3, 8
				178	s32i a7, a5, 4
				179	l32i a7, a3, 12
				180	s32i a6, a5, 8
				181	addi a3, a3, 16
				182	s32i a7, a5, 12
				183	addi a5, a5, 16
				184	#if !XCHAL_HAVE_LOOPS
					/*
					 * Terminate on equality: a3 advances in steps of 16 from the value
					 * a8 was derived from, so it reaches a8 exactly. A signed
					 * comparison of addresses would end the loop early when the range
					 * crosses 0x80000000.
					 */
				185	bne a3, a8, .Loop1
				186	#endif /* !XCHAL_HAVE_LOOPS */
				187	.Loop1done:
					/* bits 3..0 of the length select the 8, 4, 2 and 1 byte tail copies */
				188	bbci.l a4, 3, .L2
				189	# copy 8 bytes
				190	l32i a6, a3, 0
				191	l32i a7, a3, 4
				192	addi a3, a3, 8
				193	s32i a6, a5, 0
				194	s32i a7, a5, 4
				195	addi a5, a5, 8
				196	.L2:
				197	bbsi.l a4, 2, .L3
				198	bbsi.l a4, 1, .L4
				199	bbsi.l a4, 0, .L5
				200	retw
				201	.L3:
				202	# copy 4 bytes
				203	l32i a6, a3, 0
				204	addi a3, a3, 4
				205	s32i a6, a5, 0
				206	addi a5, a5, 4
				207	bbsi.l a4, 1, .L4
				208	bbsi.l a4, 0, .L5
				209	retw
				210	.L4:
				211	# copy 2 bytes
				212	l16ui a6, a3, 0
				213	addi a3, a3, 2
				214	s16i a6, a5, 0
				215	addi a5, a5, 2
				216	bbsi.l a4, 0, .L5
				217	retw
				218	.L5:
				219	# copy 1 byte
				220	l8ui a6, a3, 0
				221	s8i a6, a5, 0
				222	retw
				223
				224	/*
				225	* Destination is aligned, Source is unaligned
					*
					* Entered from .Ldstaligned with a3 = src (low two bits not both
					* zero), a5 = dst, a4 = remaining length, a7 = number of 16-byte
					* iterations and a8 = 3.
					*
					* Source data is fetched with 32-bit loads and each pair of adjacent
					* words is combined with src_b before being stored, so a6 always
					* carries the most recently loaded source word into the next step.
					* While the word loads are in use, a3 is rounded down to a word
					* boundary and the removed offset is kept in a11; it is added back
					* before the trailing 2- and 1-byte copies, which use byte loads.
				226	*/
				227
				228	.align 4
				229	.Lsrcunaligned:
				230	_beqz a4, .Ldone # avoid loading anything for zero-length copies
				231	# copy 16 bytes per iteration for word-aligned dst and unaligned src
				232	ssa8 a3 # set shift amount from byte offset
				233	#define SIM_CHECKS_ALIGNMENT 1 /* set to 1 when running on ISS (simulator) with the
				234	lint or ferret client, or 0 to save a few cycles */
				235	#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
				236	and a11, a3, a8 # save unalignment offset for below
				237	sub a3, a3, a11 # align a3
				238	#endif
				239	l32i a6, a3, 0 # load first word
				240	#if XCHAL_HAVE_LOOPS
				241	loopnez a7, .Loop2done
				242	#else /* !XCHAL_HAVE_LOOPS */
				243	beqz a7, .Loop2done
				244	slli a10, a7, 4
				245	add a10, a10, a3 # a10 = end of last 16B source chunk
				246	#endif /* !XCHAL_HAVE_LOOPS */
				247	.Loop2:
				248	l32i a7, a3, 4
				249	l32i a8, a3, 8
				250	src_b a6, a6, a7
				251	s32i a6, a5, 0
				252	l32i a9, a3, 12
				253	src_b a7, a7, a8
				254	s32i a7, a5, 4
				255	l32i a6, a3, 16 # becomes the first word of the next step
				256	src_b a8, a8, a9
				257	s32i a8, a5, 8
				258	addi a3, a3, 16
				259	src_b a9, a9, a6
				260	s32i a9, a5, 12
				261	addi a5, a5, 16
				262	#if !XCHAL_HAVE_LOOPS
					/*
					 * Terminate on equality: a3 advances in steps of 16 from the value
					 * a10 was derived from, so it reaches a10 exactly. A signed
					 * comparison of addresses would end the loop early when the range
					 * crosses 0x80000000.
					 */
				263	bne a3, a10, .Loop2
				264	#endif /* !XCHAL_HAVE_LOOPS */
				265	.Loop2done:
				266	bbci.l a4, 3, .L12
				267	# copy 8 bytes
				268	l32i a7, a3, 4
				269	l32i a8, a3, 8
				270	src_b a6, a6, a7
				271	s32i a6, a5, 0
				272	addi a3, a3, 8
				273	src_b a7, a7, a8
				274	s32i a7, a5, 4
				275	addi a5, a5, 8
				276	mov a6, a8 # keep the last loaded word for the next step
				277	.L12:
				278	bbci.l a4, 2, .L13
				279	# copy 4 bytes
				280	l32i a7, a3, 4
				281	addi a3, a3, 4
				282	src_b a6, a6, a7
				283	s32i a6, a5, 0
				284	addi a5, a5, 4
				285	mov a6, a7
				286	.L13:
				287	#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
				288	add a3, a3, a11 # readjust a3 with correct misalignment
				289	#endif
				290	bbsi.l a4, 1, .L14
				291	bbsi.l a4, 0, .L15
				292	.Ldone: retw
				293	.L14:
				294	# copy 2 bytes
				295	l8ui a6, a3, 0
				296	l8ui a7, a3, 1
				297	addi a3, a3, 2
				298	s8i a6, a5, 0
				299	s8i a7, a5, 1
				300	addi a5, a5, 2
				301	bbsi.l a4, 0, .L15
				302	retw
				303	.L15:
				304	# copy 1 byte
				305	l8ui a6, a3, 0
				306	s8i a6, a5, 0
				307	retw
				308
				309	/*
				310	* Local Variables:
				311	* mode:fundamental
				312	* comment-start: "# "
				313	* comment-start-skip: "# *"
				314	* End:
				315	*/