Blame - arch/tile/lib/memcpy_32.S - android_kernel_oneplus_msm8996

blob: 30c3b7ebb55d54a7017657034e30f74c00c506e0 [file] [log] [blame]

Chris Metcalf	867e359	2010-05-28 23:09:12 -0400	[diff] [blame]	1	/*
				2	* Copyright 2010 Tilera Corporation. All Rights Reserved.
				3	*
				4	* This program is free software; you can redistribute it and/or
				5	* modify it under the terms of the GNU General Public License
				6	* as published by the Free Software Foundation, version 2.
				7	*
				8	* This program is distributed in the hope that it will be useful, but
				9	* WITHOUT ANY WARRANTY; without even the implied warranty of
				10	* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
				11	* NON INFRINGEMENT. See the GNU General Public License for
				12	* more details.
				13	*
				14	* This file shares the implementation of the userspace memcpy and
				15	* the kernel's memcpy, copy_to_user and copy_from_user.
				16	*/
				17
				18	#include <arch/chip.h>
				19
Chris Metcalf	867e359	2010-05-28 23:09:12 -0400	[diff] [blame]	20
				21	#include <linux/linkage.h>
				22
				23	/* On TILE64, we wrap these functions via arch/tile/lib/memcpy_tile64.c */
				24	#if !CHIP_HAS_COHERENT_LOCAL_CACHE()
				25	#define memcpy __memcpy_asm
				26	#define __copy_to_user_inatomic __copy_to_user_inatomic_asm
				27	#define __copy_from_user_inatomic __copy_from_user_inatomic_asm
				28	#define __copy_from_user_zeroing __copy_from_user_zeroing_asm
				29	#endif
				30	/* Variant tags passed to memcpy_common in r29. The fixup code tells them apart by testing r29 for zero, for negative, and for its low bit. */
				31	#define IS_MEMCPY 0
				32	#define IS_COPY_FROM_USER 1
				33	#define IS_COPY_FROM_USER_ZEROING 2
				34	#define IS_COPY_TO_USER -1
				35
				36	.section .text.memcpy_common, "ax"
				37	.align 64
				38
				39	/* Use this to preface each bundle that can cause an exception so
				40	* the kernel can clean up properly. Written as "EX:", it emits an
				41	* __ex_table entry pairing the bundle's address (local label 9) with
				42	* memcpy_common_fixup. The special cleanup code should not use it. */
				43	#define EX \
				44	.pushsection __ex_table, "a"; \
				45	.word 9f, memcpy_common_fixup; \
				46	.popsection; \
				47	9
				48
				49
				50	/* __copy_from_user_inatomic takes the kernel target address in r0,
				51	* the user source in r1, and the bytes to copy in r2.
				52	* It returns the number of uncopiable bytes (hopefully zero) in r0.
				53	* It only tags the request in r29 and jumps to memcpy_common. */
				54	ENTRY(__copy_from_user_inatomic)
				55	.type __copy_from_user_inatomic, @function
				56	FEEDBACK_ENTER_EXPLICIT(__copy_from_user_inatomic, \
				57	.text.memcpy_common, \
				58	.Lend_memcpy_common - __copy_from_user_inatomic)
				59	{ movei r29, IS_COPY_FROM_USER; j memcpy_common } /* on a fault the uncopied target bytes are not zeroed */
				60	.size __copy_from_user_inatomic, . - __copy_from_user_inatomic
				61
				62	/* __copy_from_user_zeroing is like __copy_from_user_inatomic, but
				63	* any uncopiable bytes are zeroed in the target.
				64	* It enters memcpy_common with r29 = IS_COPY_FROM_USER_ZEROING. */
				65	ENTRY(__copy_from_user_zeroing)
				66	.type __copy_from_user_zeroing, @function
				67	FEEDBACK_REENTER(__copy_from_user_inatomic)
				68	{ movei r29, IS_COPY_FROM_USER_ZEROING; j memcpy_common }
				69	.size __copy_from_user_zeroing, . - __copy_from_user_zeroing
				70
				71	/* __copy_to_user_inatomic takes the user target address in r0,
				72	* the kernel source in r1, and the bytes to copy in r2.
				73	* It returns the number of uncopiable bytes (hopefully zero) in r0.
				74	* It enters memcpy_common with r29 = IS_COPY_TO_USER (negative). */
				75	ENTRY(__copy_to_user_inatomic)
				76	.type __copy_to_user_inatomic, @function
				77	FEEDBACK_REENTER(__copy_from_user_inatomic)
				78	{ movei r29, IS_COPY_TO_USER; j memcpy_common }
				79	.size __copy_to_user_inatomic, . - __copy_to_user_inatomic
				80	/* memcpy takes the dest in r0, the source in r1 and the size in r2, and returns the original dest in r0. */
				81	ENTRY(memcpy)
				82	.type memcpy, @function
				83	FEEDBACK_REENTER(__copy_from_user_inatomic)
				84	{ movei r29, IS_MEMCPY }
				85	.size memcpy, . - memcpy
				86	/* Fall through into memcpy_common, which follows with r29 = IS_MEMCPY. */
				87
				88	.type memcpy_common, @function
				89	memcpy_common:
				90	/* On entry, r29 holds one of the IS_* macro values from above. */
				91	/* Saved below for the fault fixup and the return: r23 = dest, r24 = source,
				92	* r25 = size, r27 = lr (lr is also stored at the address held in sp). */
				93	/* r0 is the dest, r1 is the source, r2 is the size. */
				94
				95	/* Save aside original dest so we can return it at the end; r4 = dest | source. */
				96	{ sw sp, lr; move r23, r0; or r4, r0, r1 }
				97
				98	/* Check for an empty size; r4 = low two bits of (dest | source). */
				99	{ bz r2, .Ldone; andi r4, r4, 3 }
				100
				101	/* Save aside original values in case of a fault. */
				102	{ move r24, r1; move r25, r2 }
				103	move r27, lr
				104
				105	/* Check for an unaligned source or dest; also set r4 = size - 256. */
				106	{ bnz r4, .Lcopy_unaligned_maybe_many; addli r4, r2, -256 }
				107
				108	.Lcheck_aligned_copy_size:
				109	/* If we are copying < 256 bytes, branch to simple case with r8 = (r2 < 8). */
				110	{ blzt r4, .Lcopy_8_check; slti_u r8, r2, 8 }
				111
				112	/* Copying >= 256 bytes, so jump to complex prefetching loop with r6 = r1 & 63. */
				113	{ andi r6, r1, 63; j .Lcopy_many }
				114
				115	/*
				116	*
				117	* Aligned 4 byte at a time copy loop
				118	* (entered at .Lcopy_8_check with r8 = (r2 < 8); the loop runs while r8 == 0)
				119	*/
				120
				121	.Lcopy_8_loop:
				122	/* Copy two words at a time to hide load latency; r8 = (r2 < 16) before r2 drops by 8. */
				123	EX: { lw r3, r1; addi r1, r1, 4; slti_u r8, r2, 16 }
				124	EX: { lw r4, r1; addi r1, r1, 4 }
				125	EX: { sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 }
				126	EX: { sw r0, r4; addi r0, r0, 4; addi r2, r2, -4 }
				127	.Lcopy_8_check:
				128	{ bzt r8, .Lcopy_8_loop; slti_u r4, r2, 4 }
				129
				130	/* Copy odd leftover word, if any (skipped when r4 = (r2 < 4) is set). */
				131	{ bnzt r4, .Lcheck_odd_stragglers }
				132	EX: { lw r3, r1; addi r1, r1, 4 }
				133	EX: { sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 }
				134	/* Fewer than 4 bytes remain here; any that do are copied one byte at a time. */
				135	.Lcheck_odd_stragglers:
				136	{ bnz r2, .Lcopy_unaligned_few }
				137
				138	.Ldone:
				139	/* For memcpy (r29 == 0) return original dest address from r23, else zero. */
				140	{ mz r0, r29, r23; jrp lr }
				141
				142
				143	/*
				144	*
				145	* Prefetching multiple cache line copy handler (for large transfers).
				146	*
				147	*/
				148
				149	/* Copy words until r1 is cache-line-aligned (until r6 = r1 & 63 is zero). */
				150	.Lalign_loop:
				151	EX: { lw r3, r1; addi r1, r1, 4 }
				152	{ andi r6, r1, 63 }
				153	EX: { sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 }
				154	.Lcopy_many:
				155	{ bnzt r6, .Lalign_loop; addi r9, r0, 63 }
				156	/* r3 = address of the last word of source line 0; r9 = r0 rounded up to a multiple of 64. */
				157	{ addi r3, r1, 60; andi r9, r9, -64 }
				158
Chris Metcalf	c745a8a	2010-08-13 08:52:19 -0400	[diff] [blame]	159	#if CHIP_HAS_WH64()
Chris Metcalf	867e359	2010-05-28 23:09:12 -0400	[diff] [blame]	160	/* No need to prefetch dst, we'll just do the wh64
				161	* right before we copy a line.
				162	*/
				163	#endif
				164	/* Load the last word of source lines 0, 1 and 2 into r5, r6 and r7; r4 = 1 keeps the first bz r4 in the loop from exiting. */
				165	EX: { lw r5, r3; addi r3, r3, 64; movei r4, 1 }
				166	/* Intentionally stall for a few cycles to leave L2 cache alone (the branch tests the zero register, so it is not taken). */
				167	{ bnzt zero, .; move r27, lr }
				168	EX: { lw r6, r3; addi r3, r3, 64 }
				169	/* Intentionally stall for a few cycles to leave L2 cache alone. */
				170	{ bnzt zero, . }
				171	EX: { lw r7, r3; addi r3, r3, 64 }
Chris Metcalf	c745a8a	2010-08-13 08:52:19 -0400	[diff] [blame]	172	#if !CHIP_HAS_WH64()
Chris Metcalf	867e359	2010-05-28 23:09:12 -0400	[diff] [blame]	173	/* Prefetch the dest */
				174	/* Intentionally stall for a few cycles to leave L2 cache alone. */
				175	{ bnzt zero, . }
				176	/* Use a real load to cause a TLB miss if necessary. We aren't using
				177	* r28, so this should be fine.
				178	*/
				179	EX: { lw r28, r9; addi r9, r9, 64 }
				180	/* Intentionally stall for a few cycles to leave L2 cache alone. */
				181	{ bnzt zero, . }
				182	{ prefetch r9; addi r9, r9, 64 }
				183	/* Intentionally stall for a few cycles to leave L2 cache alone. */
				184	{ bnzt zero, . }
				185	{ prefetch r9; addi r9, r9, 64 }
				186	#endif
				187	/* Intentionally stall for a few cycles to leave L2 cache alone; bz on the zero register always branches, entering the loop at .Lbig_loop2. */
				188	{ bz zero, .Lbig_loop2 }
				189
				190	/* On entry to this loop:
				191	* - r0 points to the start of dst line 0
				192	* - r1 points to start of src line 0
				193	* - r2 >= (256 - 60), only the first time the loop trips.
				194	* - r3 contains r1 + 128 + 60 [pointer to end of source line 2]
				195	* This is our prefetch address. When we get near the end
				196	* rather than prefetching off the end this is changed to point
				197	* to some "safe" recently loaded address.
				198	* - r5 contains *(r1 + 60) [i.e. last word of source line 0]
				199	* - r6 contains *(r1 + 64 + 60) [i.e. last word of source line 1]
				200	* - r9 contains ((r0 + 63) & -64)
				201	* [start of next dst cache line.]
				202	*/
				203	/* r4 == 0 (as left by .Lcopy_line) means fewer than 64 bytes remain; the rest is finished at .Lcopy_8_check. */
				204	.Lbig_loop:
				205	{ jal .Lcopy_line2; add r15, r1, r2 } /* r15 = r1 + r2, which entry at .Lcopy_line would compute itself */
				206
				207	.Lbig_loop2:
				208	/* Copy line 0, first stalling until r5 is ready. */
				209	EX: { move r12, r5; lw r16, r1 }
				210	{ bz r4, .Lcopy_8_check; slti_u r8, r2, 8 }
				211	/* Prefetch several lines ahead. */
				212	EX: { lw r5, r3; addi r3, r3, 64 }
				213	{ jal .Lcopy_line }
				214
				215	/* Copy line 1, first stalling until r6 is ready. */
				216	EX: { move r12, r6; lw r16, r1 }
				217	{ bz r4, .Lcopy_8_check; slti_u r8, r2, 8 }
				218	/* Prefetch several lines ahead. */
				219	EX: { lw r6, r3; addi r3, r3, 64 }
				220	{ jal .Lcopy_line }
				221
				222	/* Copy line 2, first stalling until r7 is ready. */
				223	EX: { move r12, r7; lw r16, r1 }
				224	{ bz r4, .Lcopy_8_check; slti_u r8, r2, 8 }
				225	/* Prefetch several lines ahead. */
				226	EX: { lw r7, r3; addi r3, r3, 64 }
				227	/* Use up a caches-busy cycle by jumping back to the top of the
				228	* loop. Might as well get it out of the way now.
				229	*/
				230	{ j .Lbig_loop }
				231
				232
				233	/* On entry:
				234	* - r0 points to the destination line.
				235	* - r1 points to the source line.
				236	* - r3 is the next prefetch address.
				237	* - r9 holds the next address to use for wh64 (or for prefetch, without wh64).
				238	* - r12 = WORD_15
				239	* - r16 = WORD_0.
				240	* - r17 == r1 + 16 (computed at .Lcopy_line2 below, then replaced by WORD_4).
				241	* - r27 holds saved lr to restore.
				242	*
				243	* On exit:
				244	* - r0 is incremented by 64.
				245	* - r1 is incremented by 64, unless that would point to a word
				246	* beyond the end of the source array, in which case it is redirected
				247	* to point to an arbitrary word already in the cache.
				248	* - r2 is decremented by 64.
				249	* - r3 is unchanged, unless it points to a word beyond the
				250	* end of the source array, in which case it is redirected
				251	* to point to an arbitrary word already in the cache.
				252	* Redirecting is OK since if we are that close to the end
				253	* of the array we will not come back to this subroutine
				254	* and use the contents of the prefetched address.
				255	* - r4 is nonzero iff r2 >= 64.
				256	* - r9 is incremented by 64, unless it points beyond the
				257	* end of the last full destination cache line, in which
				258	* case it is redirected to a "safe address" that can be
				259	* clobbered (sp - 64)
				260	* - lr contains the value in r27.
				261	*/
				262	/* Entry at .Lcopy_line2 additionally expects r15 = r1 + r2 (see .Lbig_loop). */
				263	/* r26 unused */
				264
				265	.Lcopy_line:
				266	/* TODO: when r3 goes past the end, we would like to redirect it
				267	* to prefetch the last partial cache line (if any) just once, for the
				268	* benefit of the final cleanup loop. But we don't want to
				269	* prefetch that line more than once, or subsequent prefetches
				270	* will go into the RTF. But then .Lbig_loop should unconditionally
				271	* branch to top of loop to execute final prefetch, and its
				272	* nop should become a conditional branch.
				273	*/
				274
				275	/* We need two non-memory cycles here to cover the resources
				276	* used by the loads initiated by the caller.
				277	*/
				278	{ add r15, r1, r2 }
				279	.Lcopy_line2:
				280	{ slt_u r13, r3, r15; addi r17, r1, 16 }
				281
				282	/* NOTE: this will stall for one cycle as L1 is busy. */
				283
				284	/* Fill second L1D line. */
				285	EX: { lw r17, r17; addi r1, r1, 48; mvz r3, r13, r1 } /* r17 = WORD_4 */
				286
Chris Metcalf	c745a8a	2010-08-13 08:52:19 -0400	[diff] [blame]	287	#if CHIP_HAS_WH64()
Chris Metcalf	867e359	2010-05-28 23:09:12 -0400	[diff] [blame]	288	/* Prepare destination line for writing. */
				289	EX: { wh64 r9; addi r9, r9, 64 }
				290	#else
				291	/* Prefetch dest line */
				292	{ prefetch r9; addi r9, r9, 64 }
				293	#endif
				294	/* Load seven words that are L1D hits to cover wh64 L2 usage. */
				295
				296	/* Load the three remaining words from the last L1D line, which
				297	* we know has already filled the L1D.
				298	*/
				299	EX: { lw r4, r1; addi r1, r1, 4; addi r20, r1, 16 } /* r4 = WORD_12 */
				300	EX: { lw r8, r1; addi r1, r1, 4; slt_u r13, r20, r15 }/* r8 = WORD_13 */
				301	EX: { lw r11, r1; addi r1, r1, -52; mvz r20, r13, r1 } /* r11 = WORD_14 */
				302
				303	/* Load the three remaining words from the first L1D line, first
				304	* stalling until it has filled by "looking at" r16.
				305	*/
				306	EX: { lw r13, r1; addi r1, r1, 4; move zero, r16 } /* r13 = WORD_1 */
				307	EX: { lw r14, r1; addi r1, r1, 4 } /* r14 = WORD_2 */
				308	EX: { lw r15, r1; addi r1, r1, 8; addi r10, r0, 60 } /* r15 = WORD_3 */
				309
				310	/* Load second word from the second L1D line, first
				311	* stalling until it has filled by "looking at" r17.
				312	*/
				313	EX: { lw r19, r1; addi r1, r1, 4; move zero, r17 } /* r19 = WORD_5 */
				314
				315	/* Store last word to the destination line, potentially dirtying it
				316	* for the first time, which keeps the L2 busy for two cycles.
				317	*/
				318	EX: { sw r10, r12 } /* store(WORD_15) */
				319
				320	/* Use two L1D hits to cover the sw L2 access above. */
				321	EX: { lw r10, r1; addi r1, r1, 4 } /* r10 = WORD_6 */
				322	EX: { lw r12, r1; addi r1, r1, 4 } /* r12 = WORD_7 */
				323
				324	/* Fill third L1D line. */
				325	EX: { lw r18, r1; addi r1, r1, 4 } /* r18 = WORD_8 */
				326
				327	/* Store first L1D line. */
				328	EX: { sw r0, r16; addi r0, r0, 4; add r16, r0, r2 } /* store(WORD_0) */
				329	EX: { sw r0, r13; addi r0, r0, 4; andi r16, r16, -64 } /* store(WORD_1) */
				330	EX: { sw r0, r14; addi r0, r0, 4; slt_u r16, r9, r16 } /* store(WORD_2) */
Chris Metcalf	c745a8a	2010-08-13 08:52:19 -0400	[diff] [blame]	331	#if CHIP_HAS_WH64()
Chris Metcalf	867e359	2010-05-28 23:09:12 -0400	[diff] [blame]	332	EX: { sw r0, r15; addi r0, r0, 4; addi r13, sp, -64 } /* store(WORD_3) */
				333	#else
				334	/* Back up the r9 to a cache line we are already storing to
				335	* if it gets past the end of the dest vector. Strictly speaking,
				336	* we don't need to back up to the start of a cache line, but it's free
				337	* and tidy, so why not?
				338	*/
				339	EX: { sw r0, r15; addi r0, r0, 4; andi r13, r0, -64 } /* store(WORD_3) */
				340	#endif
				341	/* Store second L1D line. */
				342	EX: { sw r0, r17; addi r0, r0, 4; mvz r9, r16, r13 }/* store(WORD_4) */
				343	EX: { sw r0, r19; addi r0, r0, 4 } /* store(WORD_5) */
				344	EX: { sw r0, r10; addi r0, r0, 4 } /* store(WORD_6) */
				345	EX: { sw r0, r12; addi r0, r0, 4 } /* store(WORD_7) */
				346
				347	EX: { lw r13, r1; addi r1, r1, 4; move zero, r18 } /* r13 = WORD_9 */
				348	EX: { lw r14, r1; addi r1, r1, 4 } /* r14 = WORD_10 */
				349	EX: { lw r15, r1; move r1, r20 } /* r15 = WORD_11 */
				350
				351	/* Store third L1D line. */
				352	EX: { sw r0, r18; addi r0, r0, 4 } /* store(WORD_8) */
				353	EX: { sw r0, r13; addi r0, r0, 4 } /* store(WORD_9) */
				354	EX: { sw r0, r14; addi r0, r0, 4 } /* store(WORD_10) */
				355	EX: { sw r0, r15; addi r0, r0, 4 } /* store(WORD_11) */
				356
				357	/* Store rest of fourth L1D line. */
				358	EX: { sw r0, r4; addi r0, r0, 4 } /* store(WORD_12) */
				359	{
				360	EX: sw r0, r8 /* store(WORD_13) */
				361	addi r0, r0, 4
				362	/* Will r2 be >= 64 after we subtract 64 below? (r4 = r2 >> 7 is nonzero iff r2 >= 128 now.) */
				363	shri r4, r2, 7
				364	}
				365	{
				366	EX: sw r0, r11 /* store(WORD_14) */
				367	addi r0, r0, 8
				368	/* Record 64 bytes successfully copied. */
				369	addi r2, r2, -64
				370	}
				371
				372	{ jrp lr; move lr, r27 }
				373
				374	/* Convey to the backtrace library that the stack frame is size
				375	* zero, and the real return address is on the stack rather than
				376	* in 'lr'.
				377	*/
				378	{ info 8 }
				379
				380	.align 64
				381	.Lcopy_unaligned_maybe_many:
				382	/* Skip the setup overhead if we aren't copying many bytes (fewer than 20). */
				383	{ slti_u r8, r2, 20; sub r4, zero, r0 }
				384	{ bnzt r8, .Lcopy_unaligned_few; andi r4, r4, 3 }
				385	{ bz r4, .Ldest_is_word_aligned; add r18, r1, r2 }
				386	/* Here r4 = (-r0) & 3 is the byte count needed to word-align the dest, and r18 = r1 + r2 is one past the end of the source. */
				387	/*
				388	*
				389	* unaligned 4 byte at a time copy handler.
				390	*
				391	*/
				392
				393	/* Copy single bytes until r0 == 0 mod 4, so we can store words. */
				394	.Lalign_dest_loop:
				395	EX: { lb_u r3, r1; addi r1, r1, 1; addi r4, r4, -1 }
				396	EX: { sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 }
				397	{ bnzt r4, .Lalign_dest_loop; andi r3, r1, 3 }
				398
				399	/* If source and dest are now both aligned, do an aligned copy (with r4 = size - 256, as that path expects). */
				400	{ bz r3, .Lcheck_aligned_copy_size; addli r4, r2, -256 }
				401
				402	.Ldest_is_word_aligned:
				403	/* Here the dest (r0) is word-aligned but the source (r1) is not. */
				404	#if CHIP_HAS_DWORD_ALIGN()
				405	EX: { andi r8, r0, 63; lwadd_na r6, r1, 4}
				406	{ slti_u r9, r2, 64; bz r8, .Ldest_is_L2_line_aligned }
				407
				408	/* This copies unaligned words until either there are fewer
				409	* than 4 bytes left to copy, or until the destination pointer
				410	* is cache-aligned, whichever comes first.
				411	*
				412	* On entry:
				413	* - r0 is the next store address.
				414	* - r1 points 4 bytes past the load address corresponding to r0.
				415	* - r2 >= 4
				416	* - r6 is the next aligned word loaded.
				417	*/
				418	.Lcopy_unaligned_src_words:
				419	EX: { lwadd_na r7, r1, 4; slti_u r8, r2, 4 + 4 }
				420	/* stall */
				421	{ dword_align r6, r7, r1; slti_u r9, r2, 64 + 4 }
				422	EX: { swadd r0, r6, 4; addi r2, r2, -4 }
				423	{ bnz r8, .Lcleanup_unaligned_words; andi r8, r0, 63 }
				424	{ bnzt r8, .Lcopy_unaligned_src_words; move r6, r7 }
				425
				426	/* On entry:
				427	* - r0 is the next store address.
				428	* - r1 points 4 bytes past the load address corresponding to r0.
				429	* - r2 >= 4 (# of bytes left to store).
				430	* - r6 is the next aligned src word value.
				431	* - r9 = (r2 < 64U).
				432	* - r18 points one byte past the end of source memory.
				433	*/
				434	.Ldest_is_L2_line_aligned:
				435
				436	{
				437	/* Not a full cache line remains. */
				438	bnz r9, .Lcleanup_unaligned_words
				439	move r7, r6
				440	}
				441
				442	/* r2 >= 64 */
				443
				444	/* Kick off two prefetches, but don't go past the end (a prefetch address not below r18 is replaced by r1). */
				445	{ addi r3, r1, 63 - 4; addi r8, r1, 64 + 63 - 4 }
				446	{ prefetch r3; move r3, r8; slt_u r8, r8, r18 }
				447	{ mvz r3, r8, r1; addi r8, r3, 64 }
				448	{ prefetch r3; move r3, r8; slt_u r8, r8, r18 }
				449	{ mvz r3, r8, r1; movei r17, 0 }
				450
				451	.Lcopy_unaligned_line:
				452	/* Prefetch another line. */
				453	{ prefetch r3; addi r15, r1, 60; addi r3, r3, 64 }
				454	/* Fire off a load of the last word we are about to copy. */
				455	EX: { lw_na r15, r15; slt_u r8, r3, r18 }
				456
				457	EX: { mvz r3, r8, r1; wh64 r0 }
				458
				459	/* This loop runs twice.
				460	*
				461	* On entry:
				462	* - r17 is even before the first iteration, and odd before
				463	* the second. It is incremented inside the loop. Encountering
				464	* an even value at the end of the loop makes it stop.
				465	*/
				466	.Lcopy_half_an_unaligned_line:
				467	EX: {
				468	/* Stall until the last byte is ready. In the steady state this
				469	* guarantees all words to load below will be in the L2 cache, which
				470	* avoids shunting the loads to the RTF.
				471	*/
				472	move zero, r15
				473	lwadd_na r7, r1, 16
				474	}
				475	EX: { lwadd_na r11, r1, 12 }
				476	EX: { lwadd_na r14, r1, -24 }
				477	EX: { lwadd_na r8, r1, 4 }
				478	EX: { lwadd_na r9, r1, 4 }
				479	EX: {
				480	lwadd_na r10, r1, 8
				481	/* r16 = (r2 < 64), after we subtract 32 from r2 below. */
				482	slti_u r16, r2, 64 + 32
				483	}
				484	EX: { lwadd_na r12, r1, 4; addi r17, r17, 1 }
				485	EX: { lwadd_na r13, r1, 8; dword_align r6, r7, r1 }
				486	EX: { swadd r0, r6, 4; dword_align r7, r8, r1 }
				487	EX: { swadd r0, r7, 4; dword_align r8, r9, r1 }
				488	EX: { swadd r0, r8, 4; dword_align r9, r10, r1 }
				489	EX: { swadd r0, r9, 4; dword_align r10, r11, r1 }
				490	EX: { swadd r0, r10, 4; dword_align r11, r12, r1 }
				491	EX: { swadd r0, r11, 4; dword_align r12, r13, r1 }
				492	EX: { swadd r0, r12, 4; dword_align r13, r14, r1 }
				493	EX: { swadd r0, r13, 4; addi r2, r2, -32 }
				494	{ move r6, r14; bbst r17, .Lcopy_half_an_unaligned_line }
				495
				496	{ bzt r16, .Lcopy_unaligned_line; move r7, r6 }
				497
				498	/* On entry:
				499	* - r0 is the next store address.
				500	* - r1 points 4 bytes past the load address corresponding to r0.
				501	* - r2 >= 0 (# of bytes left to store).
				502	* - r7 is the next aligned src word value.
				503	*/
				504	.Lcleanup_unaligned_words:
				505	/* Handle any trailing bytes: done if none, back to the word loop if at least 4 remain. */
				506	{ bz r2, .Lcopy_unaligned_done; slti_u r8, r2, 4 }
				507	{ bzt r8, .Lcopy_unaligned_src_words; move r6, r7 }
				508
				509	/* Move r1 back to the point where it corresponds to r0. */
				510	{ addi r1, r1, -4 }
				511
				512	#else /* !CHIP_HAS_DWORD_ALIGN() */
				513
				514	/* Compute right/left shift counts (r3 = 8 * (r1 & 3), r4 = -r3) and load initial source words. */
				515	{ andi r5, r1, -4; andi r3, r1, 3 }
				516	EX: { lw r6, r5; addi r5, r5, 4; shli r3, r3, 3 }
				517	EX: { lw r7, r5; addi r5, r5, 4; sub r4, zero, r3 }
				518
				519	/* Load and store one word at a time, using shifts and ORs
				520	* to correct for the misaligned src.
				521	*/
				522	.Lcopy_unaligned_src_loop:
				523	{ shr r6, r6, r3; shl r8, r7, r4 }
				524	EX: { lw r7, r5; or r8, r8, r6; move r6, r7 }
				525	EX: { sw r0, r8; addi r0, r0, 4; addi r2, r2, -4 }
				526	{ addi r5, r5, 4; slti_u r8, r2, 8 }
				527	{ bzt r8, .Lcopy_unaligned_src_loop; addi r1, r1, 4 }
				528
				529	{ bz r2, .Lcopy_unaligned_done }
				530	#endif /* !CHIP_HAS_DWORD_ALIGN() */
				531
				532	/* Fall through */
				533
				534	/*
				535	*
				536	* 1 byte at a time copy handler.
				537	*
				538	*/
				539	/* Requires r2 != 0 on entry: the count is only tested after a byte has been copied. */
				540	.Lcopy_unaligned_few:
				541	EX: { lb_u r3, r1; addi r1, r1, 1 }
				542	EX: { sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 }
				543	{ bnzt r2, .Lcopy_unaligned_few }
				544
				545	.Lcopy_unaligned_done:
				546
				547	/* For memcpy (r29 == 0) return original dest address from r23, else zero. */
				548	{ mz r0, r29, r23; jrp lr }
				549
				550	.Lend_memcpy_common:
				551	.size memcpy_common, .Lend_memcpy_common - memcpy_common
				552
				553	.section .fixup,"ax"
				554	memcpy_common_fixup:
				555	.type memcpy_common_fixup, @function
				556	/* Named as the fixup target in every __ex_table entry emitted by EX. Uses r23/r24/r25 (original dest/source/size), r27 (saved lr) and r29 (variant). */
				557	/* Skip any bytes we already successfully copied.
				558	* r2 (num remaining) is correct, but r0 (dst) and r1 (src)
				559	* may not be quite right because of unrolling and prefetching.
				560	* So we need to recompute their values as the address just
				561	* after the last byte we are sure was successfully loaded and
				562	* then stored.
				563	*/
				564
				565	/* Determine how many bytes we successfully copied. */
				566	{ sub r3, r25, r2 }
				567
				568	/* Add this to the original r0 and r1 to get their new values. */
				569	{ add r0, r23, r3; add r1, r24, r3 }
				570	/* Dispatch on the variant: r29 zero is memcpy, negative is copy_to_user, otherwise a copy_from_user flavor. */
				571	{ bzt r29, memcpy_fixup_loop }
				572	{ blzt r29, copy_to_user_fixup_loop }
				573
				574	copy_from_user_fixup_loop:
				575	/* Try copying the rest one byte at a time, expecting a load fault. */
				576	.Lcfu: { lb_u r3, r1; addi r1, r1, 1 }
				577	{ sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 }
				578	{ bnzt r2, copy_from_user_fixup_loop }
				579
				580	.Lcopy_from_user_fixup_zero_remainder:
				581	{ bbs r29, 2f } /* low bit set means IS_COPY_FROM_USER */
				582	/* byte-at-a-time loop faulted, so zero the rest. */
				583	{ move r3, r2; bz r2, 2f /* should be impossible, but handle it. */ }
				584	1: { sb r0, zero; addi r0, r0, 1; addi r3, r3, -1 }
				585	{ bnzt r3, 1b }
				586	2: move lr, r27
				587	{ move r0, r2; jrp lr } /* return the number of bytes not copied */
				588
				589	copy_to_user_fixup_loop:
				590	/* Try copying the rest one byte at a time, expecting a store fault. */
				591	{ lb_u r3, r1; addi r1, r1, 1 }
				592	.Lctu: { sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 }
				593	{ bnzt r2, copy_to_user_fixup_loop }
				594	.Lcopy_to_user_fixup_done:
				595	move lr, r27
				596	{ move r0, r2; jrp lr } /* return the number of bytes not copied */
				597
				598	memcpy_fixup_loop:
				599	/* Try copying the rest one byte at a time. We expect a disastrous
				600	* fault to happen since we are in fixup code, but let it happen.
				601	*/
				602	{ lb_u r3, r1; addi r1, r1, 1 }
				603	{ sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 }
				604	{ bnzt r2, memcpy_fixup_loop }
				605	/* This should be unreachable, we should have faulted again.
				606	* But be paranoid and handle it in case some interrupt changed
				607	* the TLB or something.
				608	*/
				609	move lr, r27
				610	{ move r0, r23; jrp lr }
				611
				612	.size memcpy_common_fixup, . - memcpy_common_fixup
				613	/* Exception entries for the fixup loops themselves: a fault at .Lcfu or .Lctu resumes at the matching exit label. */
				614	.section __ex_table,"a"
				615	.word .Lcfu, .Lcopy_from_user_fixup_zero_remainder
				616	.word .Lctu, .Lcopy_to_user_fixup_done
				613
				614	.section __ex_table,"a"
				615	.word .Lcfu, .Lcopy_from_user_fixup_zero_remainder
				616	.word .Lctu, .Lcopy_to_user_fixup_done