| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1 | /* | 
 | 2 |  * arch/alpha/lib/ev6-copy_user.S | 
 | 3 |  * | 
 | 4 |  * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com> | 
 | 5 |  * | 
 | 6 |  * Copy to/from user space, handling exceptions as we go..  This | 
 | 7 |  * isn't exactly pretty. | 
 | 8 |  * | 
 | 9 |  * This is essentially the same as "memcpy()", but with a few twists. | 
 | 10 |  * Notably, we have to make sure that $0 is always up-to-date and | 
 | 11 |  * contains the right "bytes left to copy" value (and that it is updated | 
 | 12 |  * only _after_ a successful copy). There is also some rather minor | 
 | 13 |  * exception setup stuff.. | 
 | 14 |  * | 
 | 15 |  * NOTE! This is not directly C-callable, because the calling semantics are | 
 | 16 |  * different: | 
 | 17 |  * | 
 | 18 |  * Inputs: | 
 | 19 |  *	length in $0 | 
 | 20 |  *	destination address in $6 | 
 | 21 |  *	source address in $7 | 
 | 22 |  *	return address in $28 | 
 | 23 |  * | 
 | 24 |  * Outputs: | 
 | 25 |  *	bytes left to copy in $0 | 
 | 26 |  * | 
 | 27 |  * Clobbers: | 
 | 28 |  *	$1,$2,$3,$4,$5,$6,$7 | 
 | 29 |  * | 
 | 30 |  * Much of the information about 21264 scheduling/coding comes from: | 
 | 31 |  *	Compiler Writer's Guide for the Alpha 21264 | 
 | 32 |  *	abbreviated as 'CWG' in other comments here | 
 | 33 |  *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html | 
 | 34 |  * Scheduling notation: | 
 | 35 |  *	E	- either cluster | 
 | 36 |  *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1 | 
 | 37 |  *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1 | 
 | 38 |  */ | 
 | 39 |  | 
 | 40 | /* Allow an exception for an insn; exit if we get one.  EXI emits insn `x` at local label 99 and adds an __ex_table entry (offset of the insn, plus an lda whose displacement is $exitin-99b); it wraps the loads below.  */ | 
 | 41 | #define EXI(x,y...)			\ | 
 | 42 | 	99: x,##y;			\ | 
 | 43 | 	.section __ex_table,"a";	\ | 
 | 44 | 	.long 99b - .;			\ | 
 | 45 | 	lda $31, $exitin-99b($31);	\ | 
 | 46 | 	.previous | 
 | 47 |  | 
 | 48 | #define EXO(x,y...)	/* same as EXI, but the __ex_table entry points at $exitout; wraps the stores below */		\ | 
 | 49 | 	99: x,##y;			\ | 
 | 50 | 	.section __ex_table,"a";	\ | 
 | 51 | 	.long 99b - .;			\ | 
 | 52 | 	lda $31, $exitout-99b($31);	\ | 
 | 53 | 	.previous | 
 | 54 |  | 
 | 55 | 	.set noat | 
 | 56 | 	.align 4 | 
 | 57 | 	.globl __copy_user | 
 | 58 | 	.ent __copy_user | 
 | 59 | 				# Pipeline info: Slotting & Comments | 
 | 60 | __copy_user: | 
 | 61 | 	.prologue 0 | 
 | 62 | 	subq $0, 32, $1		# .. E  .. ..	: Is this going to be a small copy? ($1 = len - 32) | 
 | 63 | 	beq $0, $zerolength	# U  .. .. ..	: U L U L : nothing to copy | 
 | 64 |  | 
 | 65 | 	and $6,7,$3		# .. .. .. E	: $3 = leading dest misalignment (dest & 7) | 
 | 66 | 	ble $1, $onebyteloop	# .. .. U  ..	: 1st branch : small amount of data (len <= 32) | 
 | 67 | 	beq $3, $destaligned	# .. U  .. ..	: 2nd (one cycle fetcher stall) | 
 | 68 | 	subq $3, 8, $3		# E  .. .. ..	: L U U L : trip counter, negative, counts up to 0 | 
 | 69 | /* | 
 | 70 |  * The fetcher stall also hides the 1 cycle cross-cluster stall for $3 (L --> U) | 
 | 71 |  * This loop aligns the destination a byte at a time | 
 | 72 |  * We know we have at least one trip through this loop | 
 | 73 |  */ | 
 | 74 | $aligndest: | 
 | 75 | 	EXI( ldbu $1,0($7) )	# .. .. .. L	: Keep loads separate from stores | 
 | 76 | 	addq $6,1,$6		# .. .. E  ..	: Section 3.8 in the CWG | 
 | 77 | 	addq $3,1,$3		# .. E  .. ..	: | 
 | 78 | 	nop			# E  .. .. ..	: U L U L | 
 | 79 |  | 
 | 80 | /* | 
 | 81 |  * the -1 is to compensate for the inc($6) done in a previous quadpack | 
 | 82 |  * which allows us zero dependencies within either quadpack in the loop | 
 | 83 |  */ | 
 | 84 | 	EXO( stb $1,-1($6) )	# .. .. .. L	: | 
 | 85 | 	addq $7,1,$7		# .. .. E  ..	: Section 3.8 in the CWG | 
 | 86 | 	subq $0,1,$0		# .. E  .. ..	: change count _after_ copy | 
 | 87 | 	bne $3, $aligndest	# U  .. .. ..	: U L U L | 
 | 88 |  | 
 | 89 | /* | 
 | 90 |  * If we fell through into here, we have a minimum of 33 - 7 bytes | 
 | 91 |  * If we arrived via branch, we have a minimum of 32 bytes | 
 | 92 |  */ | 
 | 93 | $destaligned: | 
 | 94 | 	and $7,7,$1		# .. .. .. E	: Check _current_ source alignment | 
 | 95 | 	bic $0,7,$4		# .. .. E  ..	: $4 = number of bytes to move as whole quadwords ($0 & ~7) | 
 | 96 | 	EXI( ldq_u $3,0($7) )	# .. L  .. ..	: Forward fetch for fallthrough code | 
 | 97 | 	beq $1,$quadaligned	# U  .. .. ..	: U L U L | 
 | 98 |  | 
 | 99 | /* | 
 | 100 |  * In the worst case, we've just executed an ldq_u here from 0($7) | 
 | 101 |  * and we'll repeat it once if we take the branch | 
 | 102 |  */ | 
 | 103 |  | 
 | 104 | /* Misaligned quadword loop - not unrolled.  Leave it that way. */ | 
 | 105 | $misquad: | 
 | 106 | 	EXI( ldq_u $2,8($7) )	# .. .. .. L	: $2 = next source quad; $3 already holds the current one | 
 | 107 | 	subq $4,8,$4		# .. .. E  ..	: one quadword fewer to go | 
 | 108 | 	extql $3,$7,$3		# .. U  .. ..	: low bytes of the result, from the current quad | 
 | 109 | 	extqh $2,$7,$1		# U  .. .. ..	: U U L L : high bytes of the result, from the next quad | 
 | 110 |  | 
 | 111 | 	bis $3,$1,$1		# .. .. .. E	: merge the two halves | 
 | 112 | 	EXO( stq $1,0($6) )	# .. .. L  ..	: | 
 | 113 | 	addq $7,8,$7		# .. E  .. ..	: | 
 | 114 | 	subq $0,8,$0		# E  .. .. ..	: U L L U : change count _after_ copy | 
 | 115 |  | 
 | 116 | 	addq $6,8,$6		# .. .. .. E	: | 
 | 117 | 	bis $2,$2,$3		# .. .. E  ..	: $3 = $2, the next quad becomes the current one | 
 | 118 | 	nop			# .. E  .. ..	: | 
 | 119 | 	bne $4,$misquad		# U  .. .. ..	: U L U L | 
 | 120 |  | 
 | 121 | 	nop			# .. .. .. E | 
 | 122 | 	nop			# .. .. E  .. | 
 | 123 | 	nop			# .. E  .. .. | 
 | 124 | 	beq $0,$zerolength	# U  .. .. ..	: U L U L : done if no tail bytes remain | 
 | 125 |  | 
 | 126 | /* We know we have at least one trip through the byte loop */ | 
 | 127 | 	EXI ( ldbu $2,0($7) )	# .. .. .. L	: No loads in the same quad | 
 | 128 | 	addq $6,1,$6		# .. .. E  ..	: as the store (Section 3.8 in CWG) | 
 | 129 | 	nop			# .. E  .. ..	: | 
 | 130 | 	br $31, $dirtyentry	# L0 .. .. ..	: L U U L | 
 | 131 | /* Do the trailing byte loop load, then hop into the store part of the loop */ | 
 | 132 |  | 
 | 133 | /* | 
 | 134 |  * A minimum of (33 - 7) bytes to do a quad at a time. | 
 | 135 |  * Based upon the usage context, it's worth the effort to unroll this loop | 
 | 136 |  * $0 - number of bytes to be moved | 
 | 137 |  * $4 - number of bytes to move as quadwords | 
 | 138 |  * $6 is current destination address | 
 | 139 |  * $7 is current source address | 
 | 140 |  */ | 
 | 141 | $quadaligned: | 
 | 142 | 	subq	$4, 32, $2	# .. .. .. E	: do not unroll for small stuff | 
 | 143 | 	nop			# .. .. E  .. | 
 | 144 | 	nop			# .. E  .. .. | 
 | 145 | 	blt	$2, $onequad	# U  .. .. ..	: U L U L | 
 | 146 |  | 
 | 147 | /* | 
 | 148 |  * There is a significant assumption here that the source and destination | 
 | 149 |  * addresses differ by more than 32 bytes.  In this particular case, a | 
 | 150 |  * sparsity of registers further bounds this to be a minimum of 8 bytes. | 
 | 151 |  * But if this isn't met, then the output result will be incorrect. | 
 | 152 |  * Furthermore, due to a lack of available registers, we really can't | 
 | 153 |  * unroll this to be an 8x loop (which would enable us to use the wh64 | 
 | 154 |  * memory hint instruction). | 
 | 155 |  */ | 
 | 156 | $unroll4: | 
 | 157 | 	EXI( ldq $1,0($7) )	# .. .. .. L | 
 | 158 | 	EXI( ldq $2,8($7) )	# .. .. L  .. | 
 | 159 | 	subq	$4,32,$4	# .. E  .. .. | 
 | 160 | 	nop			# E  .. .. ..	: U U L L | 
 | 161 |  | 
 | 162 | 	addq	$7,16,$7	# .. .. .. E | 
 | 163 | 	EXO( stq $1,0($6) )	# .. .. L  .. | 
 | 164 | 	EXO( stq $2,8($6) )	# .. L  .. .. | 
 | 165 | 	subq	$0,16,$0	# E  .. .. ..	: U L L U : change count _after_ both stores | 
 | 166 |  | 
 | 167 | 	addq	$6,16,$6	# .. .. .. E | 
 | 168 | 	EXI( ldq $1,0($7) )	# .. .. L  .. | 
 | 169 | 	EXI( ldq $2,8($7) )	# .. L  .. .. | 
 | 170 | 	subq	$4, 32, $3	# E  .. .. ..	: U U L L : is there enough for another trip? | 
 | 171 |  | 
 | 172 | 	EXO( stq $1,0($6) )	# .. .. .. L | 
 | 173 | 	EXO( stq $2,8($6) )	# .. .. L  .. | 
 | 174 | 	subq	$0,16,$0	# .. E  .. ..	: change count _after_ both stores | 
 | 175 | 	addq	$7,16,$7	# E  .. .. ..	: U L L U | 
 | 176 |  | 
 | 177 | 	nop			# .. .. .. E | 
 | 178 | 	nop			# .. .. E  .. | 
 | 179 | 	addq	$6,16,$6	# .. E  .. .. | 
 | 180 | 	bgt	$3,$unroll4	# U  .. .. ..	: U L U L | 
 | 181 |  | 
 | 182 | 	nop			/* nops presumably pad the group of four so the branch comes last, as in the annotated groups above */ | 
 | 183 | 	nop | 
 | 184 | 	nop | 
 | 185 | 	beq	$4, $noquads	/* quadword byte count exhausted: skip the one-quad loop */ | 
 | 186 |  | 
 | 187 | $onequad:			/* aligned copy, one quadword per trip, until $4 reaches 0 */ | 
 | 188 | 	EXI( ldq $1,0($7) ) | 
 | 189 | 	subq	$4,8,$4		/* one quadword fewer to go */ | 
 | 190 | 	addq	$7,8,$7 | 
 | 191 | 	nop | 
 | 192 |  | 
 | 193 | 	EXO( stq $1,0($6) ) | 
 | 194 | 	subq	$0,8,$0		/* change count _after_ copy */ | 
 | 195 | 	addq	$6,8,$6 | 
 | 196 | 	bne	$4,$onequad | 
 | 197 |  | 
 | 198 | $noquads: | 
 | 199 | 	nop | 
 | 200 | 	nop | 
 | 201 | 	nop | 
 | 202 | 	beq $0,$zerolength	/* done unless tail bytes (fewer than 8) remain */ | 
 | 203 |  | 
 | 204 | /* | 
 | 205 |  * For small copies (or the tail of a larger copy), do a very simple byte loop. | 
 | 206 |  * There's no point in doing a lot of complex alignment calculations to try to | 
 | 207 |  * do quadword stuff for a small amount of data. | 
 | 208 |  *	$0 - remaining number of bytes left to copy | 
 | 209 |  *	$6 - current dest addr | 
 | 210 |  *	$7 - current source addr | 
 | 211 |  */ | 
 | 212 |  | 
 | 213 | $onebyteloop: | 
 | 214 | 	EXI ( ldbu $2,0($7) )	# .. .. .. L	: No loads in the same quad | 
 | 215 | 	addq $6,1,$6		# .. .. E  ..	: as the store (Section 3.8 in CWG) | 
 | 216 | 	nop			# .. E  .. ..	: | 
 | 217 | 	nop			# E  .. .. ..	: U L U L | 
 | 218 |  | 
 | 219 | $dirtyentry: | 
 | 220 | /* | 
 | 221 |  * the -1 is to compensate for the inc($6) done in a previous quadpack | 
 | 222 |  * which allows us zero dependencies within either quadpack in the loop | 
 | 223 |  */ | 
 | 224 | 	EXO ( stb $2,-1($6) )	# .. .. .. L	: | 
 | 225 | 	addq $7,1,$7		# .. .. E  ..	: store is kept out of the load's quadpack | 
 | 226 | 	subq $0,1,$0		# .. E  .. ..	: change count _after_ copy | 
 | 227 | 	bgt $0,$onebyteloop	# U  .. .. ..	: U L U L | 
 | 228 |  | 
 | 229 | $zerolength: | 
 | 230 | $exitout:			# Exception recovery target named by EXO(); just return, $0 = bytes left to copy | 
 | 231 | 	nop			# .. .. .. E | 
 | 232 | 	nop			# .. .. E  .. | 
 | 233 | 	nop			# .. E  .. .. | 
 | 234 | 	ret $31,($28),1		# L0 .. .. ..	: L U L U | 
 | 235 |  | 
 | 236 | $exitin:			/* exception recovery target named by EXI() */ | 
 | 237 |  | 
 | 238 | 	/* A stupid byte-by-byte zeroing of the rest of the output | 
 | 239 | 	   buffer.  This cures security holes by never leaving  | 
 | 240 | 	   random kernel data around to be copied elsewhere.  */ | 
 | 241 |  | 
 | 242 | 	nop | 
 | 243 | 	nop | 
 | 244 | 	nop | 
 | 245 | 	mov	$0,$1		/* $1 = number of bytes to zero; $0 is left intact as the return value */ | 
 | 246 |  | 
 | 247 | $101: | 
 | 248 | 	EXO ( stb $31,0($6) )	# L	: store a zero byte ($31 reads as zero) | 
 | 249 | 	subq $1,1,$1		# E | 
 | 250 | 	addq $6,1,$6		# E | 
 | 251 | 	bgt $1,$101		# U | 
 | 252 |  | 
 | 253 | 	nop | 
 | 254 | 	nop | 
 | 255 | 	nop | 
 | 256 | 	ret $31,($28),1		# L0 | 
 | 257 |  | 
 | 258 | 	.end __copy_user | 
 | 259 |  |