| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1 | /* | 
 | 2 |  * arch/alpha/lib/ev6-divide.S | 
 | 3 |  * | 
 | 4 |  * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com> | 
 | 5 |  * | 
 | 6 |  * Alpha division.. | 
 | 7 |  */ | 
 | 8 |  | 
 | 9 | /* | 
 | 10 |  * The alpha chip doesn't provide hardware division, so we have to do it | 
 | 11 |  * by hand.  The compiler expects the functions | 
 | 12 |  * | 
 | 13 |  *	__divqu: 64-bit unsigned long divide | 
 | 14 |  *	__remqu: 64-bit unsigned long remainder | 
 | 15 |  *	__divq/__remq: signed 64-bit | 
 | 16 |  *	__divlu/__remlu: unsigned 32-bit | 
 | 17 |  *	__divl/__reml: signed 32-bit | 
 | 18 |  * | 
 | 19 |  * These are not normal C functions: instead of the normal | 
 | 20 |  * calling sequence, these expect their arguments in registers | 
 | 21 |  * $24 and $25, and return the result in $27. Register $28 may | 
 | 22 |  * be clobbered (assembly temporary), anything else must be saved.  | 
 | 23 |  * | 
 | 24 |  * In short: painful. | 
 | 25 |  * | 
 | 26 |  * This is a rather simple bit-at-a-time algorithm: it's very good | 
 | 27 |  * at dividing random 64-bit numbers, but the more usual case where | 
 | 28 |  * the divisor is small is handled better by the DEC algorithm | 
 | 29 |  * using lookup tables. This uses much less memory, though, and is | 
 | 30 |  * nicer on the cache.. Besides, I don't know the copyright status | 
 | 31 |  * of the DEC code. | 
 | 32 |  */ | 
 | 33 |  | 
 | 34 | /* | 
 | 35 |  * My temporaries: | 
 | 36 |  *	$0 - current bit | 
 | 37 |  *	$1 - shifted divisor | 
 | 38 |  *	$2 - modulus/quotient | 
 | 39 |  * | 
 | 40 |  *	$23 - return address | 
 | 41 |  *	$24 - dividend | 
 | 42 |  *	$25 - divisor | 
 | 43 |  * | 
 | 44 |  *	$27 - quotient/modulus | 
 | 45 |  *	$28 - compare status | 
 | 46 |  * | 
 | 47 |  * Much of the information about 21264 scheduling/coding comes from: | 
 | 48 |  *	Compiler Writer's Guide for the Alpha 21264 | 
 | 49 |  *	abbreviated as 'CWG' in other comments here | 
 | 50 |  *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html | 
 | 51 |  * Scheduling notation: | 
 | 52 |  *	E	- either cluster | 
 | 53 |  *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1 | 
 | 54 |  *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1 | 
 | 55 |  * Try not to change the actual algorithm if possible for consistency. | 
 | 56 |  */ | 
 | 57 |  | 
 | 58 | #define halt .long 0 | 
/* 'halt' expands to a zero longword; it is not referenced by the code visible in this file. */
 | 59 |  | 
 | 60 | /* | 
 | 61 |  * Select function type and registers | 
 | 62 |  */ | 
/*
 * Register aliases used by the unsigned core routine:
 *	mask    ($0)  - single bit marking the quotient bit currently being tried
 *	divisor ($1)  - working copy of the divisor (shifted left, then back right)
 *	compare ($28) - result of the cmpult/cmpule comparisons
 *	tmp1    ($3)  - trial value "modulus - divisor"
 *	tmp2    ($4)  - trial value "quotient + mask" (only used when DIV is defined)
 */
 | 63 | #define mask	$0 | 
 | 64 | #define divisor	$1 | 
 | 65 | #define compare $28 | 
 | 66 | #define tmp1	$3 | 
 | 67 | #define tmp2	$4 | 
 | 68 |  | 
/*
 * DIV defined: build the __div* entry points.
 *	- DIV_ONLY() emits its arguments, MOD_ONLY() emits nothing.
 *	- quotient is $27 (the result register), modulus is the scratch $2.
 *	- GETSIGN(x) sets x = $24 xor $25, so the sign bit of x is set when
 *	  the sign bits of the two operands differ.
 *	- STACK is 48; the code uses save slots at offsets 0..32 (40 bytes).
 *	  The extra 8 bytes presumably keep $30 16-byte aligned - TODO confirm.
 */
 | 69 | #ifdef DIV | 
 | 70 | #define DIV_ONLY(x,y...) x,##y | 
 | 71 | #define MOD_ONLY(x,y...) | 
 | 72 | #define func(x) __div##x | 
 | 73 | #define modulus $2 | 
 | 74 | #define quotient $27 | 
 | 75 | #define GETSIGN(x) xor $24,$25,x | 
 | 76 | #define STACK 48 | 
/*
 * DIV not defined: build the __rem* entry points.
 *	- DIV_ONLY() emits nothing, MOD_ONLY() emits its arguments
 *	  (MOD_ONLY is not used by the code visible in this file).
 *	- modulus is $27 (the result register); quotient is the scratch $2,
 *	  which is zeroed but never accumulated into.
 *	- GETSIGN(x) copies $24, so x carries the sign of the dividend.
 *	- STACK is 32: four save slots at offsets 0..24.
 */
 | 77 | #else | 
 | 78 | #define DIV_ONLY(x,y...) | 
 | 79 | #define MOD_ONLY(x,y...) x,##y | 
 | 80 | #define func(x) __rem##x | 
 | 81 | #define modulus $27 | 
 | 82 | #define quotient $2 | 
 | 83 | #define GETSIGN(x) bis $24,$24,x | 
 | 84 | #define STACK 32 | 
 | 85 | #endif | 
 | 86 |  | 
 | 87 | /* | 
 | 88 |  * For 32-bit operations, we need to extend to 64-bit | 
 | 89 |  */ | 
/*
 * INTSIZE defined: 32-bit variants.
 *	- ufunction/sfunction expand to func(lu)/func(l), i.e. __divlu/__divl
 *	  or __remlu/__reml depending on DIV.
 *	- LONGIFY(x) zero-extends the low 32 bits of x (zapnot with byte
 *	  mask 15 keeps bytes 0-3 and clears bytes 4-7).
 *	- SLONGIFY(x) sign-extends the low 32 bits of x (addl x,0,x).
 */
 | 90 | #ifdef INTSIZE | 
 | 91 | #define ufunction func(lu) | 
 | 92 | #define sfunction func(l) | 
 | 93 | #define LONGIFY(x) zapnot x,15,x | 
 | 94 | #define SLONGIFY(x) addl x,0,x | 
/*
 * INTSIZE not defined: 64-bit variants (func(qu)/func(q), i.e.
 * __divqu/__divq or __remqu/__remq). Both extension macros expand to
 * nothing, so no extra instruction is emitted where they are used.
 */
 | 95 | #else | 
 | 96 | #define ufunction func(qu) | 
 | 97 | #define sfunction func(q) | 
 | 98 | #define LONGIFY(x) | 
 | 99 | #define SLONGIFY(x) | 
 | 100 | #endif | 
 | 101 |  | 
 | 102 | .set noat | 
 | 103 | .align	4 | 
/*
 * ufunction: unsigned divide (DIV defined) or unsigned remainder (otherwise).
 *
 * In:   $24 = dividend, $25 = divisor, $23 = return address
 * Out:  $27 = quotient (DIV) or remainder (otherwise)
 * Uses: $28 is overwritten without being saved. $0-$3 (and $4 when DIV is
 *       defined) are saved in a STACK-byte frame on $30 and restored before
 *       returning. $24 and $25 are only read.
 *
 * A zero divisor skips both loops and goes straight to the restore code at
 * label 9: $27 then holds 0 for the divide, or the (LONGIFY'd) dividend for
 * the remainder. No trap is raised here.
 *
 * Label 7 is a second entry point: sfunction branches to it after having
 * allocated its own STACK-byte frame, when neither operand is negative.
 */
 | 104 | .globl	ufunction | 
 | 105 | .ent	ufunction | 
 | 106 | ufunction: | 
 | 107 | 	subq	$30,STACK,$30		# E : | 
 | 108 | 	.frame	$30,STACK,$23 | 
 | 109 | 	.prologue 0 | 
 | 110 |  | 
	/*
	 * Save the temporaries and set up the working values. Each register
	 * is stored to the frame before it is overwritten:
	 *	divisor  <- $25, modulus <- $24 (both LONGIFY'd),
	 *	quotient <- 0,   mask    <- 1.
	 */
 | 111 | 7:	stq	$1, 0($30)		# L : | 
 | 112 | 	bis	$25,$25,divisor		# E : | 
 | 113 | 	stq	$2, 8($30)		# L : L U L U | 
 | 114 |  | 
 | 115 | 	bis	$24,$24,modulus		# E : | 
 | 116 | 	stq	$0,16($30)		# L : | 
 | 117 | 	bis	$31,$31,quotient	# E : | 
 | 118 | 	LONGIFY(divisor)		# E : U L L U | 
 | 119 |  | 
 | 120 | 	stq	tmp1,24($30)		# L : | 
 | 121 | 	LONGIFY(modulus)		# E : | 
 | 122 | 	bis	$31,1,mask		# E : | 
 | 123 | 	DIV_ONLY(stq tmp2,32($30))	# L : L U U L | 
 | 124 |  | 
 | 125 | 	beq	divisor, 9f			/* div by zero */ | 
 | 126 | 	/* | 
 | 127 | 	 * In spite of the DIV_ONLY being either a non-instruction | 
 | 128 | 	 * or an actual stq, the addition of the .align directive | 
 | 129 | 	 * below ensures that label 1 is going to be nicely aligned | 
 | 130 | 	 */ | 
 | 131 |  | 
 | 132 | 	.align	4 | 
 | 133 | #ifdef INTSIZE | 
 | 134 | 	/* | 
 | 135 | 	 * shift divisor left, using 3-bit shifts for | 
 | 136 | 	 * 32-bit divides as we can't overflow. Three-bit | 
 | 137 | 	 * shifts will result in looping three times less | 
 | 138 | 	 * here, but can result in two loops more later. | 
 | 139 | 	 * Thus using a large shift isn't worth it (and | 
 | 140 | 	 * s8add pairs better than a sll..) | 
 | 141 | 	 */ | 
 | 142 | 1:	cmpult	divisor,modulus,compare	# E : | 
 | 143 | 	s8addq	divisor,$31,divisor	# E : | 
 | 144 | 	s8addq	mask,$31,mask		# E : | 
 | 145 | 	bne	compare,1b		# U : U L U L | 
 | 146 | #else | 
	/*
	 * 64-bit case: shift divisor and mask left one bit per iteration
	 * for as long as divisor < modulus (unsigned). The comparison is
	 * taken before the shift, so one shift is still performed on the
	 * iteration where the comparison first fails. If the top bit of
	 * divisor is already set (blt), leave for label 2 without shifting,
	 * since another doubling would overflow.
	 */
 | 147 | 1:	cmpult	divisor,modulus,compare	# E : | 
 | 148 | 	nop				# E : | 
 | 149 | 	nop				# E : | 
 | 150 | 	blt     divisor, 2f		# U : U L U L | 
 | 151 |  | 
 | 152 | 	addq	divisor,divisor,divisor	# E : | 
 | 153 | 	addq	mask,mask,mask		# E : | 
 | 154 | 	unop				# E : | 
 | 155 | 	bne	compare,1b		# U : U L U L | 
 | 156 | #endif | 
 | 157 |  | 
 | 158 | 	/* ok, start to go right again.. */ | 
 | 159 | 2: | 
 | 160 | 	/* | 
 | 161 | 	 * Keep things nicely bundled... use a nop instead of not | 
 | 162 | 	 * having an instruction for DIV_ONLY | 
 | 163 | 	 */ | 
	/*
	 * One iteration per bit position, moving right:
	 *	tmp2    = quotient + mask	(DIV only, uses mask before its shift)
	 *	compare = divisor <= modulus	(unsigned)
	 *	tmp1    = modulus - divisor
	 * then, if compare is set, commit tmp2 to quotient (DIV only) and
	 * tmp1 to modulus via cmovne. mask and divisor are each shifted right
	 * by one; the loop ends once mask has been shifted out to zero.
	 */
 | 164 | #ifdef DIV | 
 | 165 | 	DIV_ONLY(addq quotient,mask,tmp2) # E : | 
 | 166 | #else | 
 | 167 | 	nop				# E : | 
 | 168 | #endif | 
 | 169 | 	srl	mask,1,mask		# U : | 
 | 170 | 	cmpule	divisor,modulus,compare	# E : | 
 | 171 | 	subq	modulus,divisor,tmp1	# E : | 
 | 172 |  | 
 | 173 | #ifdef DIV | 
 | 174 | 	DIV_ONLY(cmovne compare,tmp2,quotient)	# E : Latency 2, extra map slot | 
 | 175 | 	nop				# E : as part of the cmovne | 
 | 176 | 	srl	divisor,1,divisor	# U : | 
 | 177 | 	nop				# E : L U L U | 
 | 178 |  | 
 | 179 | 	nop				# E : | 
 | 180 | 	cmovne	compare,tmp1,modulus	# E : Latency 2, extra map slot | 
 | 181 | 	nop				# E : as part of the cmovne | 
 | 182 | 	bne	mask,2b			# U : U L U L | 
 | 183 | #else | 
 | 184 | 	srl	divisor,1,divisor	# U : | 
 | 185 | 	cmovne	compare,tmp1,modulus	# E : Latency 2, extra map slot | 
 | 186 | 	nop				# E : as part of the cmovne | 
 | 187 | 	bne	mask,2b			# U : U L L U | 
 | 188 | #endif | 
 | 189 |  | 
	/*
	 * Common exit (also the divide-by-zero target): reload the saved
	 * registers from the frame, release the frame and return through $23.
	 * The result is already in $27.
	 */
 | 190 | 9:	ldq	$1, 0($30)		# L : | 
 | 191 | 	ldq	$2, 8($30)		# L : | 
 | 192 | 	nop				# E : | 
 | 193 | 	nop				# E : U U L L | 
 | 194 |  | 
 | 195 | 	ldq	$0,16($30)		# L : | 
 | 196 | 	ldq	tmp1,24($30)		# L : | 
 | 197 | 	nop				# E : | 
 | 198 | 	nop				# E : | 
 | 199 |  | 
 | 200 | #ifdef DIV | 
 | 201 | 	DIV_ONLY(ldq tmp2,32($30))	# L : | 
 | 202 | #else | 
 | 203 | 	nop				# E : | 
 | 204 | #endif | 
 | 205 | 	addq	$30,STACK,$30		# E : | 
 | 206 | 	ret	$31,($23),1		# L0 : L U U L | 
 | 207 | 	.end	ufunction | 
 | 208 |  | 
 | 209 | /* | 
 | 210 |  * Uhh.. Ugly signed division. I'd rather not have it at all, but | 
 | 211 |  * it's needed in some circumstances. There are different ways to | 
 | 212 |  * handle this, really. This does: | 
 | 213 |  * 	-a / b = a / -b = -(a / b) | 
 | 214 |  *	-a % b = -(a % b) | 
 | 215 |  *	a % -b = a % b | 
 | 216 |  * which is probably not the best solution, but at least should | 
 | 217 |  * have the property that (x/y)*y + (x%y) = x. | 
 | 218 |  */ | 
 | 219 | .align 4 | 
/*
 * sfunction: signed divide (DIV defined) or signed remainder (otherwise),
 * implemented on top of ufunction.
 *
 * In:   $24 = dividend, $25 = divisor, $23 = return address
 * Out:  $27 = result
 * Uses: $28 is overwritten without being saved. On the slow path $24, $25,
 *       $23 and tmp1 are saved in this routine's STACK-byte frame and
 *       reloaded before returning.
 *
 * Fast path: if the OR of the two operands (sign-extended from 32 bits
 * when INTSIZE is defined) is non-negative, neither operand has its sign
 * bit set, and control jumps to label 7 inside ufunction, which reuses the
 * frame allocated here and returns directly to the caller.
 *
 * Slow path: replace $24/$25 by their absolute values, call ufunction
 * (which allocates its own frame), then negate $27 if GETSIGN applied to
 * the original operands yields a negative value.
 */
 | 220 | .globl	sfunction | 
 | 221 | .ent	sfunction | 
 | 222 | sfunction: | 
 | 223 | 	subq	$30,STACK,$30		# E : | 
 | 224 | 	.frame	$30,STACK,$23 | 
 | 225 | 	.prologue 0 | 
 | 226 | 	bis	$24,$25,$28		# E : | 
 | 227 | 	SLONGIFY($28)			# E : | 
 | 228 | 	bge	$28,7b			# U : | 
 | 229 |  | 
	/*
	 * At least one operand is negative. Save the original operands
	 * (needed later for the sign decision), the return address (bsr
	 * below overwrites $23) and tmp1, and take absolute values in place.
	 */
 | 230 | 	stq	$24,0($30)		# L : | 
 | 231 | 	subq	$31,$24,$28		# E : | 
 | 232 | 	stq	$25,8($30)		# L : | 
 | 233 | 	nop				# E : U L U L | 
 | 234 |  | 
 | 235 | 	cmovlt	$24,$28,$24	/* abs($24) */ # E : Latency 2, extra map slot | 
 | 236 | 	nop				# E : as part of the cmov | 
 | 237 | 	stq	$23,16($30)		# L : | 
 | 238 | 	subq	$31,$25,$28		# E : U L U L | 
 | 239 |  | 
 | 240 | 	stq	tmp1,24($30)		# L : | 
 | 241 | 	cmovlt	$25,$28,$25	/* abs($25) */ # E : Latency 2, extra map slot | 
 | 242 | 	nop				# E : | 
 | 243 | 	bsr	$23,ufunction		# L0: L U L U | 
 | 244 |  | 
	/*
	 * Back from ufunction with the unsigned result in $27. Reload the
	 * original operands, compute the sign indicator into $28 with
	 * GETSIGN, and select the negated result (tmp1 = 0 - $27) when that
	 * indicator is negative. Then restore $23 and tmp1, release the
	 * frame and return.
	 */
 | 245 | 	ldq	$24,0($30)		# L : | 
 | 246 | 	ldq	$25,8($30)		# L : | 
 | 247 | 	GETSIGN($28)			# E : | 
 | 248 | 	subq	$31,$27,tmp1		# E : U U L L | 
 | 249 |  | 
 | 250 | 	SLONGIFY($28)			# E : | 
 | 251 | 	ldq	$23,16($30)		# L : | 
 | 252 | 	cmovlt	$28,tmp1,$27		# E : Latency 2, extra map slot | 
 | 253 | 	nop				# E : U L L U : as part of the cmov | 
 | 254 |  | 
 | 255 | 	ldq	tmp1,24($30)		# L : | 
 | 256 | 	nop				# E : as part of the cmov | 
 | 257 | 	addq	$30,STACK,$30		# E : | 
 | 258 | 	ret	$31,($23),1		# L0 : L U U L | 
 | 259 | 	.end	sfunction | 