/*
 * __udivdi3 -- unsigned 64-bit division, SH-5 SHmedia libgcc helper.
 *
 * Emitted by the compiler for 64-bit `a / b` on SH-5.
 * In:   r2 = dividend, r3 = divisor
 * Out:  r2 = quotient (r2 / r3)
 * Ret:  via r18 (SHmedia link register), loaded into tr0 with ptabs
 *       and taken with `blink tr0,r63`.
 * Clobbers: r0, r1, r4-r9, r19-r22, r25, tr0 -- NOTE(review): clobber
 *       list read off the visible code; confirm against the SH-5 ABI.
 *
 * Strategy: normalize the divisor, form a fixed-point reciprocal
 * estimate and refine it with the SHmedia fixed-point multiply ops
 * (mmulfx.w / msub.w / madd.w), then perform the division as a short
 * sequence of multiply-by-reciprocal / subtract steps.  Small divisors
 * take a third divide step on the fall-through path; divisors wide
 * enough that the normalized shift pushes past 32 bits branch to
 * large_divisor.  Divide-by-zero is NOT checked (see the comment
 * before the first blink).
 */
	.section	.text..SHmedia32,"ax"
	.align	2
	.global	__udivdi3
__udivdi3:
	/* Normalize the divisor: nsb gives the shift that brings its
	   topmost set bit to a known position; r6 = r3 << r22.  */
	shlri r3,1,r4
	nsb r4,r22
	shlld r3,r22,r6
	shlri r6,49,r5		/* top bits of the normalized divisor */
	movi 0xffffffffffffbaf1,r21 /* .l shift count 17.  */
	sub r21,r5,r1		/* initial linear reciprocal estimate */
	/* Refine the reciprocal estimate with fixed-point multiply
	   steps (Newton-Raphson-style); interleaved with setup for the
	   large/small divisor dispatch to fill pipeline slots.  */
	mmulfx.w r1,r1,r4
	mshflo.w r1,r63,r1
	sub r63,r22,r20 // r63 == 64 % 64
	mmulfx.w r5,r4,r4
	pta large_divisor,tr0
	addi r20,32,r9
	msub.w r1,r4,r1
	madd.w r1,r1,r1
	mmulfx.w r1,r1,r4
	shlri r6,32,r7
	bgt/u r9,r63,tr0 // large_divisor
	mmulfx.w r5,r4,r4
	shlri r2,32+14,r19
	addi r22,-31,r0
	msub.w r1,r4,r1

	/* First divide step: high part of the dividend times the
	   reciprocal gives the top quotient bits (accumulated in r8);
	   subtract quotient*divisor from the dividend (rest in r2).  */
	mulu.l r1,r7,r4
	addi r1,-3,r5
	mulu.l r5,r19,r5
	sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
	shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
	                 the case may be, %0000000000000000 000.11111111111, still */
	muls.l r1,r4,r4 /* leaving at least one sign bit.  */
	mulu.l r5,r3,r8
	mshalds.l r1,r21,r1
	shari r4,26,r4
	shlld r8,r0,r8
	add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
	sub r2,r8,r2
	/* Can do second step of 64 : 32 div now, using r1 and the rest in r2.  */

	shlri r2,22,r21
	mulu.l r21,r1,r21
	shlld r5,r0,r8
	addi r20,30-22,r0
	shlrd r21,r0,r21
	mulu.l r21,r3,r5
	add r8,r21,r8
	mcmpgt.l r21,r63,r21 // See Note 1
	addi r20,30,r0
	mshfhi.l r63,r21,r21
	sub r2,r5,r2
	andc r2,r21,r2

	/* small divisor: need a third divide step */
	mulu.l r2,r1,r7
	ptabs r18,tr0		/* prepare return target (r18 = link reg) */
	addi r2,1,r2
	shlrd r7,r0,r7
	mulu.l r7,r3,r5
	add r8,r7,r8
	sub r2,r3,r2
	cmpgt r2,r5,r5		/* final rounding adjustment of quotient */
	add r8,r5,r2
	/* could test r3 here to check for divide by zero.  */
	blink tr0,r63		/* return; quotient in r2 */

large_divisor:
	/* Divisor too wide for the fall-through path: pre-shift the
	   dividend (r25 = r2 >> r9) and finish the reciprocal
	   refinement that was started before the branch.  */
	mmulfx.w r5,r4,r4
	shlrd r2,r9,r25
	shlri r25,32,r8
	msub.w r1,r4,r1

	mulu.l r1,r7,r4
	addi r1,-3,r5
	mulu.l r5,r8,r5
	sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
	shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
	                 the case may be, %0000000000000000 000.11111111111, still */
	muls.l r1,r4,r4 /* leaving at least one sign bit.  */
	shlri r5,14-1,r8
	mulu.l r8,r7,r5
	mshalds.l r1,r21,r1
	shari r4,26,r4
	add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
	sub r25,r5,r25
	/* Can do second step of 64 : 32 div now, using r1 and the rest in r25.  */

	shlri r25,22,r21
	mulu.l r21,r1,r21
	pta no_lo_adj,tr0
	addi r22,32,r0
	shlri r21,40,r21
	mulu.l r21,r7,r5
	add r8,r21,r8
	shlld r2,r0,r2
	sub r25,r5,r25
	bgtu/u r7,r25,tr0 // no_lo_adj
	/* Remainder still >= normalized divisor: bump quotient by one
	   and reduce the rest once more.  */
	addi r8,1,r8
	sub r25,r7,r25
no_lo_adj:
	mextr4 r2,r25,r2

	/* large_divisor: only needs a few adjustments.  */
	mulu.l r8,r6,r5
	ptabs r18,tr0		/* prepare return target (r18 = link reg) */
	/* bubble */
	cmpgtu r5,r2,r5
	sub r8,r5,r2
	blink tr0,r63		/* return; quotient in r2 */

/* Note 1: To shift the result of the second divide stage so that the result
   always fits into 32 bits, yet we still reduce the rest sufficiently
   would require a lot of instructions to do the shifts just right.  Using
   the full 64 bit shift result to multiply with the divisor would require
   four extra instructions for the upper 32 bits (shift / mulu / shift / sub).
   Fortunately, if the upper 32 bits of the shift result are nonzero, we
   know that the rest after taking this partial result into account will
   fit into 32 bits.  So we just clear the upper 32 bits of the rest if the
   upper 32 bits of the partial result are nonzero.  */