| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1 | /* | 
 | 2 |  * arch/alpha/lib/ev6-copy_page.S | 
 | 3 |  * | 
 | 4 |  * Copy an entire page. | 
 | 5 |  */ | 
 | 6 |  | 
 | 7 | /* The following comparison of this routine vs the normal copy_page.S | 
 | 8 |    was written by an unnamed ev6 hardware designer and forwarded to me | 
 | 9 |    via Steven Hobbs <hobbs@steven.zko.dec.com>. | 
 | 10 |   | 
 | 11 |    First Problem: STQ overflows. | 
 | 12 |    ----------------------------- | 
 | 13 |  | 
 | 14 | 	It would be nice if EV6 handled every resource overflow efficiently, | 
 | 15 | 	but for some, including store queue overflows, it doesn't.  Such an | 
 | 16 | 	overflow causes a trap and a restart of the pipe. | 
 | 17 |  | 
 | 18 | 	To get around this we sometimes use (to borrow a term from a VSSAD | 
 | 19 | 	researcher) "aeration".  The idea is to slow the rate at which the | 
 | 20 | 	processor receives valid instructions by inserting nops in the fetch | 
 | 21 | 	path.  In doing so, you can prevent the overflow and actually make | 
 | 22 | 	the code run faster.  You can, of course, take advantage of the fact | 
 | 23 | 	that the processor can fetch at most 4 aligned instructions per cycle. | 
 | 24 |  | 
 | 25 | 	I inserted enough nops to force it to take 10 cycles to fetch the | 
 | 26 | 	loop code.  In theory, EV6 should be able to execute this loop in | 
 | 27 | 	9 cycles but I was not able to get it to run that fast -- the initial | 
 | 28 | 	conditions were such that I could not reach this optimum rate on | 
 | 29 | 	(chaotic) EV6.  I wrote the code such that everything would issue | 
 | 30 | 	in order.  | 
 | 31 |  | 
 | 32 |    Second Problem: Dcache index matches. | 
 | 33 |    ------------------------------------- | 
 | 34 |  | 
 | 35 | 	If you are going to use this routine on random aligned pages, there | 
 | 36 | 	is a 25% chance that the pages will be at the same dcache indices. | 
 | 37 | 	Without care, this results in many nasty memory traps. | 
 | 38 |  | 
 | 39 | 	The solution is to schedule the prefetches to avoid the memory | 
 | 40 | 	conflicts.  I schedule the wh64 prefetches farther ahead of the | 
 | 41 | 	read prefetches to avoid this problem. | 
 | 42 |  | 
 | 43 |    Third Problem: Needs more prefetching. | 
 | 44 |    -------------------------------------- | 
 | 45 |  | 
 | 46 | 	In order to improve the code I added deeper prefetching to take the | 
 | 47 | 	most advantage of EV6's bandwidth. | 
 | 48 |  | 
 | 49 | 	I also prefetched the read stream. Note that adding the read prefetch | 
 | 50 | 	forced me to add another cycle to the inner-most kernel - up to 11 | 
 | 51 | 	from the original 8 cycles per iteration.  We could improve performance | 
 | 52 | 	further by unrolling the loop and doing multiple prefetches per cycle. | 
 | 53 |  | 
 | 54 |    I think that the code below will be very robust and fast code for the | 
 | 55 |    purposes of copying aligned pages.  It is slower when both source and | 
 | 56 |    destination pages are in the dcache, but it is my guess that this is | 
 | 57 |    less important than the dcache miss case.  */ | 
 | 58 |  | 
 | 59 |  | 
 | 60 | 	.text | 
 | 61 | 	.align 4 | 
 | 62 | 	.global copy_page | 
 | 63 | 	.ent copy_page | 
 | 64 | copy_page:	/* Copy 8192 bytes (128 cache lines of 64 bytes) from ($17) to ($16).  */ | 
 | 65 | 	.prologue 0 | 
 | 66 | 	/* In: $16 = destination, $17 = source.  Uses $0-$7, $18 (line counter) and $19 (write-hint pointer); $16 and $17 are left advanced by 8192.  */ | 
 | 67 | 	/* Warm-up: prefetch the first 5 source cache lines and write-hint the first 10 destination cache lines.  */ | 
 | 68 | 	wh64	($16)	/* write-hint dest line 0 */ | 
 | 69 | 	ldl	$31,0($17)	/* prefetch source line 0 */ | 
 | 70 | 	ldl	$31,64($17)	/* prefetch source line 1 */ | 
 | 71 | 	lda	$1,1*64($16)	/* $1 = address of dest line 1 */ | 
 | 72 |  | 
 | 73 | 	wh64	($1)	/* write-hint dest line 1 */ | 
 | 74 | 	ldl	$31,128($17)	/* prefetch source line 2 */ | 
 | 75 | 	ldl	$31,192($17)	/* prefetch source line 3 */ | 
 | 76 | 	lda	$1,2*64($16) | 
 | 77 |  | 
 | 78 | 	wh64	($1)	/* write-hint dest line 2 */ | 
 | 79 | 	ldl	$31,256($17)	/* prefetch source line 4 */ | 
 | 80 | 	lda	$18,118	/* main loop count: 118 lines (the last 10 go to the cleanup loop) */ | 
 | 81 | 	lda	$1,3*64($16) | 
 | 82 |  | 
 | 83 | 	wh64	($1)	/* write-hint dest line 3 */ | 
 | 84 | 	nop | 
 | 85 | 	lda	$1,4*64($16) | 
 | 86 | 	lda	$2,5*64($16) | 
 | 87 |  | 
 | 88 | 	wh64	($1)	/* write-hint dest line 4 */ | 
 | 89 | 	wh64	($2)	/* write-hint dest line 5 */ | 
 | 90 | 	lda	$1,6*64($16) | 
 | 91 | 	lda	$2,7*64($16) | 
 | 92 |  | 
 | 93 | 	wh64	($1)	/* write-hint dest line 6 */ | 
 | 94 | 	wh64	($2)	/* write-hint dest line 7 */ | 
 | 95 | 	lda	$1,8*64($16) | 
 | 96 | 	lda	$2,9*64($16) | 
 | 97 |  | 
 | 98 | 	wh64	($1)	/* write-hint dest line 8 */ | 
 | 99 | 	wh64	($2)	/* write-hint dest line 9 */ | 
 | 100 | 	lda	$19,10*64($16)	/* $19 = next dest line to write-hint (line 10) */ | 
 | 101 | 	nop | 
 | 102 |  | 
 | 103 | 	/* Main prefetching/write-hinting loop: 118 passes, each copying one 64-byte line.  */ | 
 | 104 | 1:	ldq	$0,0($17)	/* load the 8 quadwords of the current source line into $0-$7 */ | 
 | 105 | 	ldq	$1,8($17) | 
 | 106 | 	unop	/* the unops in this loop are the "aeration" padding described in the header comment */ | 
 | 107 | 	unop | 
 | 108 |  | 
 | 109 | 	unop | 
 | 110 | 	unop | 
 | 111 | 	ldq	$2,16($17) | 
 | 112 | 	ldq	$3,24($17) | 
 | 113 |  | 
 | 114 | 	ldq	$4,32($17) | 
 | 115 | 	ldq	$5,40($17) | 
 | 116 | 	unop | 
 | 117 | 	unop | 
 | 118 |  | 
 | 119 | 	unop | 
 | 120 | 	unop | 
 | 121 | 	ldq	$6,48($17) | 
 | 122 | 	ldq	$7,56($17) | 
 | 123 |  | 
 | 124 | 	ldl	$31,320($17)	/* prefetch the source line 5 lines (320 bytes) ahead */ | 
 | 125 | 	unop | 
 | 126 | 	unop | 
 | 127 | 	unop | 
 | 128 |  | 
 | 129 | 	/* An all-unop group of four: this gives the extra cycle of aeration above the minimum.  */ | 
 | 130 | 	unop			 | 
 | 131 | 	unop | 
 | 132 | 	unop | 
 | 133 | 	unop | 
 | 134 |  | 
 | 135 | 	wh64	($19)	/* write-hint the dest line 10 lines ahead */ | 
 | 136 | 	unop | 
 | 137 | 	unop | 
 | 138 | 	unop | 
 | 139 |  | 
 | 140 | 	stq	$0,0($16)	/* store the 8 quadwords to the current dest line */ | 
 | 141 | 	subq	$18,1,$18	/* one fewer line left for this loop */ | 
 | 142 | 	stq	$1,8($16) | 
 | 143 | 	unop | 
 | 144 |  | 
 | 145 | 	unop | 
 | 146 | 	stq	$2,16($16) | 
 | 147 | 	addq	$17,64,$17	/* advance source to the next line */ | 
 | 148 | 	stq	$3,24($16) | 
 | 149 |  | 
 | 150 | 	stq	$4,32($16) | 
 | 151 | 	stq	$5,40($16) | 
 | 152 | 	addq	$19,64,$19	/* advance the write-hint pointer */ | 
 | 153 | 	unop | 
 | 154 |  | 
 | 155 | 	stq	$6,48($16) | 
 | 156 | 	stq	$7,56($16) | 
 | 157 | 	addq	$16,64,$16	/* advance dest to the next line */ | 
 | 158 | 	bne	$18, 1b	/* repeat until the counter reaches zero */ | 
 | 159 |  | 
 | 160 | 	/* Prefetch the final 5 cache lines of the read stream ($17 is now at line 118, so these are lines 123-127).  */ | 
 | 161 | 	lda	$18,10	/* cleanup loop count: the remaining 10 lines */ | 
 | 162 | 	ldl	$31,320($17) | 
 | 163 | 	ldl	$31,384($17) | 
 | 164 | 	ldl	$31,448($17) | 
 | 165 |  | 
 | 166 | 	ldl	$31,512($17) | 
 | 167 | 	ldl	$31,576($17) | 
 | 168 | 	nop | 
 | 169 | 	nop | 
 | 170 |  | 
 | 171 | 	/* Non-prefetching, non-write-hinting cleanup loop for the | 
 | 172 | 	   final 10 cache lines; their hints and prefetches were all issued above.  */ | 
 | 173 | 2:	ldq	$0,0($17)	/* load one 64-byte source line into $0-$7 */ | 
 | 174 | 	ldq	$1,8($17) | 
 | 175 | 	ldq	$2,16($17) | 
 | 176 | 	ldq	$3,24($17) | 
 | 177 |  | 
 | 178 | 	ldq	$4,32($17) | 
 | 179 | 	ldq	$5,40($17) | 
 | 180 | 	ldq	$6,48($17) | 
 | 181 | 	ldq	$7,56($17) | 
 | 182 |  | 
 | 183 | 	stq	$0,0($16)	/* store it to the current dest line */ | 
 | 184 | 	subq	$18,1,$18	/* one fewer line left */ | 
 | 185 | 	stq	$1,8($16) | 
 | 186 | 	addq	$17,64,$17	/* advance source to the next line */ | 
 | 187 |  | 
 | 188 | 	stq	$2,16($16) | 
 | 189 | 	stq	$3,24($16) | 
 | 190 | 	stq	$4,32($16) | 
 | 191 | 	stq	$5,40($16) | 
 | 192 |  | 
 | 193 | 	stq	$6,48($16) | 
 | 194 | 	stq	$7,56($16) | 
 | 195 | 	addq	$16,64,$16	/* advance dest to the next line */ | 
 | 196 | 	bne	$18, 2b	/* repeat until the counter reaches zero */ | 
 | 197 |  | 
 | 198 | 	ret	/* all 128 lines copied; return to the caller */ | 
 | 199 | 	nop	/* NOTE(review): the trailing no-ops presumably pad the ret out to a full 4-instruction group -- confirm */ | 
 | 200 | 	unop | 
 | 201 | 	nop | 
 | 202 |  | 
 | 203 | 	.end copy_page |