| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1 | /* | 
|  | 2 | * | 
|  | 3 | * Optimized version of the standard memcpy() function | 
|  | 4 | * | 
|  | 5 | * Inputs: | 
|  | 6 | * 	in0:	destination address | 
|  | 7 | *	in1:	source address | 
|  | 8 | *	in2:	number of bytes to copy | 
|  | 9 | * Output: | 
|  | 10 | * 	r8:	destination address (the original in0 is returned) | 
|  | 11 | * | 
|  | 12 | * Copyright (C) 2000-2001 Hewlett-Packard Co | 
|  | 13 | *	Stephane Eranian <eranian@hpl.hp.com> | 
|  | 14 | *	David Mosberger-Tang <davidm@hpl.hp.com> | 
|  | 15 | */ | 
|  | 16 | #include <asm/asmmacro.h> | 
|  | 17 |  | 
|  | 18 | GLOBAL_ENTRY(memcpy) | 
|  | 19 | /* memcpy: copy in2 bytes from in1 to in0; the original in0 is returned in retval (r8). */ | 
|  | 20 | #	define MEM_LAT	21		/* latency to memory */ | 
|  | 21 |  | 
|  | 22 | #	define dst	r2		/* running destination pointer */ | 
|  | 23 | #	define src	r3		/* running source pointer */ | 
|  | 24 | #	define retval	r8		/* return value: set to in0 below */ | 
|  | 25 | #	define saved_pfs r9		/* ar.pfs as it was on entry */ | 
|  | 26 | #	define saved_lc	r10		/* ar.lc as it was on entry */ | 
|  | 27 | #	define saved_pr	r11		/* predicate registers as they were on entry */ | 
|  | 28 | #	define cnt	r16		/* loop count / byte count scratch */ | 
|  | 29 | #	define src2	r17		/* 8-byte-aligned source pointer (used by .memcpy_long) */ | 
|  | 30 | #	define t0	r18		/* t0-t4: scratch */ | 
|  | 31 | #	define t1	r19 | 
|  | 32 | #	define t2	r20 | 
|  | 33 | #	define t3	r21 | 
|  | 34 | #	define t4	r22 | 
|  | 35 | #	define src_end	r23		/* aligned address of the last source word (used by .memcpy_long) */ | 
|  | 36 |  | 
|  | 37 | #	define N	(MEM_LAT + 4)	/* pipeline stages of the aligned word loop (loaded into ar.ec) */ | 
|  | 38 | #	define Nrot	((N + 7) & ~7)	/* N rounded up to a multiple of 8: size of the rotating area */ | 
|  | 39 |  | 
|  | 40 | /* | 
|  | 41 | * First, check if everything (src, dst, len) is a multiple of eight.  If | 
|  | 42 | * so, we handle everything with no taken branches (other than the loop | 
|  | 43 | * itself) and a small icache footprint.  Otherwise, we jump off to | 
|  | 44 | * the more general copy routine handling arbitrary | 
|  | 45 | * sizes/alignment etc.  Copies shorter than 16 bytes always take .memcpy_short. | 
|  | 46 | */ | 
|  | 47 | .prologue | 
|  | 48 | .save ar.pfs, saved_pfs | 
|  | 49 | alloc saved_pfs=ar.pfs,3,Nrot,0,Nrot	// 3 inputs, Nrot locals (all rotating), no outputs | 
|  | 50 | .save ar.lc, saved_lc | 
|  | 51 | mov saved_lc=ar.lc | 
|  | 52 | or t0=in0,in1		// t0 = dst OR src | 
|  | 53 | ;; | 
|  | 54 |  | 
|  | 55 | or t0=t0,in2		// t0 = dst OR src OR len | 
|  | 56 | .save pr, saved_pr | 
|  | 57 | mov saved_pr=pr | 
|  | 58 |  | 
|  | 59 | .body | 
|  | 60 |  | 
|  | 61 | cmp.eq p6,p0=in2,r0	// zero length? | 
|  | 62 | mov retval=in0		// return dst | 
|  | 63 | (p6)	br.ret.spnt.many rp	// zero length, return immediately | 
|  | 64 | ;; | 
|  | 65 |  | 
|  | 66 | mov dst=in0		// copy because of rotation | 
|  | 67 | shr.u cnt=in2,3		// number of 8-byte words to copy | 
|  | 68 | mov pr.rot=1<<16	// p16 = 1, other rotating predicates = 0: only stage 0 is enabled | 
|  | 69 | ;; | 
|  | 70 |  | 
|  | 71 | adds cnt=-1,cnt		// br.ctop is repeat/until | 
|  | 72 | cmp.gtu p7,p0=16,in2	// copying less than 16 bytes? | 
|  | 73 | mov ar.ec=N		// epilogue count = number of pipeline stages | 
|  | 74 | ;; | 
|  | 75 |  | 
|  | 76 | and t0=0x7,t0		// non-zero iff dst, src or len is not a multiple of 8 | 
|  | 77 | mov ar.lc=cnt		// loop count for the aligned word loop | 
|  | 78 | ;; | 
|  | 79 | cmp.ne p6,p0=t0,r0	// p6 = the general (unaligned) path is needed | 
|  | 80 |  | 
|  | 81 | mov src=in1		// copy because of rotation | 
|  | 82 | (p7)	br.cond.spnt.few .memcpy_short | 
|  | 83 | (p6)	br.cond.spnt.few .memcpy_long | 
|  | 84 | ;; | 
|  | 85 | nop.m	0		// NOTE(review): nop padding ahead of the loop; purpose not evident from this file | 
|  | 86 | ;; | 
|  | 87 | nop.m	0 | 
|  | 88 | nop.i	0 | 
|  | 89 | ;; | 
|  | 90 | nop.m	0 | 
|  | 91 | ;; | 
|  | 92 | .rotr val[N] | 
|  | 93 | .rotp p[N] | 
|  | 94 | .align 32 | 
|  | 95 | 1: { .mib | 
|  | 96 | (p[0])	ld8 val[0]=[src],8	// stage 0: load the next source word, advance src | 
|  | 97 | nop.i 0 | 
|  | 98 | brp.loop.imp 1b, 2f	// prediction hint for the loop branch in bundle 2 | 
|  | 99 | } | 
|  | 100 | 2: { .mfb | 
|  | 101 | (p[N-1])st8 [dst]=val[N-1],8	// last stage: store the word loaded N-1 iterations earlier | 
|  | 102 | nop.f 0 | 
|  | 103 | br.ctop.dptk.few 1b	// rotate registers/predicates and loop | 
|  | 104 | } | 
|  | 105 | ;; | 
|  | 106 | mov ar.lc=saved_lc	// restore the caller's loop counter | 
|  | 107 | mov pr=saved_pr,-1	// restore all predicate registers | 
|  | 108 | mov ar.pfs=saved_pfs	// restore ar.pfs saved by the alloc above | 
|  | 109 | br.ret.sptk.many rp | 
|  | 110 |  | 
|  | 111 | /* | 
|  | 112 | * Small (<16 bytes) copying, aligned or not, is done via a simple byte-at-a-time | 
|  | 113 | * copy loop.  This performs relatively poorly on Itanium, but it doesn't | 
|  | 114 | * get used very often (gcc inlines small copies) and due to atomicity | 
|  | 115 | * issues, we want to avoid read-modify-write of entire words. | 
|  | 116 | */ | 
|  | 117 | .align 32 | 
|  | 118 | .memcpy_short: | 
|  | 119 | adds cnt=-1,in2		// br.ctop is repeat/until | 
|  | 120 | mov ar.ec=MEM_LAT	// the byte loop has only MEM_LAT stages | 
|  | 121 | brp.loop.imp 1f, 2f	// prediction hint for the loop branch in bundle 2 below | 
|  | 122 | ;; | 
|  | 123 | mov ar.lc=cnt		// loop count = len - 1 | 
|  | 124 | ;; | 
|  | 125 | nop.m	0		// NOTE(review): nop padding ahead of the loop; purpose not evident from this file | 
|  | 126 | ;; | 
|  | 127 | nop.m	0 | 
|  | 128 | nop.i	0 | 
|  | 129 | ;; | 
|  | 130 | nop.m	0 | 
|  | 131 | ;; | 
|  | 132 | nop.m	0 | 
|  | 133 | ;; | 
|  | 134 | /* | 
|  | 135 | * It is faster to put a stop bit in the loop here because it makes | 
|  | 136 | * the pipeline shorter (and latency is what matters on short copies). | 
|  | 137 | */ | 
|  | 138 | .align 32 | 
|  | 139 | 1: { .mib | 
|  | 140 | (p[0])	ld1 val[0]=[src],1	// stage 0: load the next source byte, advance src | 
|  | 141 | nop.i 0 | 
|  | 142 | brp.loop.imp 1b, 2f | 
|  | 143 | } ;; | 
|  | 144 | 2: { .mfb | 
|  | 145 | (p[MEM_LAT-1])st1 [dst]=val[MEM_LAT-1],1	// last stage: store the byte loaded MEM_LAT-1 iterations earlier | 
|  | 146 | nop.f 0 | 
|  | 147 | br.ctop.dptk.few 1b	// rotate registers/predicates and loop | 
|  | 148 | } ;; | 
|  | 149 | mov ar.lc=saved_lc	// restore the caller's loop counter | 
|  | 150 | mov pr=saved_pr,-1	// restore all predicate registers | 
|  | 151 | mov ar.pfs=saved_pfs	// restore ar.pfs saved at function entry | 
|  | 152 | br.ret.sptk.many rp | 
|  | 153 |  | 
|  | 154 | /* | 
|  | 155 | * Large (>= 16 bytes) copying is done in a fancy way.  Latency isn't | 
|  | 156 | * an overriding concern here, but throughput is.  We first do | 
|  | 157 | * sub-word copying until the destination is aligned, then we check | 
|  | 158 | * if the source is also aligned.  If so, we do a simple load/store-loop | 
|  | 159 | * until there are less than 8 bytes left over and then we do the tail, | 
|  | 160 | * by storing the last few bytes using sub-word copying.  If the source | 
|  | 161 | * is not aligned, we branch off to the non-congruent loop. | 
|  | 162 | * | 
|  | 163 | *   stage:   op: | 
|  | 164 | *         0  ld | 
|  | 165 | *	   : | 
|  | 166 | * MEM_LAT+3  shrp | 
|  | 167 | * MEM_LAT+4  st | 
|  | 168 | * | 
|  | 169 | * On Itanium, the pipeline itself runs without stalls.  However,  br.ctop | 
|  | 170 | * seems to introduce an unavoidable bubble in the pipeline so the overall | 
|  | 171 | * latency is 2 cycles/iteration.  This gives us a _copy_ throughput | 
|  | 172 | * of 4 byte/cycle.  Still not bad. | 
|  | 173 | */ | 
|  | 174 | #	undef N | 
|  | 175 | #	undef Nrot | 
|  | 176 | #	define N	(MEM_LAT + 5)		/* number of stages */ | 
|  | 177 | #	define Nrot	((N+1 + 2 + 7) & ~7)	/* number of rotating regs */ | 
|  | 178 |  | 
|  | 179 | #define LOG_LOOP_SIZE	6	/* the dispatch below steps through .memcpy_loops in units of 1<<LOG_LOOP_SIZE bytes */ | 
|  | 180 |  | 
|  | 181 | .memcpy_long: | 
|  | 182 | alloc t3=ar.pfs,3,Nrot,0,Nrot	// resize register frame | 
|  | 183 | and t0=-8,src		// t0 = src & ~7 | 
|  | 184 | and t2=7,src		// t2 = src & 7 | 
|  | 185 | ;; | 
|  | 186 | ld8 t0=[t0]		// t0 = 1st source word | 
|  | 187 | adds src2=7,src		// src2 = (src + 7) | 
|  | 188 | sub t4=r0,dst		// t4 = -dst | 
|  | 189 | ;; | 
|  | 190 | and src2=-8,src2	// src2 = (src + 7) & ~7 | 
|  | 191 | shl t2=t2,3		// t2 = 8*(src & 7) | 
|  | 192 | shl t4=t4,3		// t4 = -dst << 3, so bits 3-5 hold (-dst & 7) = bytes needed to align dst | 
|  | 193 | ;; | 
|  | 194 | ld8 t1=[src2]		// t1 = 1st source word if src is 8-byte aligned, 2nd otherwise | 
|  | 195 | sub t3=64,t2		// t3 = 64-8*(src & 7) | 
|  | 196 | shr.u t0=t0,t2		// discard the bytes of the first word that precede src | 
|  | 197 | ;; | 
|  | 198 | add src_end=src,in2	// src_end = src + len | 
|  | 199 | shl t1=t1,t3		// line up the bytes taken from the following word | 
|  | 200 | mov pr=t4,0x38		// (p5,p4,p3)=(-dst & 7), i.e. # of bytes to store before dst is 8-byte aligned | 
|  | 201 | ;; | 
|  | 202 | or t0=t0,t1		// t0 = combination of the two shifted source words | 
|  | 203 | mov cnt=r0		// cnt = bytes stored so far | 
|  | 204 | adds src_end=-1,src_end	// src_end = address of the last source byte | 
|  | 205 | ;; | 
|  | 206 | (p3)	st1 [dst]=t0,1	// store 1 byte if bit 0 of (-dst & 7) is set | 
|  | 207 | (p3)	shr.u t0=t0,8 | 
|  | 208 | (p3)	adds cnt=1,cnt | 
|  | 209 | ;; | 
|  | 210 | (p4)	st2 [dst]=t0,2	// store 2 bytes if bit 1 of (-dst & 7) is set | 
|  | 211 | (p4)	shr.u t0=t0,16 | 
|  | 212 | (p4)	adds cnt=2,cnt | 
|  | 213 | ;; | 
|  | 214 | (p5)	st4 [dst]=t0,4	// store 4 bytes if bit 2 of (-dst & 7) is set | 
|  | 215 | (p5)	adds cnt=4,cnt | 
|  | 216 | and src_end=-8,src_end	// src_end = last word of source buffer | 
|  | 217 | ;; | 
|  | 218 |  | 
|  | 219 | // At this point, dst is aligned to 8 bytes and there are at least 16-7=9 bytes left to copy: | 
|  | 220 |  | 
|  | 221 | 1:{	add src=cnt,src			// make src point to remainder of source buffer | 
|  | 222 | sub cnt=in2,cnt			// cnt = number of bytes left to copy | 
|  | 223 | mov t4=ip			// t4 = address of this bundle (label 1) | 
|  | 224 | }	;; | 
|  | 225 | and src2=-8,src			// align source pointer | 
|  | 226 | adds t4=.memcpy_loops-1b,t4	// t4 = address of .memcpy_loops | 
|  | 227 | mov ar.ec=N			// epilogue count = number of pipeline stages | 
|  | 228 |  | 
|  | 229 | and t0=7,src			// t0 = src & 7 | 
|  | 230 | shr.u t2=cnt,3			// t2 = number of 8-byte words left to copy | 
|  | 231 | shl cnt=cnt,3			// move bits 0-2 to 3-5 | 
|  | 232 | ;; | 
|  | 233 |  | 
|  | 234 | .rotr val[N+1], w[2] | 
|  | 235 | .rotp p[N] | 
|  | 236 |  | 
|  | 237 | cmp.ne p6,p0=t0,r0		// is src aligned, too? | 
|  | 238 | shl t0=t0,LOG_LOOP_SIZE		// t0 = (src & 7) << LOG_LOOP_SIZE = 64*(src & 7), offset into .memcpy_loops | 
|  | 239 | adds t2=-1,t2			// br.ctop is repeat/until | 
|  | 240 | ;; | 
|  | 241 | add t4=t0,t4			// t4 = entry point of the loop matching this source offset | 
|  | 242 | mov pr=cnt,0x38			// set (p5,p4,p3) to # of last-word bytes to copy | 
|  | 243 | mov ar.lc=t2			// loop count = full words left - 1 | 
|  | 244 | ;; | 
|  | 245 | nop.m	0			// NOTE(review): nop padding; purpose not evident from this file | 
|  | 246 | ;; | 
|  | 247 | nop.m	0 | 
|  | 248 | nop.i	0 | 
|  | 249 | ;; | 
|  | 250 | nop.m	0 | 
|  | 251 | ;; | 
|  | 252 | (p6)	ld8 val[1]=[src2],8		// prime the pump: preload the first word when src is not 8-byte aligned | 
|  | 253 | mov b6=t4 | 
|  | 254 | br.sptk.few b6			// jump to the selected copy loop | 
|  | 255 | ;; | 
|  | 256 |  | 
|  | 257 | .memcpy_tail: | 
|  | 258 | // At this point, (p5,p4,p3) are set to the number of bytes left to copy (which is | 
|  | 259 | // less than 8) and t0 contains the last few bytes of the src buffer: | 
|  | 260 | (p5)	st4 [dst]=t0,4		// store 4 bytes if bit 2 of the remaining count is set | 
|  | 261 | (p5)	shr.u t0=t0,32 | 
|  | 262 | mov ar.lc=saved_lc		// restore the caller's loop counter | 
|  | 263 | ;; | 
|  | 264 | (p4)	st2 [dst]=t0,2		// store 2 bytes if bit 1 of the remaining count is set | 
|  | 265 | (p4)	shr.u t0=t0,16 | 
|  | 266 | mov ar.pfs=saved_pfs		// restore ar.pfs saved at function entry | 
|  | 267 | ;; | 
|  | 268 | (p3)	st1 [dst]=t0		// store the final byte if bit 0 of the remaining count is set | 
|  | 269 | mov pr=saved_pr,-1		// restore all predicate registers (after the last predicated store) | 
|  | 270 | br.ret.sptk.many rp | 
|  | 271 |  | 
|  | 272 | // Copy loops reached from .memcpy_long: COPY(shift,index) is a pipelined ld8/shrp/st8 loop followed by the last-word load and a branch to .memcpy_tail. | 
|  | 273 | .align 64 | 
|  | 274 |  | 
|  | 275 | #define COPY(shift,index)									\ | 
|  | 276 | 1: { .mib											\ | 
|  | 277 | (p[0])		ld8 val[0]=[src2],8;							\ | 
|  | 278 | (p[MEM_LAT+3])	shrp w[0]=val[MEM_LAT+3],val[MEM_LAT+4-index],shift;			\ | 
|  | 279 | brp.loop.imp 1b, 2f							\ | 
|  | 280 | };												\ | 
|  | 281 | 2: { .mfb											\ | 
|  | 282 | (p[MEM_LAT+4])	st8 [dst]=w[1],8;							\ | 
|  | 283 | nop.f 0;								\ | 
|  | 284 | br.ctop.dptk.few 1b;							\ | 
|  | 285 | };												\ | 
|  | 286 | ;;									\ | 
|  | 287 | ld8 val[N-1]=[src_end];	/* load last word (may be same as val[N]) */	\ | 
|  | 288 | ;;									\ | 
|  | 289 | shrp t0=val[N-1],val[N-index],shift;					\ | 
|  | 290 | br .memcpy_tail | 
|  | 291 | .memcpy_loops:	// entry i (i = src & 7) is reached at .memcpy_loops + (i << LOG_LOOP_SIZE) and uses shift 8*i | 
|  | 292 | COPY(0, 1) /* no point special casing this---it doesn't go any faster without shrp */ | 
|  | 293 | COPY(8, 0) | 
|  | 294 | COPY(16, 0) | 
|  | 295 | COPY(24, 0) | 
|  | 296 | COPY(32, 0) | 
|  | 297 | COPY(40, 0) | 
|  | 298 | COPY(48, 0) | 
|  | 299 | COPY(56, 0) | 
|  | 300 |  | 
|  | 301 | END(memcpy) | 