/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>

#include <asm/cpufeature.h>
#include <asm/dwarf2.h>

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 *  rax original destination
 */
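
/*
 * In C terms this implements the standard
 *
 *	void *memcpy(void *dest, const void *src, size_t n);
 *
 * with dest in %rdi, src in %rsi and n in %rdx per the x86-64
 * calling convention. Note that both variants below move the count
 * through %edx/%ecx, i.e. only the low 32 bits of the count are
 * honored - presumably plenty for any copy the kernel performs.
 */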

/*
 * memcpy_c() - fast string ops (REP MOVSQ) based variant.
 *
 * Calls to this get patched into the kernel image via the
 * alternative instructions framework:
 */
	ALIGN
memcpy_c:
	CFI_STARTPROC
	movq %rdi, %rax

	movl %edx, %ecx
	shrl $3, %ecx		/* qwords to copy: count / 8 */
	andl $7, %edx		/* byte remainder: count % 8 */
	rep movsq
	movl %edx, %ecx
	rep movsb
	ret
	CFI_ENDPROC
ENDPROC(memcpy_c)
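
/*
 * A rough C equivalent of memcpy_c(), for illustration only (the
 * real work is done by the two rep-string instructions above):
 *
 *	void *memcpy_c(void *dest, const void *src, size_t n)
 *	{
 *		unsigned long *d = dest;
 *		const unsigned long *s = src;
 *		size_t i;
 *
 *		for (i = 0; i < n / 8; i++)	// rep movsq
 *			*d++ = *s++;
 *		for (i = 0; i < n % 8; i++)	// rep movsb
 *			((char *)d)[i] = ((const char *)s)[i];
 *		return dest;
 *	}
 */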

ENTRY(__memcpy)
ENTRY(memcpy)
	CFI_STARTPROC

	/*
	 * Put the number of full 64-byte blocks into %ecx.
	 * Tail portion is handled at the end:
	 */
	movq %rdi, %rax
	movl %edx, %ecx
	shrl   $6, %ecx
	jz .Lhandle_tail
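
	/*
	 * Worked example: for a 200-byte copy, 200 >> 6 = 3 full
	 * 64-byte blocks go through .Lloop_64 below, and the
	 * remaining 200 & 63 = 8 bytes are left to the tail code
	 * at .Lhandle_tail.
	 */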

	.p2align 4
.Lloop_64:
	/*
	 * We decrement the loop counter here - the zero flag it sets
	 * is only tested at the bottom of the loop (the instructions
	 * in between do not change the zero flag):
	 */
	decl %ecx

	/*
	 * Move in blocks of 4x16 bytes:
	 */
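	/*
	 * Each 16-byte group below uses its own pair of scratch
	 * registers (%r11/%r8, then %r9/%r10), presumably so that
	 * consecutive groups carry no register dependencies and the
	 * CPU can overlap the loads and stores:
	 */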
	movq 0*8(%rsi),		%r11
	movq 1*8(%rsi),		%r8
	movq %r11,		0*8(%rdi)
	movq %r8,		1*8(%rdi)

	movq 2*8(%rsi),		%r9
	movq 3*8(%rsi),		%r10
	movq %r9,		2*8(%rdi)
	movq %r10,		3*8(%rdi)

	movq 4*8(%rsi),		%r11
	movq 5*8(%rsi),		%r8
	movq %r11,		4*8(%rdi)
	movq %r8,		5*8(%rdi)

	movq 6*8(%rsi),		%r9
	movq 7*8(%rsi),		%r10
	movq %r9,		6*8(%rdi)
	movq %r10,		7*8(%rdi)

	leaq 64(%rsi), %rsi
	leaq 64(%rdi), %rdi

	jnz  .Lloop_64

.Lhandle_tail:
	movl %edx, %ecx
	andl  $63, %ecx		/* tail bytes: count % 64 */
	shrl   $3, %ecx		/* full qwords in the tail */
	jz   .Lhandle_7

	.p2align 4
.Lloop_8:
	decl %ecx
	movq (%rsi),		%r8
	movq %r8,		(%rdi)
	leaq 8(%rdi),		%rdi
	leaq 8(%rsi),		%rsi
	jnz  .Lloop_8

.Lhandle_7:
	movl %edx, %ecx
	andl $7, %ecx		/* trailing bytes: count % 8 */
	jz .Lend

	.p2align 4
.Lloop_1:
	movb (%rsi), %r8b
	movb %r8b, (%rdi)
	incq %rdi
	incq %rsi
	decl %ecx
	jnz .Lloop_1

.Lend:
	ret
	CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)
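
/*
 * The open-coded path above, as a rough C equivalent (an
 * illustrative sketch only - the unrolling and register scheduling
 * are the whole point of the assembly version):
 *
 *	void *memcpy(void *dest, const void *src, size_t n)
 *	{
 *		char *d = dest;
 *		const char *s = src;
 *		size_t i;
 *
 *		while (n >= 64) {		// .Lloop_64
 *			for (i = 0; i < 8; i++)
 *				((unsigned long *)d)[i] =
 *					((const unsigned long *)s)[i];
 *			d += 64;
 *			s += 64;
 *			n -= 64;
 *		}
 *		while (n >= 8) {		// .Lloop_8
 *			*(unsigned long *)d = *(const unsigned long *)s;
 *			d += 8;
 *			s += 8;
 *			n -= 8;
 *		}
 *		while (n) {			// .Lloop_1
 *			*d++ = *s++;
 *			n--;
 *		}
 *		return dest;
 *	}
 */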

/*
 * Some CPUs run faster using the string copy instructions,
 * and the string-op variant is also a lot simpler - use it
 * whenever the CPU advertises the feature:
 */

	.section .altinstr_replacement, "ax"
1:	.byte 0xeb				/* jmp <disp8> */
	.byte (memcpy_c - memcpy) - (2f - 1b)	/* offset */
2:
	.previous
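
/*
 * Note how the displacement is computed for the *patched* location:
 * once the jmp is copied over the start of memcpy, the CPU resolves
 * its target as memcpy + disp8 + the length of the jmp itself
 * (2f - 1b, i.e. two bytes), so disp8 must be
 * (memcpy_c - memcpy) - (2f - 1b) to land exactly on memcpy_c.
 */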

	.section .altinstructions, "a"
	.align 8
	.quad memcpy			/* instruction to patch */
	.quad 1b			/* replacement bytes */
	.byte X86_FEATURE_REP_GOOD	/* feature bit required */

	/*
	 * Replace only the beginning: memcpy itself is used while
	 * applying alternatives, so padding the rest of it with nops
	 * would mean patching code that is currently executing -
	 * a reboot would be the only outcome...
	 */
	.byte 2b - 1b			/* length of original to patch */
	.byte 2b - 1b			/* length of replacement */
	.previous
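
/*
 * For reference, the record above is laid out to match what the
 * alternatives patching code reads - the field meanings are assumed
 * here from the table itself, roughly a struct like
 * (see <asm/alternative.h>):
 *
 *	struct alt_instr {
 *		u8 *instr;		// .quad memcpy
 *		u8 *replacement;	// .quad 1b
 *		u8  cpuid;		// .byte X86_FEATURE_REP_GOOD
 *		u8  instrlen;		// .byte 2b - 1b
 *		u8  replacementlen;	// .byte 2b - 1b
 *	};
 *
 * (padding omitted).
 */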