/* Copyright 2002 Andi Kleen, SuSE Labs */

#include <linux/linkage.h>
#include <asm/dwarf2.h>

/*
 * ISO C memset - set a memory block to a byte value.
 *
 * rdi   destination
 * rsi   value (char)
 * rdx   count (bytes)
 *
 * rax   original destination
 */
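/*
 * For reference, this implements the standard C prototype:
 *	void *memset(void *s, int c, size_t n);
 */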
	.section .altinstr_replacement, "ax", @progbits
.Lmemset_c:
	movq %rdi,%r9
	movl %edx,%r8d
	andl $7,%r8d
	movl %edx,%ecx
	shrl $3,%ecx
	/* expand byte value */
	movzbl %sil,%esi
	movabs $0x0101010101010101,%rax
	mulq %rsi		/* with rax, clobbers rdx */
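	/* e.g. %sil = 0xAB: 0xAB * 0x0101010101010101 = 0xABABABABABABABAB,
	   i.e. the byte value replicated into all eight bytes of %rax */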
	rep stosq
	movl %r8d,%ecx
	rep stosb
	movq %r9,%rax
	ret
.Lmemset_e:
	.previous

ENTRY(memset)
ENTRY(__memset)
	CFI_STARTPROC
	movq %rdi,%r10
	movq %rdx,%r11

	/* expand byte value */
	movzbl %sil,%ecx
	movabs $0x0101010101010101,%rax
	mul    %rcx		/* with rax, clobbers rdx */
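	/* clobbering %rdx is safe: the count was saved in %r11 above */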

	/* align dst */
	movl  %edi,%r9d
	andl  $7,%r9d
	jnz  .Lbad_alignment
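	/* .Lbad_alignment (below) sends short counts to .Lhandle_7;
	   otherwise it does one unaligned 8-byte store, rounds %rdi up to
	   the next 8-byte boundary, and rejoins at .Lafter_bad_alignment */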
	CFI_REMEMBER_STATE
.Lafter_bad_alignment:

	movl %r11d,%ecx
	shrl $6,%ecx
	jz	.Lhandle_tail
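	/* %ecx = count/64; e.g. count = 200 gives three 64-byte passes
	   below, leaving 8 bytes for the tail loops */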

	.p2align 4
.Lloop_64:
	decl   %ecx
	movq  %rax,(%rdi)
	movq  %rax,8(%rdi)
	movq  %rax,16(%rdi)
	movq  %rax,24(%rdi)
	movq  %rax,32(%rdi)
	movq  %rax,40(%rdi)
	movq  %rax,48(%rdi)
	movq  %rax,56(%rdi)
	leaq  64(%rdi),%rdi
	jnz    .Lloop_64

	/* Handle the tail in loops. The loops should be faster than
	   hard-to-predict jump tables. */
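	/* e.g. count%64 = 13: one .Lloop_8 pass stores 8 bytes, then
	   .Lloop_1 stores the remaining 5 */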
	.p2align 4
.Lhandle_tail:
	movl	%r11d,%ecx
	andl	$63&(~7),%ecx
	jz	.Lhandle_7
	shrl	$3,%ecx
	.p2align 4
.Lloop_8:
	decl   %ecx
	movq  %rax,(%rdi)
	leaq  8(%rdi),%rdi
	jnz    .Lloop_8

.Lhandle_7:
	movl	%r11d,%ecx
	andl	$7,%ecx
	jz      .Lende
	.p2align 4
.Lloop_1:
	decl    %ecx
	movb	%al,(%rdi)
	leaq	1(%rdi),%rdi
	jnz     .Lloop_1

.Lende:
	movq	%r10,%rax
	ret

	CFI_RESTORE_STATE
.Lbad_alignment:
	cmpq $7,%r11
	jbe	.Lhandle_7
	movq %rax,(%rdi)	/* unaligned store */
	movq $8,%r8
	subq %r9,%r8
	addq %r8,%rdi
	subq %r8,%r11
	jmp .Lafter_bad_alignment
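	/* e.g. dst%8 = 3: %r8 = 8-3 = 5, so the 8-byte store above fills
	   the 5-byte misaligned head; the aligned path then rewrites the
	   3 overlapping bytes */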
.Lfinal:
	CFI_ENDPROC
ENDPROC(memset)
ENDPROC(__memset)

/* Some CPUs run faster using the string instructions.
   It is also a lot simpler. Use this when possible. */

#include <asm/cpufeature.h>

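/*
 * One .altinstructions record: the address of the original code, the
 * address of the replacement, the CPU feature that selects it, and the
 * byte lengths of both sequences.  On CPUs with X86_FEATURE_REP_GOOD
 * the kernel patches memset with the .Lmemset_c variant at boot.
 */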
	.section .altinstructions,"a"
	.align 8
	.quad memset
	.quad .Lmemset_c
	.word X86_FEATURE_REP_GOOD
	.byte .Lfinal - memset
	.byte .Lmemset_e - .Lmemset_c
	.previous