[PATCH] x86_64: Remove optimization for B stepping AMD K8

B stepping K8s were the first shipping Opterons. memcpy/memset/copy_page/
clear_page had special optimized versions for them. These CPUs are really
old and in the minority now, and the difference from the generic versions
(using the rep string microcode) is not that big anyway. So just remove them.
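
For reference, the generic version is just the rep string sequence that
this patch keeps in memcpy.S. A minimal sketch, assuming the standard
calling convention (rdi = dest, rsi = src, rdx = count); the label name
is made up:

	.globl rep_memcpy_sketch
	.p2align 4
rep_memcpy_sketch:
	movq %rdi,%rax		/* return original destination in rax */
	movl %edx,%ecx
	shrl $3,%ecx		/* number of whole qwords */
	andl $7,%edx		/* leftover byte count */
	rep movsq		/* microcoded 8-byte copies */
	movl %edx,%ecx
	rep movsb		/* copy the tail byte by byte */
	ret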

TODO: figure out optimized versions for Intel Netburst-based EM64T
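
Note: this also removes the .altinstructions entry that patched the rep
string version over the unrolled memcpy on C stepping K8s at boot.
Annotated copy of the removed record (the field comments reflect my
reading of the alternatives layout of this era, where the replacement
must be no longer than the original):

	.section .altinstructions,"a"
	.align 8
	.quad  memcpy			/* address of the original code */
	.quad  memcpy_c			/* address of the replacement */
	.byte  X86_FEATURE_K8_C		/* CPUID feature gating the patch */
	.byte  .Lfinal-memcpy		/* length of the original */
	.byte  memcpy_c_end-memcpy_c	/* length of the replacement */
	.previous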

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
diff --git a/arch/x86_64/lib/memcpy.S b/arch/x86_64/lib/memcpy.S
index c6c4649..92dd805 100644
--- a/arch/x86_64/lib/memcpy.S
+++ b/arch/x86_64/lib/memcpy.S
@@ -11,6 +11,8 @@
  * 
  * Output:
  * rax original destination
+ * 
+ * TODO: check best memcpy for PSC
  */	
 
  	.globl __memcpy
@@ -18,95 +20,6 @@
 	.p2align 4
 __memcpy:
 memcpy:		
-	pushq %rbx
-	movq %rdi,%rax
-
-	movl %edx,%ecx
-	shrl $6,%ecx
-	jz .Lhandle_tail
-	
-	.p2align 4
-.Lloop_64:
-	decl %ecx
-	
-	movq (%rsi),%r11
-	movq 8(%rsi),%r8
-
-	movq %r11,(%rdi)
-	movq %r8,1*8(%rdi)
-
-	movq 2*8(%rsi),%r9
-	movq 3*8(%rsi),%r10
-
-	movq %r9,2*8(%rdi)
-	movq %r10,3*8(%rdi)
-		
-	movq 4*8(%rsi),%r11
-	movq 5*8(%rsi),%r8
-
-	movq %r11,4*8(%rdi)
-	movq %r8,5*8(%rdi)
-
-	movq 6*8(%rsi),%r9
-	movq 7*8(%rsi),%r10
-
-	movq %r9,6*8(%rdi)
-	movq %r10,7*8(%rdi)
-
-	leaq 64(%rsi),%rsi
-	leaq 64(%rdi),%rdi
-	jnz  .Lloop_64
-
-.Lhandle_tail:
-	movl %edx,%ecx
-	andl $63,%ecx
-	shrl $3,%ecx
-	jz   .Lhandle_7
-	.p2align 4
-.Lloop_8: 
-	decl %ecx
-	movq (%rsi),%r8
-	movq %r8,(%rdi) 
-	leaq 8(%rdi),%rdi
-	leaq 8(%rsi),%rsi
-	jnz  .Lloop_8
-
-.Lhandle_7:
-	movl %edx,%ecx
-	andl $7,%ecx
-	jz .Lende
-	.p2align 4
-.Lloop_1:
-	movb (%rsi),%r8b
-	movb %r8b,(%rdi) 
-	incq %rdi
-	incq %rsi
-	decl %ecx
-	jnz .Lloop_1
-	
-.Lende: 	
-	popq %rbx
-	ret
-.Lfinal:
-	
-	/* C stepping K8 run faster using the string copy instructions.
-	   It is also a lot simpler. Use this when possible */
-	
-	.section .altinstructions,"a"
-	.align 8
-	.quad  memcpy
-	.quad  memcpy_c
-	.byte  X86_FEATURE_K8_C
-	.byte  .Lfinal-memcpy
-	.byte  memcpy_c_end-memcpy_c	
-	.previous
-
-	.section .altinstr_replacement,"ax"
- /* rdi	destination
-  * rsi source
-  * rdx count
-  */			
-memcpy_c:
 	movq %rdi,%rax
 	movl %edx,%ecx
 	shrl $3,%ecx
@@ -117,5 +30,3 @@
 	rep
 	movsb
 	ret
-memcpy_c_end:
-	.previous