Modify Android mem* routines with CodeAurora versions. Update the memcpy, memmove, and memset routines to use the versions from CodeAurora when specified in the bionic/Android.mk file (actually activated in the BoardConfig.mk file under device/<vendor>/<board>). With this change, the CodeAurora mem* routines are only used for the msm8660, while other platforms will use the current Android mem* routines. Future platforms can modify the makefile to use the CodeAurora-based mem* routines as desired. This has the benefit of making the CodeAurora-based routines opt-in instead of opt-out. Also, PLDSIZE and PLDOFFS can be specified in the BoardConfig.mk as well, so other platforms with different PLD tunings can use the same code without modifying the source file itself. Tests with FileCycler-0.3 showed a slight 1.1% improvement with these files on an 8660v2, based on the average of three FileCycler runs with and without the patch. Since the min/max values did not overlap, and the average score showed an improvement, we can consider upstreaming these modifications. Change-Id: I6946076bc6a88a2a2c8667b09494e1eb31e01ee0 Conflicts: libc/Android.mk Signed-off-by: Andrew Sutherland <dr3wsuth3rland@gmail.com>

commit: c822147d6d71b161bae3421aaf434e73b102a2f1 [log] [tgz]
author: Harshad Bhutada <hbhutada@codeaurora.org> Thu May 05 18:27:02 2011 +0530
committer: Andrew Sutherland <dr3wsuth3rland@gmail.com> Sun Apr 01 07:37:39 2012 -0500
tree: e7ea140214056fcd3d473e5be7cb103899312414
parent: 0372d0fc8ee6a23a6bd0b62a545578d513fc6d67 [diff] [blame]
diff --git a/libc/arch-arm/bionic/memcpy.S b/libc/arch-arm/bionic/memcpy.S
index 438fa00..90e788a 100644
--- a/libc/arch-arm/bionic/memcpy.S
+++ b/libc/arch-arm/bionic/memcpy.S

@@ -2,6 +2,8 @@
  * Copyright (C) 2008 The Android Open Source Project
  * All rights reserved.
  *
+ * Copyright (c) 2009-2011, Code Aurora Forum. All rights reserved.
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
@@ -30,7 +32,114 @@
 #include <machine/asm.h>
 
 #if defined(__ARM_NEON__)
-
+#if defined(SCORPION_NEON_OPTIMIZATION)
+	/*
+	 * Default prefetch tunables; both can be overridden in:
+	 *   device/<vendor>/<board>/BoardConfig.mk
+	 * by setting the following:
+	 *   TARGET_USE_SCORPION_BIONIC_OPTIMIZATION := true
+	 *   TARGET_USE_SCORPION_PLD_SET := true
+	 *   TARGET_SCORPION_BIONIC_PLDOFFS := <pldoffset>
+	 *   TARGET_SCORPION_BIONIC_PLDSIZE := <pldsize>
+	 */
+#ifndef PLDOFFS
+#define PLDOFFS	(6)	/* prefetch distance ahead of src, counted in PLDSIZE-byte units */
+#endif
+#ifndef PLDSIZE
+#define PLDSIZE	(128)	/* L2 cache line size */
+#endif
+        .code 32			/* memcpy(dst=r0, src=r1, n=r2): copy n bytes, return dst in r0 */
+        .align 5			/* 2^5 = 32-byte alignment of the entry point */
+        .globl memcpy			/* clobbers r1-r3, r12, q0-q3, q8-q11 and the condition flags */
+        .func				/* NOTE(review): no .type/.size is emitted here, while the generic path closes with END(memcpy) - confirm */
+memcpy:
+	push            {r0}		/* keep dst: it is the return value, restored at .Lneon_exit */
+	cmp             r2, #4		/* n is an unsigned size_t, so dispatch with unsigned (blo) compares */
+	blo             .Lneon_lt4	/* n < 4: only the 2-byte/1-byte tail */
+	cmp             r2, #16
+	blo             .Lneon_lt16	/* 4 <= n < 16: 8/4/2/1-byte tail */
+	cmp             r2, #32
+	blo             .Lneon_16	/* 16 <= n < 32: one 16-byte chunk, then the tail */
+	cmp              r2, #128
+	blo              .Lneon_copy_32_a	/* 32 <= n < 128: 32-byte loop */
+	/* n >= 128: copy 128-byte blocks with vld1.32/vst1.32; r12 counts blocks */
+	/* Code below is optimized for PLDSIZE=128 only */
+	mov             r12, r2, lsr #7	/* r12 = n / 128 */
+	cmp             r12, #PLDOFFS
+	ble             .Lneon_copy_128_loop_nopld	/* no more than PLDOFFS blocks: skip prefetching */
+	sub             r12, #PLDOFFS	/* the last PLDOFFS blocks go through the no-pld loop */
+	pld             [r1, #(PLDOFFS-1)*PLDSIZE]	/* one PLDSIZE before the loop's first pld target */
+.Lneon_copy_128_loop_outer:
+	pld             [r1, #(PLDOFFS*PLDSIZE)]	/* prefetch PLDOFFS*PLDSIZE bytes ahead of src */
+	vld1.32         {q0, q1}, [r1]!	/* load 4 x 32 bytes, src += 128 */
+	vld1.32         {q2, q3}, [r1]!
+	vld1.32         {q8, q9}, [r1]!
+	vld1.32         {q10, q11}, [r1]!
+	subs            r12, r12, #1	/* flags survive the vst1 stores below */
+	vst1.32	        {q0, q1}, [r0]!	/* store 4 x 32 bytes, dst += 128 */
+	vst1.32         {q2, q3}, [r0]!
+	vst1.32         {q8, q9}, [r0]!
+	vst1.32         {q10, q11}, [r0]!
+	bne             .Lneon_copy_128_loop_outer
+	mov             r12, #PLDOFFS	/* remaining full blocks, copied without prefetch */
+.Lneon_copy_128_loop_nopld:
+	vld1.32         {q0, q1}, [r1]!	/* same 128-byte copy as above, minus the pld */
+	vld1.32         {q2, q3}, [r1]!
+	vld1.32         {q8, q9}, [r1]!
+	vld1.32         {q10, q11}, [r1]!
+	subs            r12, r12, #1
+	vst1.32         {q0, q1}, [r0]!
+	vst1.32         {q2, q3}, [r0]!
+	vst1.32         {q8, q9}, [r0]!
+	vst1.32         {q10, q11}, [r0]!
+	bne             .Lneon_copy_128_loop_nopld
+	ands            r2, r2, #0x7f	/* r2 = n % 128, bytes still to copy */
+	beq             .Lneon_exit
+	cmp             r2, #32
+	blt             .Lneon_16	/* r2 <= 127 here, so the signed compare is safe */
+	nop				/* NOTE(review): purpose not evident from this file; presumably padding - confirm */
+	/* Copy blocks of 32-bytes (word aligned) at a time; reached with r2 >= 32 */
+.Lneon_copy_32_a:
+	mov             r12, r2, lsr #5	/* r12 = number of 32-byte blocks (>= 1) */
+.Lneon_copy_32_loop_a:
+	vld1.32         {q0,q1}, [r1]!
+	subs            r12, r12, #1
+	vst1.32         {q0,q1}, [r0]!
+	bne             .Lneon_copy_32_loop_a
+	ands            r2, r2, #0x1f	/* r2 = bytes left after the 32-byte blocks */
+	beq             .Lneon_exit
+.Lneon_16:
+	subs            r2, r2, #16	/* r2 < 32 here; subtracting 16 leaves bits 0-3 intact for the tail tests */
+	blt             .Lneon_lt16	/* fewer than 16 bytes were left */
+	vld1.32         {q8}, [r1]!
+	vst1.32         {q8}, [r0]!
+	beq             .Lneon_exit	/* flags still from the subs: exactly 16 bytes were left */
+.Lneon_lt16:
+	movs            r12, r2, lsl #29	/* C = bit 3 of r2 (8 bytes), N = bit 2 (4 bytes) */
+	bcc             .Lneon_skip8
+	ldr             r3, [r1], #4
+	ldr             r12, [r1], #4
+	str             r3, [r0], #4
+	str             r12, [r0], #4
+.Lneon_skip8:
+	bpl             .Lneon_lt4	/* N from the movs above; ldr/str leave the flags alone */
+	ldr             r3, [r1], #4
+	str             r3, [r0], #4
+.Lneon_lt4:
+	movs            r2, r2, lsl #31	/* C = bit 1 of r2 (2 bytes), N = bit 0 (1 byte) */
+	bcc             .Lneon_lt2
+	ldrh            r3, [r1], #2
+	strh            r3, [r0], #2
+.Lneon_lt2:
+	bpl             .Lneon_exit
+	ldrb            r12, [r1]
+	strb            r12, [r0]
+.Lneon_exit:
+	pop             {r0}		/* return the original dst */
+	bx              lr
+	.endfunc
+	.end				/* NOTE(review): gas ignores everything after .end - confirm nothing needed follows in the preprocessed output */
+#else /* !SCORPION_NEON_OPTIMIZATION */
         .text
         .fpu    neon
 
@@ -141,7 +250,7 @@
         bx          lr
 END(memcpy)
 
-
+#endif  /* !SCORPION_NEON_OPTIMIZATION */
 #else   /* __ARM_ARCH__ < 7 */
commit	c822147d6d71b161bae3421aaf434e73b102a2f1	[log] [tgz]
author	Harshad Bhutada <hbhutada@codeaurora.org>	Thu May 05 18:27:02 2011 +0530
committer	Andrew Sutherland <dr3wsuth3rland@gmail.com>	Sun Apr 01 07:37:39 2012 -0500
tree	e7ea140214056fcd3d473e5be7cb103899312414
parent	0372d0fc8ee6a23a6bd0b62a545578d513fc6d67 [diff] [blame]