Remove pushes from memsets (krait/cortex-a9).

In both the krait and cortex-a9 memset, the NEON path pushed r0 only
to preserve the return value. Remove the push/pop and use r3 as the
write pointer instead, leaving r0 untouched.

In addition, for cortex-a9, remove the artificial function
__memset_large_copy and make it a local label inside memset. The
separate function is no longer needed now that DWARF unwinding is
supported on ARM.

Change-Id: Icd39cfb6b8350f44368e022063cd97a6b60d46da
diff --git a/libc/arch-arm/cortex-a9/bionic/memset.S b/libc/arch-arm/cortex-a9/bionic/memset.S
index 48ba815..b39fcc4 100644
--- a/libc/arch-arm/cortex-a9/bionic/memset.S
+++ b/libc/arch-arm/cortex-a9/bionic/memset.S
@@ -69,12 +69,9 @@
 ENTRY(memset)
         // The neon memset only wins for less than 132.
         cmp         r2, #132
-        bhi         __memset_large_copy
+        bhi         .L_memset_large_copy
 
-        stmfd       sp!, {r0}
-        .cfi_def_cfa_offset 4
-        .cfi_rel_offset r0, 0
-
+        mov         r3, r0
         vdup.8      q0, r1
 
         /* make sure we have at least 32 bytes to write */
@@ -84,7 +81,7 @@
 
 1:      /* The main loop writes 32 bytes at a time */
         subs        r2, r2, #32
-        vst1.8      {d0 - d3}, [r0]!
+        vst1.8      {d0 - d3}, [r3]!
         bhs         1b
 
 2:      /* less than 32 left */
@@ -93,22 +90,20 @@
         beq         3f
 
         // writes 16 bytes, 128-bits aligned
-        vst1.8      {d0, d1}, [r0]!
+        vst1.8      {d0, d1}, [r3]!
 3:      /* write up to 15-bytes (count in r2) */
         movs        ip, r2, lsl #29
         bcc         1f
-        vst1.8      {d0}, [r0]!
+        vst1.8      {d0}, [r3]!
 1:      bge         2f
-        vst1.32     {d0[0]}, [r0]!
+        vst1.32     {d0[0]}, [r3]!
 2:      movs        ip, r2, lsl #31
-        strbmi      r1, [r0], #1
-        strbcs      r1, [r0], #1
-        strbcs      r1, [r0], #1
-        ldmfd       sp!, {r0}
+        strbmi      r1, [r3], #1
+        strbcs      r1, [r3], #1
+        strbcs      r1, [r3], #1
         bx          lr
-END(memset)
 
-ENTRY_PRIVATE(__memset_large_copy)
+.L_memset_large_copy:
         /* compute the offset to align the destination
          * offset = (4-(src&3))&3 = -src & 3
          */
@@ -180,7 +175,7 @@
         movs        r2, r2, lsl #2
         strbcs      r1, [r0]
         ldmfd       sp!, {r0, r4-r7, pc}
-END(__memset_large_copy)
+END(memset)
 
         .data
 error_string:
diff --git a/libc/arch-arm/krait/bionic/memset.S b/libc/arch-arm/krait/bionic/memset.S
index a4fbe17..ae05965 100644
--- a/libc/arch-arm/krait/bionic/memset.S
+++ b/libc/arch-arm/krait/bionic/memset.S
@@ -69,10 +69,7 @@
 
 /* memset() returns its first argument.  */
 ENTRY(memset)
-        stmfd       sp!, {r0}
-        .cfi_def_cfa_offset 4
-        .cfi_rel_offset r0, 0
-
+        mov         r3, r0
         vdup.8      q0, r1
 
         /* make sure we have at least 32 bytes to write */
@@ -82,7 +79,7 @@
 
 1:      /* The main loop writes 32 bytes at a time */
         subs        r2, r2, #32
-        vst1.8      {d0 - d3}, [r0]!
+        vst1.8      {d0 - d3}, [r3]!
         bhs         1b
 
 2:      /* less than 32 left */
@@ -91,18 +88,17 @@
         beq         3f
 
         // writes 16 bytes, 128-bits aligned
-        vst1.8      {d0, d1}, [r0]!
+        vst1.8      {d0, d1}, [r3]!
 3:      /* write up to 15-bytes (count in r2) */
         movs        ip, r2, lsl #29
         bcc         1f
-        vst1.8      {d0}, [r0]!
+        vst1.8      {d0}, [r3]!
 1:      bge         2f
-        vst1.32     {d0[0]}, [r0]!
+        vst1.32     {d0[0]}, [r3]!
 2:      movs        ip, r2, lsl #31
-        strbmi      r1, [r0], #1
-        strbcs      r1, [r0], #1
-        strbcs      r1, [r0], #1
-        ldmfd       sp!, {r0}
+        strbmi      r1, [r3], #1
+        strbcs      r1, [r3], #1
+        strbcs      r1, [r3], #1
         bx          lr
 END(memset)