Introduce a new ARM header file <machine/cpu-features.h>

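Introduce a new header file containing ARM-specific feature
test macros (e.g. __ARM_HAVE_PAIR_LOAD_STORE, which indicates
that the ldrd/strd instructions are available). Also modify a
few files in our system to use these macros so that they can
be built for ARMv4T; for example, memcpy.S now issues its
preloads through the PLD() macro instead of a raw pld instruction.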
diff --git a/libc/arch-arm/bionic/memcpy.S b/libc/arch-arm/bionic/memcpy.S
index f6e4a7d..fcb58cd 100644
--- a/libc/arch-arm/bionic/memcpy.S
+++ b/libc/arch-arm/bionic/memcpy.S
@@ -25,6 +25,9 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
+
+#include <machine/cpu-features.h>
+
 	.text
 
     .global memcpy
@@ -52,9 +55,9 @@
 
         // preload the destination because we'll align it to a cache line
         // with small writes. Also start the source "pump".
-        pld         [r0, #0]
-        pld         [r1, #0]
-        pld         [r1, #32]
+        PLD         (r0, #0)
+        PLD         (r1, #0)
+        PLD         (r1, #32)
 
 		/* it simplifies things to take care of len<4 early */
 		cmp			r2, #4
@@ -141,8 +144,8 @@
         bic         r12, r1, #0x1F
         add         r12, r12, #64
 
-1:      ldmia		r1!, { r4-r11 }
-        pld         [r12, #64]
+1:      ldmia       r1!, { r4-r11 }
+        PLD         (r12, #64)
         subs        r2, r2, #32
 
         // NOTE: if r12 is more than 64 ahead of r1, the following ldrhi
@@ -263,8 +266,8 @@
         ldr         r12, [r1], #4
 1:      mov         r4, r12
 		ldmia		r1!, {   r5,r6,r7,  r8,r9,r10,r11}
-        pld         [r1, #64]
-		subs		r2, r2, #32
+        PLD         (r1, #64)
+        subs        r2, r2, #32
         ldrhs       r12, [r1], #4
 		orr			r3, r3, r4,		lsl #16
 		mov			r4, r4,			lsr #16
@@ -290,7 +293,7 @@
         ldr         r12, [r1], #4
 1:      mov         r4, r12
 		ldmia		r1!, {   r5,r6,r7,  r8,r9,r10,r11}
-        pld         [r1, #64]
+        PLD         (r1, #64)
 		subs		r2, r2, #32
         ldrhs       r12, [r1], #4
 		orr			r3, r3, r4,		lsl #24
@@ -317,7 +320,7 @@
         ldr         r12, [r1], #4
 1:      mov         r4, r12
 		ldmia		r1!, {   r5,r6,r7,  r8,r9,r10,r11}
-        pld         [r1, #64]
+        PLD         (r1, #64)
 		subs		r2, r2, #32
         ldrhs       r12, [r1], #4
 		orr			r3, r3, r4,		lsl #8