Merge tag 'android-7.1.1_r28' into HEAD

Android 7.1.1 release 28
diff --git a/libc/Android.bp b/libc/Android.bp
index 4cae16c..c312fc1 100644
--- a/libc/Android.bp
+++ b/libc/Android.bp
@@ -687,6 +687,13 @@
                     "upstream-openbsd/lib/libc/string/strcat.c",
                 ],
             },
+            scorpion: {
+                exclude_srcs: [
+                    "upstream-openbsd/lib/libc/string/memmove.c",
+                    "upstream-openbsd/lib/libc/string/stpcpy.c",
+                    "upstream-openbsd/lib/libc/string/strcat.c",
+                ],
+            },
         },
         arm64: {
             exclude_srcs: [
@@ -1062,6 +1069,33 @@
                     "bionic/__strcpy_chk.cpp",
                 ],
             },
+            scorpion: {
+                srcs: [
+                    // Use krait versions of memset and strcmp; memmove comes from denver below.
+                    "arch-arm/krait/bionic/memset.S",
+                    "arch-arm/krait/bionic/strcmp.S",
+
+                    // Use cortex-a15 versions of memcpy, stpcpy, strcat/strcpy/strlen, and their _chk variants.
+                    "arch-arm/cortex-a15/bionic/memcpy.S",
+                    "arch-arm/cortex-a15/bionic/stpcpy.S",
+                    "arch-arm/cortex-a15/bionic/strcat.S",
+                    "arch-arm/cortex-a15/bionic/__strcat_chk.S",
+                    "arch-arm/cortex-a15/bionic/strcpy.S",
+                    "arch-arm/cortex-a15/bionic/__strcpy_chk.S",
+                    "arch-arm/cortex-a15/bionic/strlen.S",
+
+                    "arch-arm/denver/bionic/memmove.S",
+                ],
+                exclude_srcs: [
+                    "arch-arm/generic/bionic/memcpy.S",
+                    "arch-arm/generic/bionic/memset.S",
+                    "arch-arm/generic/bionic/strcmp.S",
+                    "arch-arm/generic/bionic/strcpy.S",
+                    "arch-arm/generic/bionic/strlen.c",
+                    "bionic/__strcat_chk.cpp",
+                    "bionic/__strcpy_chk.cpp",
+                ],
+            },
 
         },
         arm64: {
diff --git a/libc/Android.mk b/libc/Android.mk
index 1ca84c1..9568b4b 100644
--- a/libc/Android.mk
+++ b/libc/Android.mk
@@ -621,6 +621,10 @@
   use_clang := false
 endif
 
+ifeq ($(TARGET_NEEDS_GCC_LIBC),true)
+  use_clang := false
+endif
+
 ifeq ($(use_clang),)
   use_clang := true
 endif
@@ -639,6 +643,10 @@
 libc_malloc_src := bionic/jemalloc_wrapper.cpp
 libc_common_c_includes += external/jemalloc/include
 
+ifeq ($(BOARD_USES_LEGACY_MMAP),true)
+  libc_common_cflags += -DLEGACY_MMAP
+endif
+
 # Define some common conlyflags
 libc_common_conlyflags := \
     -std=gnu99
@@ -1449,6 +1457,9 @@
 LOCAL_SANITIZE := never
 LOCAL_NATIVE_COVERAGE := $(bionic_coverage)
 
+# Allow devices to provide additional symbols
+LOCAL_WHOLE_STATIC_LIBRARIES += $(BOARD_PROVIDES_ADDITIONAL_BIONIC_STATIC_LIBS)
+
 include $(BUILD_SHARED_LIBRARY)
 
 # ========================================================
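All three hooks above are consumed from a device's BoardConfig.mk. A hypothetical fragment exercising them (the static library name is illustrative, not part of this patch):

    # BoardConfig.mk (illustrative)
    TARGET_NEEDS_GCC_LIBC := true     # build libc with GCC instead of clang
    BOARD_USES_LEGACY_MMAP := true    # compile libc with -DLEGACY_MMAP (see the mmap.cpp hunk below)
    BOARD_PROVIDES_ADDITIONAL_BIONIC_STATIC_LIBS := libbionic_device_extras  # linked whole into libc.so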
diff --git a/libc/arch-arm/arm.mk b/libc/arch-arm/arm.mk
index 76f465e..3a12558 100644
--- a/libc/arch-arm/arm.mk
+++ b/libc/arch-arm/arm.mk
@@ -1,6 +1,7 @@
 # 32-bit arm.
 
 libc_bionic_src_files_arm += \
+    arch-arm/generic/bionic/memchr.S \
     arch-arm/generic/bionic/memcmp.S \
     arch-arm/generic/bionic/memcpy.S \
     arch-arm/generic/bionic/memset.S \
@@ -13,7 +14,8 @@
     bionic/__memset_chk.cpp \
 
 libc_openbsd_src_files_exclude_arm += \
-    upstream-openbsd/lib/libc/string/strcpy.c \
+    upstream-openbsd/lib/libc/string/memchr.c \
+    upstream-openbsd/lib/libc/string/strcpy.c
 
 #
 # Inherently architecture-specific code.
@@ -41,7 +43,7 @@
 ifneq ($(TARGET_$(my_2nd_arch_prefix)CPU_VARIANT),generic)
 cpu_variant_mk := $(LOCAL_PATH)/arch-arm/$(TARGET_$(my_2nd_arch_prefix)CPU_VARIANT)/$(TARGET_$(my_2nd_arch_prefix)CPU_VARIANT).mk
 ifeq ($(wildcard $(cpu_variant_mk)),)
-$(error "TARGET_$(my_2nd_arch_prefix)CPU_VARIANT not set or set to an unknown value. Possible values are cortex-a7, cortex-a8, cortex-a9, cortex-a15, krait, denver. Use generic for devices that do not have a CPU similar to any of the supported cpu variants.")
+$(error "TARGET_$(my_2nd_arch_prefix)CPU_VARIANT not set or set to an unknown value. Possible values are cortex-a7, cortex-a8, cortex-a9, cortex-a15, krait, scorpion, denver. Use generic for devices that do not have a CPU similar to any of the supported cpu variants.")
 endif
 include $(cpu_variant_mk)
 libc_common_additional_dependencies += $(cpu_variant_mk)
diff --git a/libc/arch-arm/generic/bionic/memchr.S b/libc/arch-arm/generic/bionic/memchr.S
new file mode 100644
index 0000000..cb00d82
--- /dev/null
+++ b/libc/arch-arm/generic/bionic/memchr.S
@@ -0,0 +1,155 @@
+/* Copyright (c) 2010-2015, Linaro Limited
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+      * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+      * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+      * Neither the name of Linaro Limited nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+   Written by Dave Gilbert <david.gilbert@linaro.org>
+
+   This memchr routine is optimised on a Cortex-A9 and should work on
+   all ARMv7 processors.   It has a fast path for short sizes, and has
+   an optimised path for large data sets; the worst case is finding the
+   match early in a large data set.
+
+ */
+
+#include <private/bionic_asm.h>
+
+@ 2011-02-07 david.gilbert@linaro.org
+@    Extracted from local git a5b438d861
+@ 2011-07-14 david.gilbert@linaro.org
+@    Import endianness fix from local git ea786f1b
+@ 2011-12-07 david.gilbert@linaro.org
+@    Removed unneeded cbz from align loop
+
+	.syntax unified
+	.arch armv7-a
+
+@ this lets us check a flag in a 00/ff byte easily in either endianness
+#ifdef __ARMEB__
+#define CHARTSTMASK(c) 1<<(31-(c*8))
+#else
+#define CHARTSTMASK(c) 1<<(c*8)
+#endif
+	.text
+	.thumb
+
+@ ---------------------------------------------------------------------------
+	.thumb_func
+ENTRY(memchr)
+	.p2align 4,,15
+	@ r0 = start of memory to scan
+	@ r1 = character to look for
+	@ r2 = length
+	@ returns r0 = pointer to character or NULL if not found
+	and	r1,r1,#0xff	@ Don't think we can trust the caller to actually pass a char
+
+	cmp	r2,#16		@ If it's short don't bother with anything clever
+	blt	20f
+
+	tst	r0, #7		@ If it's already aligned skip the next bit
+	beq	10f
+
+	@ Work up to an aligned point
+5:
+	ldrb	r3, [r0],#1
+	subs	r2, r2, #1
+	cmp	r3, r1
+	beq	50f		@ If it matches exit found
+	tst	r0, #7
+	bne	5b		@ If not aligned yet then do next byte
+
+10:
+	@ At this point, we are aligned, we know we have at least 8 bytes to work with
+	push	{r4,r5,r6,r7}
+	orr	r1, r1, r1, lsl #8	@ expand the match word across to all bytes
+	orr	r1, r1, r1, lsl #16
+	bic	r4, r2, #7	@ Number of double words to work with
+	mvns	r7, #0		@ all F's
+	movs	r3, #0
+
+15:
+	ldrd    r5,r6,[r0],#8
+	subs	r4, r4, #8
+	eor	r5,r5, r1	@ Get it so that r5,r6 have 00's where the bytes match the target
+	eor	r6,r6, r1
+	uadd8	r5, r5, r7	@ Parallel add 0xff - sets the GE bits for anything that wasn't 0
+	sel	r5, r3, r7	@ bytes are 00 for non-00 bytes, or ff for 00 bytes - NOTE INVERSION
+	uadd8	r6, r6, r7	@ Parallel add 0xff - sets the GE bits for anything that wasn't 0
+	sel	r6, r5, r7	@ chained....bytes are 00 for non-00 bytes, or ff for 00 bytes - NOTE INVERSION
+	cbnz	r6, 60f
+	bne	15b		@ (Flags from the subs above) If not run out of bytes then go around again
+
+	pop	{r4,r5,r6,r7}
+	and	r1,r1,#0xff	@ Get r1 back to a single character from the expansion above
+	and	r2,r2,#7	@ Leave the count remaining as the number after the double words have been done
+
+20:
+	cbz	r2, 40f		@ 0 length or hit the end already then not found
+
+21:  @ Post aligned section, or just a short call
+	ldrb	r3,[r0],#1
+	subs	r2,r2,#1
+	eor	r3,r3,r1	@ r3 = 0 if match - doesn't break flags from sub
+	cbz	r3, 50f
+	bne	21b		@ on r2 flags
+
+40:
+	movs	r0,#0		@ not found
+	bx	lr
+
+50:
+	subs	r0,r0,#1	@ found
+	bx	lr
+
+60:  @ We're here because the fast path found a hit - now we have to track down exactly which word it was
+	@ r0 points to the start of the double word after the one that was tested
+	@ r5 has the 00/ff pattern for the first word, r6 has the chained value
+	cmp	r5, #0
+	itte	eq
+	moveq	r5, r6		@ the end is in the 2nd word
+	subeq	r0,r0,#3	@ Points to 2nd byte of 2nd word
+	subne	r0,r0,#7	@ or 2nd byte of 1st word
+
+	@ r0 currently points to the 3rd byte of the word containing the hit
+	tst	r5, # CHARTSTMASK(0)	@ 1st character
+	bne	61f
+	adds	r0,r0,#1
+	tst	r5, # CHARTSTMASK(1)	@ 2nd character
+	ittt	eq
+	addeq	r0,r0,#1
+	tsteq	r5, # (3<<15)		@ 2nd & 3rd character
+	@ If not the 3rd must be the last one
+	addeq	r0,r0,#1
+
+61:
+	pop	{r4,r5,r6,r7}
+	subs	r0,r0,#1
+	bx	lr
+END(memchr)
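The fast path above is a SWAR (SIMD-within-a-register) scan: the search byte is broadcast across a word, XORed in so that matching bytes become 0x00, and uadd8/sel convert that into a per-byte 00/ff mask via the GE flags. A rough C sketch of the same detection using the portable zero-byte test instead of uadd8/sel (a sketch of the idea, not the bionic implementation):

    #include <stdint.h>

    /* Non-zero iff the 32-bit word w contains the byte c. As a yes/no
       test this form is exact; the assembly instead uses uadd8/sel,
       which also yields an exact per-byte 00/ff mask for locating the
       hit within the word. */
    static int word_contains_byte(uint32_t w, uint8_t c) {
        uint32_t x = w ^ (0x01010101u * c);  /* matching bytes -> 0x00 */
        return ((x - 0x01010101u) & ~x & 0x80808080u) != 0;
    }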
diff --git a/libc/arch-arm/scorpion/scorpion.mk b/libc/arch-arm/scorpion/scorpion.mk
new file mode 100644
index 0000000..f3c390e
--- /dev/null
+++ b/libc/arch-arm/scorpion/scorpion.mk
@@ -0,0 +1,29 @@
+libc_openbsd_src_files_exclude_arm += \
+    upstream-openbsd/lib/libc/string/memmove.c \
+    upstream-openbsd/lib/libc/string/stpcpy.c \
+    upstream-openbsd/lib/libc/string/strcat.c \
+
+libc_bionic_src_files_exclude_arm += \
+    arch-arm/generic/bionic/memcpy.S \
+    arch-arm/generic/bionic/memset.S \
+    arch-arm/generic/bionic/strcmp.S \
+    arch-arm/generic/bionic/strcpy.S \
+    arch-arm/generic/bionic/strlen.c \
+    bionic/__strcat_chk.cpp \
+    bionic/__strcpy_chk.cpp \
+
+libc_bionic_src_files_arm += \
+    arch-arm/krait/bionic/memset.S \
+    arch-arm/krait/bionic/strcmp.S \
+
+libc_bionic_src_files_arm += \
+    arch-arm/cortex-a15/bionic/memcpy.S \
+    arch-arm/cortex-a15/bionic/stpcpy.S \
+    arch-arm/cortex-a15/bionic/strcat.S \
+    arch-arm/cortex-a15/bionic/__strcat_chk.S \
+    arch-arm/cortex-a15/bionic/strcpy.S \
+    arch-arm/cortex-a15/bionic/__strcpy_chk.S \
+    arch-arm/cortex-a15/bionic/strlen.S \
+
+libc_bionic_src_files_arm += \
+    arch-arm/denver/bionic/memmove.S
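Nothing selects this file automatically: a device opts in via BoardConfig.mk, and the arm.mk logic shown above resolves the variant name to arch-arm/scorpion/scorpion.mk. An illustrative fragment:

    # BoardConfig.mk (illustrative)
    TARGET_ARCH := arm
    TARGET_CPU_VARIANT := scorpion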
diff --git a/libc/arch-arm64/arm64.mk b/libc/arch-arm64/arm64.mk
index 9a76072..2da806c 100644
--- a/libc/arch-arm64/arm64.mk
+++ b/libc/arch-arm64/arm64.mk
@@ -3,7 +3,6 @@
 #
 # Generic arm64 optimizations, may be overridden by CPU variants.
 #
-
 libc_bionic_src_files_arm64 += \
     arch-arm64/generic/bionic/memchr.S \
     arch-arm64/generic/bionic/memcmp.S \
@@ -17,12 +16,14 @@
     arch-arm64/generic/bionic/strlen.S \
     arch-arm64/generic/bionic/strncmp.S \
     arch-arm64/generic/bionic/strnlen.S \
+    arch-arm64/generic/bionic/strrchr.S \
     arch-arm64/generic/bionic/wmemmove.S \
 
 libc_bionic_src_files_exclude_arm64 += \
     bionic/__memcpy_chk.cpp \
     bionic/strchr.cpp \
     bionic/strnlen.c \
+    bionic/strrchr.cpp \
 
 libc_freebsd_src_files_exclude_arm64 += \
     upstream-freebsd/lib/libc/string/wmemmove.c \
@@ -34,6 +35,7 @@
     upstream-openbsd/lib/libc/string/strcpy.c \
     upstream-openbsd/lib/libc/string/strncmp.c \
 
+
 #
 # Inherently architecture-specific code.
 #
diff --git a/libc/arch-arm64/generic/bionic/memcpy_base.S b/libc/arch-arm64/generic/bionic/memcpy_base.S
index c5d42ce..f850624 100644
--- a/libc/arch-arm64/generic/bionic/memcpy_base.S
+++ b/libc/arch-arm64/generic/bionic/memcpy_base.S
@@ -1,4 +1,4 @@
-/* Copyright (c) 2012, Linaro Limited
+/* Copyright (c) 2012-2013, Linaro Limited
    All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
@@ -22,158 +22,196 @@
    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
+/*
+ * Copyright (c) 2015 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
 
 /* Assumptions:
  *
- * ARMv8-a, AArch64
- * Unaligned accesses
+ * ARMv8-a, AArch64, unaligned accesses.
  *
  */
 
+#include <private/bionic_asm.h>
+
 #define dstin	x0
 #define src	x1
 #define count	x2
-#define tmp1	x3
-#define tmp1w	w3
-#define tmp2	x4
-#define tmp2w	w4
-#define tmp3	x5
-#define tmp3w	w5
-#define dst	x6
+#define dst	x3
+#define srcend	x4
+#define dstend	x5
+#define A_l	x6
+#define A_lw	w6
+#define A_h	x7
+#define A_hw	w7
+#define B_l	x8
+#define B_lw	w8
+#define B_h	x9
+#define C_l	x10
+#define C_h	x11
+#define D_l	x12
+#define D_h	x13
+#define E_l	src
+#define E_h	count
+#define F_l	srcend
+#define F_h	dst
+#define tmp1	x9
 
-#define A_l	x7
-#define A_h	x8
-#define B_l	x9
-#define B_h	x10
-#define C_l	x11
-#define C_h	x12
-#define D_l	x13
-#define D_h	x14
+#define L(l) .L ## l
 
-	mov	dst, dstin
-	cmp	count, #64
-	b.ge	.Lcpy_not_short
-	cmp	count, #15
-	b.le	.Ltail15tiny
+/* Copies are split into 3 main cases: small copies of up to 16 bytes,
+   medium copies of 17..96 bytes, which are fully unrolled, and large
+   copies of more than 96 bytes, which align the destination and use an unrolled loop
+   processing 64 bytes per iteration.
+   Small and medium copies read all data before writing, allowing any
+   kind of overlap, and memmove tailcalls memcpy for these cases as
+   well as non-overlapping copies.
+*/
 
-	/* Deal with small copies quickly by dropping straight into the
-	 * exit block.  */
-.Ltail63:
-	/* Copy up to 48 bytes of data.  At this point we only need the
-	 * bottom 6 bits of count to be accurate.  */
-	ands	tmp1, count, #0x30
-	b.eq	.Ltail15
-	add	dst, dst, tmp1
-	add	src, src, tmp1
-	cmp	tmp1w, #0x20
-	b.eq	1f
-	b.lt	2f
-	ldp	A_l, A_h, [src, #-48]
-	stp	A_l, A_h, [dst, #-48]
-1:
-	ldp	A_l, A_h, [src, #-32]
-	stp	A_l, A_h, [dst, #-32]
-2:
-	ldp	A_l, A_h, [src, #-16]
-	stp	A_l, A_h, [dst, #-16]
+	prfm    PLDL1KEEP, [src]
+	add	srcend, src, count
+	add	dstend, dstin, count
+	cmp	count, 16
+	b.ls	L(copy16)
+	cmp	count, 96
+	b.hi	L(copy_long)
 
-.Ltail15:
-	ands	count, count, #15
-	beq	1f
-	add	src, src, count
-	ldp	A_l, A_h, [src, #-16]
-	add	dst, dst, count
-	stp	A_l, A_h, [dst, #-16]
-1:
-	ret
-
-.Ltail15tiny:
-	/* Copy up to 15 bytes of data.  Does not assume additional data
-	   being copied.  */
-	tbz	count, #3, 1f
-	ldr	tmp1, [src], #8
-	str	tmp1, [dst], #8
-1:
-	tbz	count, #2, 1f
-	ldr	tmp1w, [src], #4
-	str	tmp1w, [dst], #4
-1:
-	tbz	count, #1, 1f
-	ldrh	tmp1w, [src], #2
-	strh	tmp1w, [dst], #2
-1:
-	tbz	count, #0, 1f
-	ldrb	tmp1w, [src]
-	strb	tmp1w, [dst]
-1:
-	ret
-
-.Lcpy_not_short:
-	/* We don't much care about the alignment of DST, but we want SRC
-	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
-	 * boundaries on both loads and stores.  */
-	neg	tmp2, src
-	ands	tmp2, tmp2, #15		/* Bytes to reach alignment.  */
-	b.eq	2f
-	sub	count, count, tmp2
-	/* Copy more data than needed; it's faster than jumping
-	 * around copying sub-Quadword quantities.  We know that
-	 * it can't overrun.  */
+	/* Medium copies: 17..96 bytes.  */
+	sub	tmp1, count, 1
 	ldp	A_l, A_h, [src]
-	add	src, src, tmp2
-	stp	A_l, A_h, [dst]
-	add	dst, dst, tmp2
-	/* There may be less than 63 bytes to go now.  */
-	cmp	count, #63
-	b.le	.Ltail63
-2:
-	subs	count, count, #128
-	b.ge	.Lcpy_body_large
-	/* Less than 128 bytes to copy, so handle 64 here and then jump
-	 * to the tail.  */
-	ldp	A_l, A_h, [src]
-	ldp	B_l, B_h, [src, #16]
-	ldp	C_l, C_h, [src, #32]
-	ldp	D_l, D_h, [src, #48]
-	stp	A_l, A_h, [dst]
-	stp	B_l, B_h, [dst, #16]
-	stp	C_l, C_h, [dst, #32]
-	stp	D_l, D_h, [dst, #48]
-	tst	count, #0x3f
-	add	src, src, #64
-	add	dst, dst, #64
-	b.ne	.Ltail63
+	tbnz	tmp1, 6, L(copy96)
+	ldp	D_l, D_h, [srcend, -16]
+	tbz	tmp1, 5, 1f
+	ldp	B_l, B_h, [src, 16]
+	ldp	C_l, C_h, [srcend, -32]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstend, -32]
+1:
+	stp	A_l, A_h, [dstin]
+	stp	D_l, D_h, [dstend, -16]
 	ret
 
-	/* Critical loop.  Start at a new cache line boundary.  Assuming
-	 * 64 bytes per line this ensures the entire loop is in one line.  */
-	.p2align 6
-.Lcpy_body_large:
-	/* There are at least 128 bytes to copy.  */
-	ldp	A_l, A_h, [src, #0]
-	sub	dst, dst, #16		/* Pre-bias.  */
-	ldp	B_l, B_h, [src, #16]
-	ldp	C_l, C_h, [src, #32]
-	ldp	D_l, D_h, [src, #48]!	/* src += 64 - Pre-bias.  */
+	.p2align 4
+
+	/* Small copies: 0..16 bytes.  */
+L(copy16):
+	cmp	count, 8
+	b.lo	1f
+	ldr	A_l, [src]
+	ldr	A_h, [srcend, -8]
+	str	A_l, [dstin]
+	str	A_h, [dstend, -8]
+	ret
+	.p2align 4
 1:
-	stp	A_l, A_h, [dst, #16]
-	ldp	A_l, A_h, [src, #16]
-	stp	B_l, B_h, [dst, #32]
-	ldp	B_l, B_h, [src, #32]
-	stp	C_l, C_h, [dst, #48]
-	ldp	C_l, C_h, [src, #48]
-	stp	D_l, D_h, [dst, #64]!
-	ldp	D_l, D_h, [src, #64]!
-	subs	count, count, #64
-	b.ge	1b
-	stp	A_l, A_h, [dst, #16]
-	stp	B_l, B_h, [dst, #32]
-	stp	C_l, C_h, [dst, #48]
-	stp	D_l, D_h, [dst, #64]
-	add	src, src, #16
-	add	dst, dst, #64 + 16
-	tst	count, #0x3f
-	b.ne	.Ltail63
+	tbz	count, 2, 1f
+	ldr	A_lw, [src]
+	ldr	A_hw, [srcend, -4]
+	str	A_lw, [dstin]
+	str	A_hw, [dstend, -4]
+	ret
+
+	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
+	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
+1:
+	cbz	count, 2f
+	lsr	tmp1, count, 1
+	ldrb	A_lw, [src]
+	ldrb	A_hw, [srcend, -1]
+	ldrb	B_lw, [src, tmp1]
+	strb	A_lw, [dstin]
+	strb	B_lw, [dstin, tmp1]
+	strb	A_hw, [dstend, -1]
+2:	ret
+
+	.p2align 4
+	/* Copy 64..96 bytes.  Copy 64 bytes from the start and
+	   32 bytes from the end.  */
+L(copy96):
+	ldp	B_l, B_h, [src, 16]
+	ldp	C_l, C_h, [src, 32]
+	ldp	D_l, D_h, [src, 48]
+	ldp	E_l, E_h, [srcend, -32]
+	ldp	F_l, F_h, [srcend, -16]
+	stp	A_l, A_h, [dstin]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstin, 32]
+	stp	D_l, D_h, [dstin, 48]
+	stp	E_l, E_h, [dstend, -32]
+	stp	F_l, F_h, [dstend, -16]
+	ret
+
+	/* Align DST to 16 byte alignment so that we don't cross cache line
+	   boundaries on both loads and stores.	 There are at least 96 bytes
+	   to copy, so copy 16 bytes unaligned and then align.	The loop
+	   copies 64 bytes per iteration and prefetches one iteration ahead.  */
+
+	.p2align 4
+L(copy_long):
+	and	tmp1, dstin, 15
+	bic	dst, dstin, 15
+	ldp	D_l, D_h, [src]
+	sub	src, src, tmp1
+	add	count, count, tmp1	/* Count is now 16 too large.  */
+	ldp	A_l, A_h, [src, 16]
+	stp	D_l, D_h, [dstin]
+	ldp	B_l, B_h, [src, 32]
+	ldp	C_l, C_h, [src, 48]
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 128 + 16	/* Test and readjust count.  */
+	b.ls	2f
+1:
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [src, 16]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [src, 32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [src, 48]
+	stp	D_l, D_h, [dst, 64]!
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 64
+	b.hi	1b
+
+	/* Write the last full set of 64 bytes.	 The remainder is at most 64
+	   bytes, so it is safe to always copy 64 bytes from the end even if
+	   there is just 1 byte left.  */
+2:
+	ldp	E_l, E_h, [srcend, -64]
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [srcend, -48]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [srcend, -32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [srcend, -16]
+	stp	D_l, D_h, [dst, 64]
+	stp	E_l, E_h, [dstend, -64]
+	stp	A_l, A_h, [dstend, -48]
+	stp	B_l, B_h, [dstend, -32]
+	stp	C_l, C_h, [dstend, -16]
 	ret
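The medium-copy strategy above (load from both ends before storing anything) is what makes these paths overlap-safe and lets memmove tail-call memcpy. A minimal C sketch of the 17..32-byte case, assuming unaligned 64-bit accesses are acceptable on the target (they are on ARMv8):

    #include <stdint.h>
    #include <string.h>

    /* Copy n bytes (17 <= n <= 32) by reading 16 bytes from each end
       before writing either half; the two ranges may overlap in the
       middle, but since every load precedes every store the result is
       correct even for overlapping src/dst. */
    static void copy17to32(char* dst, const char* src, size_t n) {
        uint64_t a, b, c, d;
        memcpy(&a, src, 8);           /* first 16 bytes */
        memcpy(&b, src + 8, 8);
        memcpy(&c, src + n - 16, 8);  /* last 16 bytes */
        memcpy(&d, src + n - 8, 8);
        memcpy(dst, &a, 8);
        memcpy(dst + 8, &b, 8);
        memcpy(dst + n - 16, &c, 8);
        memcpy(dst + n - 8, &d, 8);
    }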
diff --git a/libc/arch-arm64/generic/bionic/memset.S b/libc/arch-arm64/generic/bionic/memset.S
index 7c204b4..4b3b17b 100644
--- a/libc/arch-arm64/generic/bionic/memset.S
+++ b/libc/arch-arm64/generic/bionic/memset.S
@@ -1,4 +1,4 @@
-/* Copyright (c) 2012, Linaro Limited
+/* Copyright (c) 2012-2013, Linaro Limited
    All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
@@ -22,226 +22,207 @@
    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
+/*
+ * Copyright (c) 2015 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
 
 /* Assumptions:
  *
- * ARMv8-a, AArch64
- * Unaligned accesses
+ * ARMv8-a, AArch64, unaligned accesses
  *
  */
 
 #include <private/bionic_asm.h>
 
-/* By default we assume that the DC instruction can be used to zero
-   data blocks more efficiently.  In some circumstances this might be
-   unsafe, for example in an asymmetric multiprocessor environment with
-   different DC clear lengths (neither the upper nor lower lengths are
-   safe to use).
+#define dstin	x0
+#define val	x1
+#define valw	w1
+#define count	x2
+#define dst	x3
+#define dstend	x4
+#define tmp1	x5
+#define tmp1w	w5
+#define tmp2	x6
+#define tmp2w	w6
+#define zva_len x7
+#define zva_lenw w7
 
-   If code may be run in a virtualized environment, then define
-   MAYBE_VIRT.  This will cause the code to cache the system register
-   values rather than re-reading them each call.  */
+#define L(l) .L ## l
 
-#define dstin		x0
-#ifdef BZERO
-#define count		x1
-#else
-#define count		x2
-#endif
-#define val		w1
-#define tmp1		x3
-#define tmp1w		w3
-#define tmp2		x4
-#define tmp2w		w4
-#define zva_len_x	x5
-#define zva_len		w5
-#define zva_bits_x	x6
-
-#define A_l		x7
-#define A_lw		w7
-#define dst		x8
-#define tmp3w		w9
-
-#ifdef BZERO
-ENTRY(bzero)
-#else
 ENTRY(memset)
-#endif
 
-	mov	dst, dstin		/* Preserve return value.  */
-#ifdef BZERO
-	b	.Lzero_mem
-#endif
-	ands	A_lw, val, #255
-	b.eq	.Lzero_mem
-	orr	A_lw, A_lw, A_lw, lsl #8
-	orr	A_lw, A_lw, A_lw, lsl #16
-	orr	A_l, A_l, A_l, lsl #32
-.Ltail_maybe_long:
-	cmp	count, #64
-	b.ge	.Lnot_short
-.Ltail_maybe_tiny:
-	cmp	count, #15
-	b.le	.Ltail15tiny
-.Ltail63:
-	ands	tmp1, count, #0x30
-	b.eq	.Ltail15
-	add	dst, dst, tmp1
-	cmp	tmp1w, #0x20
-	b.eq	1f
-	b.lt	2f
-	stp	A_l, A_l, [dst, #-48]
-1:
-	stp	A_l, A_l, [dst, #-32]
-2:
-	stp	A_l, A_l, [dst, #-16]
+	dup	v0.16B, valw
+	add	dstend, dstin, count
 
-.Ltail15:
-	and	count, count, #15
-	add	dst, dst, count
-	stp	A_l, A_l, [dst, #-16]	/* Repeat some/all of last store. */
+	cmp	count, 96
+	b.hi	L(set_long)
+	cmp	count, 16
+	b.hs	L(set_medium)
+	mov	val, v0.D[0]
+
+	/* Set 0..15 bytes.  */
+	tbz	count, 3, 1f
+	str	val, [dstin]
+	str	val, [dstend, -8]
+	ret
+	nop
+1:	tbz	count, 2, 2f
+	str	valw, [dstin]
+	str	valw, [dstend, -4]
+	ret
+2:	cbz	count, 3f
+	strb	valw, [dstin]
+	tbz	count, 1, 3f
+	strh	valw, [dstend, -2]
+3:	ret
+
+	/* Set 16..96 bytes.  */
+L(set_medium):
+	str	q0, [dstin]
+	tbnz	count, 6, L(set96)
+	str	q0, [dstend, -16]
+	tbz	count, 5, 1f
+	str	q0, [dstin, 16]
+	str	q0, [dstend, -32]
+1:	ret
+
+	.p2align 4
+	/* Set 64..96 bytes.  Write 64 bytes from the start and
+	   32 bytes from the end.  */
+L(set96):
+	str	q0, [dstin, 16]
+	stp	q0, q0, [dstin, 32]
+	stp	q0, q0, [dstend, -32]
 	ret
 
-.Ltail15tiny:
-	/* Set up to 15 bytes.  Does not assume earlier memory
-	   being set.  */
-	tbz	count, #3, 1f
-	str	A_l, [dst], #8
-1:
-	tbz	count, #2, 1f
-	str	A_lw, [dst], #4
-1:
-	tbz	count, #1, 1f
-	strh	A_lw, [dst], #2
-1:
-	tbz	count, #0, 1f
-	strb	A_lw, [dst]
-1:
+	.p2align 3
+	nop
+L(set_long):
+	and	valw, valw, 255
+	bic	dst, dstin, 15
+	str	q0, [dstin]
+	cmp	count, 256
+	ccmp	valw, 0, 0, cs
+	b.eq	L(try_zva)
+L(no_zva):
+	sub	count, dstend, dst	/* Count is 16 too large.  */
+	add	dst, dst, 16
+	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
+1:	stp	q0, q0, [dst], 64
+	stp	q0, q0, [dst, -32]
+L(tail64):
+	subs	count, count, 64
+	b.hi	1b
+2:	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
 	ret
 
-	/* Critical loop.  Start at a new cache line boundary.  Assuming
-	 * 64 bytes per line, this ensures the entire loop is in one line.  */
-	.p2align 6
-.Lnot_short:
-	neg	tmp2, dst
-	ands	tmp2, tmp2, #15
-	b.eq	2f
-	/* Bring DST to 128-bit (16-byte) alignment.  We know that there's
-	 * more than that to set, so we simply store 16 bytes and advance by
-	 * the amount required to reach alignment.  */
-	sub	count, count, tmp2
-	stp	A_l, A_l, [dst]
-	add	dst, dst, tmp2
-	/* There may be less than 63 bytes to go now.  */
-	cmp	count, #63
-	b.le	.Ltail63
-2:
-	sub	dst, dst, #16		/* Pre-bias.  */
-	sub	count, count, #64
-1:
-	stp	A_l, A_l, [dst, #16]
-	stp	A_l, A_l, [dst, #32]
-	stp	A_l, A_l, [dst, #48]
-	stp	A_l, A_l, [dst, #64]!
-	subs	count, count, #64
-	b.ge	1b
-	tst	count, #0x3f
-	add	dst, dst, #16
-	b.ne	.Ltail63
-	ret
-
-	/* For zeroing memory, check to see if we can use the ZVA feature to
-	 * zero entire 'cache' lines.  */
-.Lzero_mem:
-	mov	A_l, #0
-	cmp	count, #63
-	b.le	.Ltail_maybe_tiny
-	neg	tmp2, dst
-	ands	tmp2, tmp2, #15
-	b.eq	1f
-	sub	count, count, tmp2
-	stp	A_l, A_l, [dst]
-	add	dst, dst, tmp2
-	cmp	count, #63
-	b.le	.Ltail63
-1:
-	/* For zeroing small amounts of memory, it's not worth setting up
-	 * the line-clear code.  */
-	cmp	count, #128
-	b.lt	.Lnot_short
-#ifdef MAYBE_VIRT
-	/* For efficiency when virtualized, we cache the ZVA capability.  */
-	adrp	tmp2, .Lcache_clear
-	ldr	zva_len, [tmp2, #:lo12:.Lcache_clear]
-	tbnz	zva_len, #31, .Lnot_short
-	cbnz	zva_len, .Lzero_by_line
+	.p2align 3
+L(try_zva):
 	mrs	tmp1, dczid_el0
-	tbz	tmp1, #4, 1f
-	/* ZVA not available.  Remember this for next time.  */
-	mov	zva_len, #~0
-	str	zva_len, [tmp2, #:lo12:.Lcache_clear]
-	b	.Lnot_short
-1:
-	mov	tmp3w, #4
-	and	zva_len, tmp1w, #15	/* Safety: other bits reserved.  */
-	lsl	zva_len, tmp3w, zva_len
-	str	zva_len, [tmp2, #:lo12:.Lcache_clear]
-#else
-	mrs	tmp1, dczid_el0
-	tbnz	tmp1, #4, .Lnot_short
-	mov	tmp3w, #4
-	and	zva_len, tmp1w, #15	/* Safety: other bits reserved.  */
-	lsl	zva_len, tmp3w, zva_len
-#endif
+	tbnz	tmp1w, 4, L(no_zva)
+	and	tmp1w, tmp1w, 15
+	cmp	tmp1w, 4	/* ZVA size is 64 bytes.  */
+	b.ne	L(zva_128)
 
-.Lzero_by_line:
-	/* Compute how far we need to go to become suitably aligned.  We're
-	 * already at quad-word alignment.  */
-	cmp	count, zva_len_x
-	b.lt	.Lnot_short		/* Not enough to reach alignment.  */
-	sub	zva_bits_x, zva_len_x, #1
-	neg	tmp2, dst
-	ands	tmp2, tmp2, zva_bits_x
-	b.eq	1f			/* Already aligned.  */
-	/* Not aligned, check that there's enough to copy after alignment.  */
-	sub	tmp1, count, tmp2
-	cmp	tmp1, #64
-	ccmp	tmp1, zva_len_x, #8, ge	/* NZCV=0b1000 */
-	b.lt	.Lnot_short
-	/* We know that there's at least 64 bytes to zero and that it's safe
-	 * to overrun by 64 bytes.  */
-	mov	count, tmp1
-2:
-	stp	A_l, A_l, [dst]
-	stp	A_l, A_l, [dst, #16]
-	stp	A_l, A_l, [dst, #32]
-	subs	tmp2, tmp2, #64
-	stp	A_l, A_l, [dst, #48]
-	add	dst, dst, #64
-	b.ge	2b
-	/* We've overrun a bit, so adjust dst downwards.  */
-	add	dst, dst, tmp2
-1:
-	sub	count, count, zva_len_x
-3:
-	dc	zva, dst
-	add	dst, dst, zva_len_x
-	subs	count, count, zva_len_x
-	b.ge	3b
-	ands	count, count, zva_bits_x
-	b.ne	.Ltail_maybe_long
+	/* Write the first and last 64 byte aligned block using stp rather
+	   than using DC ZVA.  This is faster on some cores.
+	 */
+L(zva_64):
+	str	q0, [dst, 16]
+	stp	q0, q0, [dst, 32]
+	bic	dst, dst, 63
+	stp	q0, q0, [dst, 64]
+	stp	q0, q0, [dst, 96]
+	sub	count, dstend, dst	/* Count is now 128 too large.	*/
+	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
+	add	dst, dst, 128
+	nop
+1:	dc	zva, dst
+	add	dst, dst, 64
+	subs	count, count, 64
+	b.hi	1b
+	stp	q0, q0, [dst, 0]
+	stp	q0, q0, [dst, 32]
+	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
 	ret
-#ifdef BZERO
-END(bzero)
-#else
+
+	.p2align 3
+L(zva_128):
+	cmp	tmp1w, 5	/* ZVA size is 128 bytes.  */
+	b.ne	L(zva_other)
+
+	str	q0, [dst, 16]
+	stp	q0, q0, [dst, 32]
+	stp	q0, q0, [dst, 64]
+	stp	q0, q0, [dst, 96]
+	bic	dst, dst, 127
+	sub	count, dstend, dst	/* Count is now 128 too large.	*/
+	sub	count, count, 128+128	/* Adjust count and bias for loop.  */
+	add	dst, dst, 128
+1:	dc	zva, dst
+	add	dst, dst, 128
+	subs	count, count, 128
+	b.hi	1b
+	stp	q0, q0, [dstend, -128]
+	stp	q0, q0, [dstend, -96]
+	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
+	ret
+
+L(zva_other):
+	mov	tmp2w, 4
+	lsl	zva_lenw, tmp2w, tmp1w
+	add	tmp1, zva_len, 64	/* Max alignment bytes written.	 */
+	cmp	count, tmp1
+	blo	L(no_zva)
+
+	sub	tmp2, zva_len, 1
+	add	tmp1, dst, zva_len
+	add	dst, dst, 16
+	subs	count, tmp1, dst	/* Actual alignment bytes to write.  */
+	bic	tmp1, tmp1, tmp2	/* Aligned dc zva start address.  */
+	beq	2f
+1:	stp	q0, q0, [dst], 64
+	stp	q0, q0, [dst, -32]
+	subs	count, count, 64
+	b.hi	1b
+2:	mov	dst, tmp1
+	sub	count, dstend, tmp1	/* Remaining bytes to write.  */
+	subs	count, count, zva_len
+	b.lo	4f
+3:	dc	zva, dst
+	add	dst, dst, zva_len
+	subs	count, count, zva_len
+	b.hs	3b
+4:	add	count, count, zva_len
+	b	L(tail64)
+
 END(memset)
-#endif
-
-#ifdef MAYBE_VIRT
-	.bss
-	.p2align 2
-.Lcache_clear:
-	.space 4
-#endif
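The L(try_zva) path is only taken when zeroing (DC ZVA can only write zeros) and count >= 256; the cmp/ccmp pair above encodes exactly those two conditions. The code then decodes dczid_el0: bit 4 (DZP) prohibits DC ZVA, and the low four bits give the block size as log2 of the word count. A hedged C rendition of the decode (reading the register itself would need inline assembly; here it is a parameter):

    #include <stdint.h>

    /* Returns the DC ZVA block size in bytes, or 0 if the instruction
       must not be used. Field BS is log2(words), so bytes = 4 << BS;
       BS == 4 is the common 64-byte case fast-pathed by the assembly. */
    static uint64_t zva_block_bytes(uint64_t dczid_el0) {
        if (dczid_el0 & (1u << 4)) return 0;   /* DZP: ZVA prohibited */
        return 4u << (dczid_el0 & 0xf);
    }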
diff --git a/libc/arch-arm64/generic/bionic/strlen.S b/libc/arch-arm64/generic/bionic/strlen.S
index 3bd9809..6e540fc 100644
--- a/libc/arch-arm64/generic/bionic/strlen.S
+++ b/libc/arch-arm64/generic/bionic/strlen.S
@@ -1,16 +1,16 @@
-/* Copyright (c) 2014, Linaro Limited
+/* Copyright (c) 2013-2015, Linaro Limited
    All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are met:
        * Redistributions of source code must retain the above copyright
-         notice, this list of conditions and the following disclaimer.
+	 notice, this list of conditions and the following disclaimer.
        * Redistributions in binary form must reproduce the above copyright
-         notice, this list of conditions and the following disclaimer in the
-         documentation and/or other materials provided with the distribution.
+	 notice, this list of conditions and the following disclaimer in the
+	 documentation and/or other materials provided with the distribution.
        * Neither the name of the Linaro nor the
-         names of its contributors may be used to endorse or promote products
-         derived from this software without specific prior written permission.
+	 names of its contributors may be used to endorse or promote products
+	 derived from this software without specific prior written permission.
 
    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
@@ -22,16 +22,19 @@
    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
 
 /* Assumptions:
  *
- * ARMv8-a, AArch64
+ * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
  */
 
 #include <private/bionic_asm.h>
 
+/* To test the page crossing code path more thoroughly, compile with
+   -DTEST_PAGE_CROSS - this will force all calls through the slower
+   entry path.  This option is not intended for production use.	 */
+
 /* Arguments and results.  */
 #define srcin		x0
 #define len		x0
@@ -40,87 +43,185 @@
 #define src		x1
 #define data1		x2
 #define data2		x3
-#define data2a		x4
-#define has_nul1	x5
-#define has_nul2	x6
-#define tmp1		x7
-#define tmp2		x8
-#define tmp3		x9
-#define tmp4		x10
-#define zeroones	x11
-#define pos		x12
+#define has_nul1	x4
+#define has_nul2	x5
+#define tmp1		x4
+#define tmp2		x5
+#define tmp3		x6
+#define tmp4		x7
+#define zeroones	x8
+
+#define L(l) .L ## l
+
+	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+	   can be done in parallel across the entire word. A faster check
+	   (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
+	   false hits for characters 129..255.	*/
 
 #define REP8_01 0x0101010101010101
 #define REP8_7f 0x7f7f7f7f7f7f7f7f
 #define REP8_80 0x8080808080808080
 
-	/* Start of critial section -- keep to one 64Byte cache line.  */
-ENTRY(strlen)
-	mov	zeroones, #REP8_01
-	bic	src, srcin, #15
-	ands	tmp1, srcin, #15
-	b.ne	.Lmisaligned
-	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
-	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
-	   can be done in parallel across the entire word.  */
-	/* The inner loop deals with two Dwords at a time.  This has a
-	   slightly higher start-up cost, but we should win quite quickly,
-	   especially on cores with a high number of issue slots per
-	   cycle, as we get much better parallelism out of the operations.  */
-.Lloop:
-	ldp	data1, data2, [src], #16
-.Lrealigned:
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, #REP8_7f
-	sub	tmp3, data2, zeroones
-	orr	tmp4, data2, #REP8_7f
-	bic	has_nul1, tmp1, tmp2
-	bics	has_nul2, tmp3, tmp4
-	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000  */
-	b.eq	.Lloop
-	/* End of critical section -- keep to one 64Byte cache line.  */
-
-	sub	len, src, srcin
-	cbz	has_nul1, .Lnul_in_data2
-#ifdef __AARCH64EB__
-	mov	data2, data1
+#ifdef TEST_PAGE_CROSS
+# define MIN_PAGE_SIZE 15
+#else
+# define MIN_PAGE_SIZE 4096
 #endif
-	sub	len, len, #8
-	mov	has_nul2, has_nul1
-.Lnul_in_data2:
+
+	/* Since strings are short on average, we check the first 16 bytes
+	   of the string for a NUL character.  In order to do an unaligned ldp
+	   safely we have to do a page cross check first.  If there is a NUL
+	   byte we calculate the length from the 2 8-byte words using
+	   conditional select to reduce branch mispredictions (it is unlikely
+	   strlen will be repeatedly called on strings with the same length).
+
+	   If the string is longer than 16 bytes, we align src so don't need
+	   further page cross checks, and process 32 bytes per iteration
+	   using the fast NUL check.  If we encounter non-ASCII characters,
+	   fallback to a second loop using the full NUL check.
+
+	   If the page cross check fails, we read 16 bytes from an aligned
+	   address, remove any characters before the string, and continue
+	   in the main loop using aligned loads.  Since strings crossing a
+	   page in the first 16 bytes are rare (probability of
+	   16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
+
+	   AArch64 systems have a minimum page size of 4k.  We don't bother
+	   checking for larger page sizes - the cost of setting up the correct
+	   page size is just not worth the extra gain from a small reduction in
+	   the cases taking the slow path.  Note that we only care about
+	   whether the first fetch, which may be misaligned, crosses a page
+	   boundary.  */
+
+ENTRY(strlen)
+	and	tmp1, srcin, MIN_PAGE_SIZE - 1
+	mov	zeroones, REP8_01
+	cmp	tmp1, MIN_PAGE_SIZE - 16
+	b.gt	L(page_cross)
+	ldp	data1, data2, [srcin]
 #ifdef __AARCH64EB__
 	/* For big-endian, carry propagation (if the final byte in the
-	   string is 0x01) means we cannot use has_nul directly.  The
-	   easiest way to get the correct byte is to byte-swap the data
-	   and calculate the syndrome a second time.  */
+	   string is 0x01) means we cannot use has_nul1/2 directly.
+	   Since we expect strings to be small and early-exit,
+	   byte-swap the data now so has_nul1/2 will be correct.  */
+	rev	data1, data1
 	rev	data2, data2
-	sub	tmp1, data2, zeroones
-	orr	tmp2, data2, #REP8_7f
-	bic	has_nul2, tmp1, tmp2
 #endif
-	sub	len, len, #8
-	rev	has_nul2, has_nul2
-	clz	pos, has_nul2
-	add	len, len, pos, lsr #3		/* Bits to bytes.  */
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, REP8_7f
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, REP8_7f
+	bics	has_nul1, tmp1, tmp2
+	bic	has_nul2, tmp3, tmp4
+	ccmp	has_nul2, 0, 0, eq
+	beq	L(main_loop_entry)
+
+	/* Enter with C = has_nul1 == 0.  */
+	csel	has_nul1, has_nul1, has_nul2, cc
+	mov	len, 8
+	rev	has_nul1, has_nul1
+	clz	tmp1, has_nul1
+	csel	len, xzr, len, cc
+	add	len, len, tmp1, lsr 3
 	ret
 
-.Lmisaligned:
-	cmp	tmp1, #8
-	neg	tmp1, tmp1
-	ldp	data1, data2, [src], #16
-	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */
-	mov	tmp2, #~0
+	/* The inner loop processes 32 bytes per iteration and uses the fast
+	   NUL check.  If we encounter non-ASCII characters, use a second
+	   loop with the accurate NUL check.  */
+	.p2align 4
+L(main_loop_entry):
+	bic	src, srcin, 15
+	sub	src, src, 16
+L(main_loop):
+	ldp	data1, data2, [src, 32]!
+L(page_cross_entry):
+	sub	tmp1, data1, zeroones
+	sub	tmp3, data2, zeroones
+	orr	tmp2, tmp1, tmp3
+	tst	tmp2, zeroones, lsl 7
+	bne	1f
+	ldp	data1, data2, [src, 16]
+	sub	tmp1, data1, zeroones
+	sub	tmp3, data2, zeroones
+	orr	tmp2, tmp1, tmp3
+	tst	tmp2, zeroones, lsl 7
+	beq	L(main_loop)
+	add	src, src, 16
+1:
+	/* The fast check failed, so do the slower, accurate NUL check.	 */
+	orr	tmp2, data1, REP8_7f
+	orr	tmp4, data2, REP8_7f
+	bics	has_nul1, tmp1, tmp2
+	bic	has_nul2, tmp3, tmp4
+	ccmp	has_nul2, 0, 0, eq
+	beq	L(nonascii_loop)
+
+	/* Enter with C = has_nul1 == 0.  */
+L(tail):
 #ifdef __AARCH64EB__
-	/* Big-endian.  Early bytes are at MSB.  */
-	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
+	/* For big-endian, carry propagation (if the final byte in the
+	   string is 0x01) means we cannot use has_nul1/2 directly.  The
+	   easiest way to get the correct byte is to byte-swap the data
+	   and calculate the syndrome a second time.  */
+	csel	data1, data1, data2, cc
+	rev	data1, data1
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, REP8_7f
+	bic	has_nul1, tmp1, tmp2
+#else
+	csel	has_nul1, has_nul1, has_nul2, cc
+#endif
+	sub	len, src, srcin
+	rev	has_nul1, has_nul1
+	add	tmp2, len, 8
+	clz	tmp1, has_nul1
+	csel	len, len, tmp2, cc
+	add	len, len, tmp1, lsr 3
+	ret
+
+L(nonascii_loop):
+	ldp	data1, data2, [src, 16]!
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, REP8_7f
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, REP8_7f
+	bics	has_nul1, tmp1, tmp2
+	bic	has_nul2, tmp3, tmp4
+	ccmp	has_nul2, 0, 0, eq
+	bne	L(tail)
+	ldp	data1, data2, [src, 16]!
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, REP8_7f
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, REP8_7f
+	bics	has_nul1, tmp1, tmp2
+	bic	has_nul2, tmp3, tmp4
+	ccmp	has_nul2, 0, 0, eq
+	beq	L(nonascii_loop)
+	b	L(tail)
+
+	/* Load 16 bytes from [srcin & ~15] and force the bytes that precede
+	   srcin to 0x7f, so we ignore any NUL bytes before the string.
+	   Then continue in the aligned loop.  */
+L(page_cross):
+	bic	src, srcin, 15
+	ldp	data1, data2, [src]
+	lsl	tmp1, srcin, 3
+	mov	tmp4, -1
+#ifdef __AARCH64EB__
+	/* Big-endian.	Early bytes are at MSB.	 */
+	lsr	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
 #else
 	/* Little-endian.  Early bytes are at LSB.  */
-	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
+	lsl	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
 #endif
-	orr	data1, data1, tmp2
-	orr	data2a, data2, tmp2
-	csinv	data1, data1, xzr, le
-	csel	data2, data2, data2a, le
-	b	.Lrealigned
+	orr	tmp1, tmp1, REP8_80
+	orn	data1, data1, tmp1
+	orn	tmp2, data2, tmp1
+	tst	srcin, 8
+	csel	data1, data1, tmp4, eq
+	csel	data2, data2, tmp2, eq
+	b	L(page_cross_entry)
 
 END(strlen)
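To make the NUL-detection identity in the comments concrete: (X - 1) & ~X has bit 7 set for a byte X exactly when X == 0, and the REP8 constants apply the test across all eight bytes at once. A small self-checking C example of the accurate form the assembly computes with sub/orr/bic:

    #include <stdint.h>
    #include <assert.h>

    #define REP8_01 0x0101010101010101ull
    #define REP8_7f 0x7f7f7f7f7f7f7f7full

    /* Non-zero iff some byte of w is zero; this is the
       bic has_nul, (w - REP8_01), (w | REP8_7f) computation. */
    static uint64_t has_nul(uint64_t w) {
        return (w - REP8_01) & ~(w | REP8_7f);
    }

    int main(void) {
        assert(has_nul(0x0000006f6c6c6548ull));   /* "Hello\0\0\0", little-endian */
        assert(!has_nul(0x2121212121212121ull));  /* "!!!!!!!!", no NUL */
        /* The faster main-loop check, (w - REP8_01) & REP8_80, false-hits
           on bytes 0x81..0xff, which is why non-ASCII data falls back to
           this accurate form. */
        return 0;
    }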
diff --git a/libc/arch-arm64/generic/bionic/strrchr.S b/libc/arch-arm64/generic/bionic/strrchr.S
new file mode 100644
index 0000000..409bc71
--- /dev/null
+++ b/libc/arch-arm64/generic/bionic/strrchr.S
@@ -0,0 +1,54 @@
+/*
+   strrchr - find last instance of a character in a string
+
+   Copyright (c) 2014, ARM Limited
+   All rights Reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+       * Redistributions of source code must retain the above copyright
+         notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above copyright
+         notice, this list of conditions and the following disclaimer in the
+         documentation and/or other materials provided with the distribution.
+       * Neither the name of the company nor the names of its contributors
+         may be used to endorse or promote products derived from this
+         software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <private/bionic_asm.h>
+
+/*
+ * Find the last occurrence of a character in a string.
+ *
+ * Parameters:
+ *	x0 - str
+ *	x1 - c
+ * Returns:
+ *	x0 - address of last occurrence of 'c' or 0
+ */
+ENTRY(strrchr)
+	mov	x3, #0
+	and	w1, w1, #0xff
+1:	ldrb	w2, [x0], #1
+	cbz	w2, 2f
+	cmp	w2, w1
+	b.ne	1b
+	sub	x3, x0, #1
+	b	1b
+2:	mov	x0, x3
+	ret
+END(strrchr)
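This is a deliberately simple byte-at-a-time loop. An equivalent C rendition, for reference (note that, like the assembly, it tests for the terminator before the match, so searching for '\0' returns NULL rather than a pointer to the terminator):

    #include <stddef.h>

    /* Remember the most recent match; report it when the scan hits NUL. */
    static char* strrchr_sketch(const char* s, int c) {
        const char* last = NULL;
        unsigned char ch = (unsigned char)c;  /* mirrors 'and w1, w1, #0xff' */
        for (;; ++s) {
            unsigned char cur = *(const unsigned char*)s;
            if (cur == '\0') return (char*)last;
            if (cur == ch) last = s;
        }
    }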
diff --git a/libc/arch-arm64/kryo/bionic/memcpy.S b/libc/arch-arm64/kryo/bionic/memcpy.S
new file mode 100644
index 0000000..87e1b3b
--- /dev/null
+++ b/libc/arch-arm64/kryo/bionic/memcpy.S
@@ -0,0 +1,65 @@
+/*
+ * Copyright (C) 2008 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+// Prototype: void *memcpy (void *dst, const void *src, size_t count).
+
+#include <private/bionic_asm.h>
+#include <private/libc_events.h>
+
+ENTRY(__memcpy_chk)
+  cmp   x2, x3
+  b.hi  __memcpy_chk_fail
+
+  // Fall through to memcpy...
+  b memcpy
+END(__memcpy_chk)
+
+        .align  6
+ENTRY(memcpy)
+  #include "memcpy_base.S"
+END(memcpy)
+
+ENTRY_PRIVATE(__memcpy_chk_fail)
+  // Preserve for accurate backtrace.
+  stp  x29, x30, [sp, -16]!
+  .cfi_def_cfa_offset 16
+  .cfi_rel_offset x29, 0
+  .cfi_rel_offset x30, 8
+
+  adrp  x0, error_string
+  add   x0, x0, :lo12:error_string
+  ldr   x1, error_code
+  bl    __fortify_chk_fail
+error_code:
+  .word   BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW
+END(__memcpy_chk_fail)
+
+  .data
+  .align 2
+error_string:
+  .string "memcpy: prevented write past end of buffer"
diff --git a/libc/arch-arm64/kryo/bionic/memcpy_base.S b/libc/arch-arm64/kryo/bionic/memcpy_base.S
new file mode 100644
index 0000000..a951afb
--- /dev/null
+++ b/libc/arch-arm64/kryo/bionic/memcpy_base.S
@@ -0,0 +1,244 @@
+/* Copyright (c) 2015 The Linux Foundation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of The Linux Foundation nor the names of its contributors may
+ *       be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef PLDOFFS
+#undef PLDOFFS
+#endif
+#define PLDOFFS		(16)
+
+#ifdef PLDTHRESH
+#undef PLDTHRESH
+#endif
+#define PLDTHRESH (PLDOFFS)
+
+#ifdef BBTHRESH
+#undef BBTHRESH
+#endif
+#define BBTHRESH (2048/128)
+
+#if (PLDOFFS < 1)
+#error Routine does not support offsets less than 1
+#endif
+#if (PLDTHRESH < PLDOFFS)
+#error PLD threshold must be greater than or equal to the PLD offset
+#endif
+
+#ifdef PLDSIZE
+#undef PLDSIZE
+#endif
+#define PLDSIZE	(128)
+
+kryo_bb_memcpy:
+	mov	x11, x0
+	cmp	x2, #4
+	blo	kryo_bb_lt4
+	cmp	x2, #16
+	blo	kryo_bb_lt16
+	cmp	x2, #32
+	blo	kryo_bb_16
+	cmp	x2, #64
+	blo	kryo_bb_copy_32_a
+	cmp	x2, #128
+	blo	kryo_bb_copy_64_a
+
+	// count >= 128 here; spend up to 127 bytes to reach 128-byte source alignment
+	neg	x3, x1			// calculate count to get SOURCE aligned
+	ands	x3, x3, #0x7F
+	b.eq	kryo_bb_source_aligned	// already aligned
+	// alignment fixup, small to large (favorable alignment)
+	tbz	x3, #0, 1f
+	ldrb	w5, [x1], #1
+	strb	w5, [x0], #1
+1:	tbz	x3, #1, 2f
+	ldrh	w6, [x1], #2
+	strh	w6, [x0], #2
+2:	tbz	x3, #2, 3f
+	ldr	w8, [x1], #4
+	str	w8, [x0], #4
+3:	tbz	x3, #3, 4f
+	ldr	x9, [x1], #8
+	str	x9, [x0], #8
+4:	tbz	x3, #4, 5f
+	ldr	q7, [x1], #16
+	str	q7, [x0], #16
+5:	tbz	x3, #5, 55f
+	ldp	q0, q1, [x1], #32
+	stp	q0, q1, [x0], #32
+55:	tbz	x3, #6, 6f
+	ldp	q0, q1, [x1], #32
+	ldp	q2, q3, [x1], #32
+	stp	q0, q1, [x0], #32
+	stp	q2, q3, [x0], #32
+6:	subs	x2, x2, x3		// fixup count after alignment
+	b.eq	kryo_bb_exit
+	cmp	x2, #128
+	blo	kryo_bb_copy_64_a
+kryo_bb_source_aligned:
+	lsr	x12, x2, #7
+	cmp	x12, #PLDTHRESH
+	bls	kryo_bb_copy_128_loop_nopld
+
+	cmp	x12, #BBTHRESH
+	bls	kryo_bb_prime_pump
+
+	add	x14, x0, #0x400
+	add	x9,  x1, #(PLDOFFS*PLDSIZE)
+	sub	x14, x14, x9
+	lsl	x14, x14, #(21+32)
+	lsr	x14, x14, #(21+32)
+	add	x14, x14, #(PLDOFFS*PLDSIZE)
+	cmp	x12, x14, lsr #7
+	bls	kryo_bb_prime_pump
+
+	mov	x9, #(PLDOFFS)
+	lsr     x13, x14, #7
+	subs    x9, x13, x9
+	bls	kryo_bb_prime_pump
+
+	add	x10, x1, x14
+	bic	x10, x10, #0x7F		// Round to multiple of PLDSIZE
+
+	sub	x12, x12, x14, lsr #7
+	cmp	x9, x12
+	sub     x13, x12, x9
+	csel    x12, x13, x12, LS
+	csel    x9, x12, x9, HI
+	csel    x12, xzr, x12, HI
+
+	prfm	PLDL1STRM, [x1, #((PLDOFFS-1)*PLDSIZE)]
+	prfm	PLDL1STRM, [x1, #((PLDOFFS-1)*PLDSIZE+64)]
+kryo_bb_copy_128_loop_outer_doublepld:
+	prfm	PLDL1STRM, [x1, #((PLDOFFS)*PLDSIZE)]
+	prfm	PLDL1STRM, [x1, #((PLDOFFS)*PLDSIZE)+64]
+	subs	x9, x9, #1
+	ldp	q0, q1, [x1], #32
+	ldp	q2, q3, [x1], #32
+	ldp	q4, q5, [x1], #32
+	ldp	q6, q7, [x1], #32
+	prfm	PLDL1KEEP, [x10]
+	prfm	PLDL1KEEP, [x10, #64]
+	add	x10, x10, #128
+	stp	q0, q1, [x0], #32
+	stp	q2, q3, [x0], #32
+	stp	q4, q5, [x0], #32
+	stp	q6, q7, [x0], #32
+	bne	kryo_bb_copy_128_loop_outer_doublepld
+	cmp	x12, #0
+	beq	kryo_bb_pop_before_nopld
+	cmp	x12, #(448*1024/128)
+	bls	kryo_bb_copy_128_loop_outer
+
+kryo_bb_copy_128_loop_ddr:
+	subs	x12, x12, #1
+	ldr	x3, [x10], #128
+	ldp	q0, q1, [x1], #32
+	ldp	q2, q3, [x1], #32
+	ldp	q4, q5, [x1], #32
+	ldp	q6, q7, [x1], #32
+	stp	q0, q1, [x0], #32
+	stp	q2, q3, [x0], #32
+	stp	q4, q5, [x0], #32
+	stp	q6, q7, [x0], #32
+	bne	kryo_bb_copy_128_loop_ddr
+	b	kryo_bb_pop_before_nopld
+
+kryo_bb_prime_pump:
+	mov	x14, #(PLDOFFS*PLDSIZE)
+	add	x10, x1, #(PLDOFFS*PLDSIZE)
+	bic	x10, x10, #0x7F
+	sub	x12, x12, #PLDOFFS
+	prfum	PLDL1KEEP, [x10, #(-1*PLDSIZE)]
+	prfum	PLDL1KEEP, [x10, #(-1*PLDSIZE+64)]
+	cmp	x12, #(448*1024/128)
+	bhi	kryo_bb_copy_128_loop_ddr
+
+kryo_bb_copy_128_loop_outer:
+	subs	x12, x12, #1
+	prfm	PLDL1KEEP, [x10]
+	prfm	PLDL1KEEP, [x10, #64]
+	ldp	q0, q1, [x1], #32
+	ldp	q2, q3, [x1], #32
+	ldp	q4, q5, [x1], #32
+	ldp	q6, q7, [x1], #32
+	add	x10, x10, #128
+	stp	q0, q1, [x0], #32
+	stp	q2, q3, [x0], #32
+	stp	q4, q5, [x0], #32
+	stp	q6, q7, [x0], #32
+	bne	kryo_bb_copy_128_loop_outer
+
+kryo_bb_pop_before_nopld:
+	lsr	x12, x14, #7
+kryo_bb_copy_128_loop_nopld:
+	ldp	q0, q1, [x1], #32
+	ldp	q2, q3, [x1], #32
+	ldp	q4, q5, [x1], #32
+	ldp	q6, q7, [x1], #32
+	subs	x12, x12, #1
+	stp	q0, q1, [x0], #32
+	stp	q2, q3, [x0], #32
+	stp	q4, q5, [x0], #32
+	stp	q6, q7, [x0], #32
+	bne	kryo_bb_copy_128_loop_nopld
+	ands	x2, x2, #0x7f
+	beq	kryo_bb_exit
+
+kryo_bb_copy_64_a:
+	tbz	x2, #6, kryo_bb_copy_32_a
+	ldp	q0, q1, [x1], #32
+	ldp	q2, q3, [x1], #32
+	stp	q0, q1, [x0], #32
+	stp	q2, q3, [x0], #32
+kryo_bb_copy_32_a:
+	tbz	x2, #5, kryo_bb_16
+	ldp	q0, q1, [x1], #32
+	stp	q0, q1, [x0], #32
+kryo_bb_16:
+	tbz	x2, #4, kryo_bb_lt16
+	ldr	q7, [x1], #16
+	str	q7, [x0], #16
+	ands	x2, x2, #0x0f
+	beq	kryo_bb_exit
+kryo_bb_lt16:
+	tbz	x2, #3, kryo_bb_lt8
+	ldr	x3, [x1], #8
+	str	x3, [x0], #8
+kryo_bb_lt8:
+	tbz	x2, #2, kryo_bb_lt4
+	ldr	w3, [x1], #4
+	str	w3, [x0], #4
+kryo_bb_lt4:
+	tbz	x2, #1, kryo_bb_lt2
+	ldrh	w3, [x1], #2
+	strh	w3, [x0], #2
+kryo_bb_lt2:
+	tbz	x2, #0, kryo_bb_exit
+	ldrb	w3, [x1], #1
+	strb	w3, [x0], #1
+kryo_bb_exit:
+	mov	x0, x11
+	ret
+
diff --git a/libc/arch-arm64/kryo/kryo.mk b/libc/arch-arm64/kryo/kryo.mk
new file mode 100644
index 0000000..7f3fe8c
--- /dev/null
+++ b/libc/arch-arm64/kryo/kryo.mk
@@ -0,0 +1,5 @@
+libc_bionic_src_files_arm64 += \
+    arch-arm64/kryo/bionic/memcpy.S \
+
+libc_bionic_src_files_exclude_arm64 += \
+    arch-arm64/generic/bionic/memcpy.S \
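As with the scorpion variant above, a device selects kryo from its BoardConfig.mk; an illustrative 64-bit fragment (the 32-bit variant pairing is just an example):

    # BoardConfig.mk (illustrative)
    TARGET_ARCH := arm64
    TARGET_CPU_VARIANT := kryo
    TARGET_2ND_ARCH := arm
    TARGET_2ND_CPU_VARIANT := krait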
diff --git a/libc/bionic/libc_init_common.cpp b/libc/bionic/libc_init_common.cpp
index 4f1226d..71c0b5f 100644
--- a/libc/bionic/libc_init_common.cpp
+++ b/libc/bionic/libc_init_common.cpp
@@ -243,6 +243,7 @@
     "LD_ORIGIN_PATH",
     "LD_PRELOAD",
     "LD_PROFILE",
+    "LD_SHIM_LIBS",
     "LD_SHOW_AUXV",
     "LD_USE_LOAD_BIAS",
     "LOCALDOMAIN",
diff --git a/libc/bionic/mmap.cpp b/libc/bionic/mmap.cpp
index 57a8cdf..9919f40 100644
--- a/libc/bionic/mmap.cpp
+++ b/libc/bionic/mmap.cpp
@@ -38,6 +38,11 @@
 extern "C" void*  __mmap2(void*, size_t, int, int, int, size_t);
 
 #define MMAP2_SHIFT 12 // 2**12 == 4096
+#ifdef LEGACY_MMAP
+#define TO_64(a) ((a) & 0x00000000ffffffff)
+#else
+#define TO_64(a) (a)
+#endif
 
 static bool kernel_has_MADV_MERGEABLE = true;
 
@@ -73,5 +78,5 @@
 }
 
 void* mmap(void* addr, size_t size, int prot, int flags, int fd, off_t offset) {
-  return mmap64(addr, size, prot, flags, fd, static_cast<off64_t>(offset));
+  return mmap64(addr, size, prot, flags, fd, TO_64(static_cast<off64_t>(offset)));
 }
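The mask matters on 32-bit targets, where off_t is a signed 32-bit type: an offset with the top bit set sign-extends when widened to off64_t, and mmap64 rejects negative offsets with EINVAL. A worked example of what TO_64 changes under -DLEGACY_MMAP (illustrative values):

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        int32_t off32 = (int32_t)0x80000000;   /* 2 GiB offset, "negative" as int32 */
        int64_t widened = (int64_t)off32;      /* sign-extends to 0xffffffff80000000 */
        int64_t masked = widened & 0x00000000ffffffffll;
        printf("%llx -> %llx\n",
               (unsigned long long)widened, (unsigned long long)masked);
        /* prints: ffffffff80000000 -> 80000000 */
        return 0;
    }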
diff --git a/libc/dns/net/getaddrinfo.c b/libc/dns/net/getaddrinfo.c
index fd6c004..bb483b6 100644
--- a/libc/dns/net/getaddrinfo.c
+++ b/libc/dns/net/getaddrinfo.c
@@ -108,6 +108,8 @@
 #include <stdarg.h>
 #include "nsswitch.h"
 
+#include "hosts_cache.h"
+
 #ifdef ANDROID_CHANGES
 #include <sys/system_properties.h>
 #endif /* ANDROID_CHANGES */
@@ -1805,10 +1807,14 @@
 			return -1;
 		}
 	}
-	if (mark != MARK_UNSET && setsockopt(sock, SOL_SOCKET, SO_MARK, &mark, sizeof(mark)) < 0)
+	if (mark != MARK_UNSET && setsockopt(sock, SOL_SOCKET, SO_MARK, &mark, sizeof(mark)) < 0) {
+		close(sock);
 		return 0;
-	if (uid > 0 && uid != NET_CONTEXT_INVALID_UID && fchown(sock, uid, (gid_t)-1) < 0)
+	}
+	if (uid > 0 && uid != NET_CONTEXT_INVALID_UID && fchown(sock, uid, (gid_t)-1) < 0) {
+		close(sock);
 		return 0;
+	}
 	do {
 		ret = __connect(sock, addr, len);
 	} while (ret == -1 && errno == EINTR);
@@ -2117,6 +2123,14 @@
 	name = va_arg(ap, char *);
 	pai = va_arg(ap, struct addrinfo *);
 
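+	/* Try the hosts cache first; EAI_SYSTEM means the cache is not
+	 * usable, so fall through to the regular hosts file parse below. */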
+	memset(&sentinel, 0, sizeof(sentinel));
+	cur = &sentinel;
+	int gai_error = hc_getaddrinfo(name, NULL, pai, &cur);
+	if (gai_error != EAI_SYSTEM) {
+		*((struct addrinfo **)rv) = sentinel.ai_next;
+		return (gai_error == 0 ? NS_SUCCESS : NS_NOTFOUND);
+	}
+
 //	fprintf(stderr, "_files_getaddrinfo() name = '%s'\n", name);
 	memset(&sentinel, 0, sizeof(sentinel));
 	cur = &sentinel;
diff --git a/libc/dns/net/hosts_cache.c b/libc/dns/net/hosts_cache.c
new file mode 100644
index 0000000..deafb78
--- /dev/null
+++ b/libc/dns/net/hosts_cache.c
@@ -0,0 +1,524 @@
+/*
+ * Copyright (C) 2016 The CyanogenMod Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <fcntl.h>
+#include <netdb.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <strings.h>
+#include <sys/file.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <utime.h>
+#include <pthread.h>
+
+#include <netinet/in6.h>
+#include <arpa/inet.h>
+
+#include "hostent.h"
+#include "resolv_private.h"
+
+#define MAX_ADDRLEN	(INET6_ADDRSTRLEN - (1 + 5))
+#define MAX_HOSTLEN	MAXHOSTNAMELEN
+
+#define ESTIMATED_LINELEN	32
+#define HCFILE_ALLOC_SIZE	256
+
+/* From sethostent.c */
+#define ALIGNBYTES	(sizeof(uintptr_t) - 1)
+#define ALIGN(p)	(((uintptr_t)(p) + ALIGNBYTES) &~ ALIGNBYTES)
+
+/*
+ * Host cache entry for hcfile.c_data.
+ * Offsets are into hcfile.h_data.
+ * Strings are *not* NUL-terminated; they end at whitespace (isspace) or '#'.
+ * Use hstr* functions with these.
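+ *
+ * Example: the hosts line
+ *
+ *	10.0.0.1	router	# comment
+ *
+ * yields one entry whose addr offset points at "10.0.0.1" and whose name
+ * offset points at "router" (hstrlen(name) == 6).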
+ */
+struct hcent
+{
+	uint32_t	addr;
+	uint32_t	name;
+};
+
+/*
+ * Overall host cache file state.
+ */
+struct hcfile
+{
+	int		h_fd;
+	struct stat	h_st;
+	char		*h_data;
+
+	uint32_t	c_alloc;
+	uint32_t	c_len;
+	struct hcent	*c_data;
+};
+static struct hcfile hcfile;
+static pthread_mutex_t hclock = PTHREAD_MUTEX_INITIALIZER;
+
+static size_t hstrlen(const char *s)
+{
+	const char *p = s;
+	while (*p && *p != '#' && !isspace(*p))
+		++p;
+	return p - s;
+}
+
+static int hstrcmp(const char *a, const char *b)
+{
+	size_t alen = hstrlen(a);
+	size_t blen = hstrlen(b);
+	int res = strncmp(a, b, MIN(alen, blen));
+	if (res == 0)
+		res = alen - blen;
+	return res;
+}
+
+static char *hstrcpy(char *dest, const char *src)
+{
+	size_t len = hstrlen(src);
+	memcpy(dest, src, len);
+	dest[len] = '\0';
+	return dest;
+}
+
+static char *hstrdup(const char *s)
+{
+	size_t len = hstrlen(s);
+	char *dest = (char *)malloc(len + 1);
+	if (!dest)
+		return NULL;
+	memcpy(dest, s, len);
+	dest[len] = '\0';
+	return dest;
+}
+
+static int cmp_hcent_name(const void *a, const void *b)
+{
+	struct hcent *ea = (struct hcent *)a;
+	const char *na = hcfile.h_data + ea->name;
+	struct hcent *eb = (struct hcent *)b;
+	const char *nb = hcfile.h_data + eb->name;
+
+	return hstrcmp(na, nb);
+}
+
+static struct hcent *_hcfindname(const char *name)
+{
+	size_t first, last, mid;
+	struct hcent *cur = NULL;
+	int cmp;
+
+	if (hcfile.c_len == 0)
+		return NULL;
+
+	first = 0;
+	last = hcfile.c_len - 1;
+	mid = (first + last) / 2;
+	while (first <= last) {
+		cur = hcfile.c_data + mid;
+		cmp = hstrcmp(hcfile.h_data + cur->name, name);
+		if (cmp == 0)
+			goto found;
+		if (cmp < 0)
+			first = mid + 1;
+		else {
+			if (mid > 0)
+				last = mid - 1;
+			else
+				return NULL;
+		}
+		mid = (first + last) / 2;
+	}
+	return NULL;
+
+found:
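+	/*
+	 * Entries are sorted by name, so duplicates are adjacent; rewind to
+	 * the first entry with this name so callers can walk all of them.
+	 */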
+	while (cur > hcfile.c_data) {
+		struct hcent *prev = cur - 1;
+		cmp = cmp_hcent_name(cur, prev);
+		if (cmp)
+			break;
+		cur = prev;
+	}
+
+	return cur;
+}
+
+/*
+ * Find next name on line, if any.
+ *
+ * Assumes that line is terminated by LF.
+ */
+static const char *_hcnextname(const char *name)
+{
+	while (!isspace(*name)) {
+		if (*name == '#')
+			return NULL;
+		++name;
+	}
+	while (isspace(*name)) {
+		if (*name == '\n')
+			return NULL;
+		++name;
+	}
+	if (*name == '#')
+		return NULL;
+	return name;
+}
+
+static int _hcfilemmap(void)
+{
+	struct stat st;
+	int h_fd;
+	char *h_addr;
+	const char *p, *pend;
+	uint32_t c_alloc;
+
+	h_fd = open(_PATH_HOSTS, O_RDONLY);
+	if (h_fd < 0)
+		return -1;
+	if (flock(h_fd, LOCK_EX) != 0) {
+		close(h_fd);
+		return -1;
+	}
+
+	if (hcfile.h_data) {
+		memset(&st, 0, sizeof(st));
+		if (fstat(h_fd, &st) == 0) {
+			if (st.st_size == hcfile.h_st.st_size &&
+			    st.st_mtime == hcfile.h_st.st_mtime) {
+				flock(h_fd, LOCK_UN);
+				close(h_fd);
+				return 0;
+			}
+		}
+		free(hcfile.c_data);
+		munmap(hcfile.h_data, hcfile.h_st.st_size);
+		close(hcfile.h_fd);
+		memset(&hcfile, 0, sizeof(struct hcfile));
+	}
+
+	if (fstat(h_fd, &st) != 0) {
+		flock(h_fd, LOCK_UN);
+		close(h_fd);
+		return -1;
+	}
+	h_addr = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, h_fd, 0);
+	if (h_addr == MAP_FAILED) {
+		flock(h_fd, LOCK_UN);
+		close(h_fd);
+		return -1;
+	}
+
+	hcfile.h_fd = h_fd;
+	hcfile.h_st = st;
+	hcfile.h_data = h_addr;
+
+	c_alloc = 0;
+	/*
+	 * Do an initial allocation if the file is "large".  Estimate
+	 * 32 bytes per line and define "large" as more than half of
+	 * the alloc growth size (256 entries), i.e. 4 KiB or more.
+	 */
+	if (st.st_size >= ESTIMATED_LINELEN * HCFILE_ALLOC_SIZE / 2) {
+		c_alloc = st.st_size / ESTIMATED_LINELEN;
+		hcfile.c_data = malloc(c_alloc * sizeof(struct hcent));
+		if (!hcfile.c_data) {
+			goto oom;
+		}
+	}
+
+	p = (const char *)h_addr;
+	pend = p + st.st_size;
+	while (p < pend) {
+		const char *eol, *addr, *name;
+		size_t len;
+		addr = p;
+		eol = memchr(p, '\n', pend - p);
+		if (!eol)
+			break;
+		p = eol + 1;
+		if (*addr == '#' || *addr == '\n')
+			continue;
+		len = hstrlen(addr);
+		if (len > MAX_ADDRLEN)
+			continue;
+		name = addr + len;
+		while (name < eol && isspace(*name))
+			++name;
+		while (name < eol) {
+			len = hstrlen(name);
+			if (len == 0)
+				break;
+			if (len < MAX_HOSTLEN) {
+				struct hcent *ent;
+				if (c_alloc <= hcfile.c_len) {
+					struct hcent *c_data;
+					c_alloc += HCFILE_ALLOC_SIZE;
+					c_data = realloc(hcfile.c_data, c_alloc * sizeof(struct hcent));
+					if (!c_data) {
+						goto oom;
+					}
+					hcfile.c_data = c_data;
+				}
+				ent = hcfile.c_data + hcfile.c_len;
+				ent->addr = addr - h_addr;
+				ent->name = name - h_addr;
+				++hcfile.c_len;
+			}
+			name += len;
+			while (name < eol && isspace(*name))
+				++name;
+		}
+	}
+
+	qsort(hcfile.c_data, hcfile.c_len,
+	    sizeof(struct hcent), cmp_hcent_name);
+
+	flock(h_fd, LOCK_UN);
+
+	return 0;
+
+oom:
+	free(hcfile.c_data);
+	munmap(hcfile.h_data, hcfile.h_st.st_size);
+	flock(hcfile.h_fd, LOCK_UN);
+	close(hcfile.h_fd);
+	memset(&hcfile, 0, sizeof(struct hcfile));
+	return -1;
+}
+
+/*
+ * Caching version of getaddrinfo.
+ *
+ * If we find the requested host name in the cache, use getaddrinfo to
+ * populate the result for each address we find.
+ *
+ * Note glibc and bionic differ in the handling of ai_canonname.  POSIX
+ * says that ai_canonname is only populated in the first result entry.
+ * glibc does this.  bionic populates ai_canonname in all result entries.
+ * We choose the POSIX/glibc way here.
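+ *
+ * The caller passes a sentinel entry via *result; any entries built here
+ * are chained onto (*result)->ai_next (see _files_getaddrinfo).  Returns
+ * 0 on success and an EAI_* code otherwise; EAI_SYSTEM means the cache
+ * is unavailable and the caller should fall back to a normal lookup.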
+ */
+int hc_getaddrinfo(const char *host, const char *service,
+		   const struct addrinfo *hints,
+		   struct addrinfo **result)
+{
+	int ret = 0;
+	struct hcent *ent, *cur;
+	struct addrinfo *ai;
+	struct addrinfo rhints;
+	struct addrinfo *last;
+	int canonname = 0;
+	int cmp;
+
+	if (getenv("ANDROID_HOSTS_CACHE_DISABLE") != NULL)
+		return EAI_SYSTEM;
+
+	/* Avoid needless work and recursion */
+	if (hints && (hints->ai_flags & AI_NUMERICHOST))
+		return EAI_SYSTEM;
+	if (!host)
+		return EAI_SYSTEM;
+
+	pthread_mutex_lock(&hclock);
+
+	if (_hcfilemmap() != 0) {
+		ret = EAI_SYSTEM;
+		goto out;
+	}
+	ent = _hcfindname(host);
+	if (!ent) {
+		ret = EAI_NONAME;
+		goto out;
+	}
+
+	if (hints) {
+		canonname = (hints->ai_flags & AI_CANONNAME);
+		memcpy(&rhints, hints, sizeof(rhints));
+		rhints.ai_flags &= ~AI_CANONNAME;
+	}
+	else {
+		memset(&rhints, 0, sizeof(rhints));
+	}
+	rhints.ai_flags |= AI_NUMERICHOST;
+
+	last = NULL;
+	cur = ent;
+	do {
+		char addrstr[MAX_ADDRLEN + 1];	/* +1: hstrcpy appends a NUL */
+		struct addrinfo *res;
+
+		hstrcpy(addrstr, hcfile.h_data + cur->addr);
+
+		if (getaddrinfo(addrstr, service, &rhints, &res) == 0) {
+			if (!last)
+				(*result)->ai_next = res;
+			else
+				last->ai_next = res;
+			last = res;
+			while (last->ai_next)
+				last = last->ai_next;
+		}
+
+		if (cur + 1 >= hcfile.c_data + hcfile.c_len)
+			break;
+		cmp = cmp_hcent_name(cur, cur + 1);
+		++cur;
+	} while (!cmp);
+
+	if (last == NULL) {
+		/* This check is equivalent to (*result)->ai_next == NULL */
+		ret = EAI_NODATA;
+		goto out;
+	}
+
+	if (canonname) {
+		ai = (*result)->ai_next;
+		free(ai->ai_canonname);
+		ai->ai_canonname = hstrdup(hcfile.h_data + ent->name);
+	}
+
+out:
+	pthread_mutex_unlock(&hclock);
+	return ret;
+}
+
+/*
+ * Caching version of gethtbyname.
+ *
+ * Note glibc and bionic differ in the handling of aliases.  glibc returns
+ * all aliases for all entries, regardless of whether they match h_addrtype.
+ * bionic returns only the aliases for the first hosts entry.  We return
+ * the aliases of every entry that matches the requested address family.
+ *
+ * Additionally, if an alias is IPv6 and the primary name for an alias also
+ * has an IPv4 entry, glibc will return the IPv4 address(es), but bionic
+ * will not.  Neither do we.
+ */
+int hc_gethtbyname(const char *host, int af, struct getnamaddr *info)
+{
+	int ret = NETDB_SUCCESS;
+	struct hcent *ent, *cur;
+	int cmp;
+	size_t addrlen;
+	unsigned int naliases = 0;
+	char *aliases[MAXALIASES];
+	unsigned int naddrs = 0;
+	char *addr_ptrs[MAXADDRS];
+	unsigned int n;
+
+	if (getenv("ANDROID_HOSTS_CACHE_DISABLE") != NULL)
+		return NETDB_INTERNAL;
+
+	switch (af) {
+	case AF_INET:  addrlen = NS_INADDRSZ;  break;
+	case AF_INET6: addrlen = NS_IN6ADDRSZ; break;
+	default:
+		return NETDB_INTERNAL;
+	}
+
+	pthread_mutex_lock(&hclock);
+
+	if (_hcfilemmap() != 0) {
+		ret = NETDB_INTERNAL;
+		goto out;
+	}
+
+	ent = _hcfindname(host);
+	if (!ent) {
+		ret = HOST_NOT_FOUND;
+		goto out;
+	}
+
+	cur = ent;
+	do {
+		char addr[16];
+		char addrstr[MAX_ADDRLEN + 1];	/* +1: hstrcpy appends a NUL */
+		char namestr[MAX_HOSTLEN];
+		const char *name;
+
+		hstrcpy(addrstr, hcfile.h_data + cur->addr);
+		if (inet_pton(af, addrstr, &addr) == 1) {
+			char *aligned;
+			/* First match is considered the official hostname */
+			if (naddrs == 0) {
+				hstrcpy(namestr, hcfile.h_data + cur->name);
+				HENT_SCOPY(info->hp->h_name, namestr, info->buf, info->buflen);
+			}
+			for (name = hcfile.h_data + cur->name; name; name = _hcnextname(name)) {
+				if (!hstrcmp(name, host))
+					continue;
+				hstrcpy(namestr, name);
+				HENT_SCOPY(aliases[naliases], namestr, info->buf, info->buflen);
+				++naliases;
+				if (naliases >= MAXALIASES)
+					goto nospc;
+			}
+			aligned = (char *)ALIGN(info->buf);
+			if (info->buf != aligned) {
+				if ((ptrdiff_t)info->buflen < (aligned - info->buf))
+					goto nospc;
+				info->buflen -= (aligned - info->buf);
+				info->buf = aligned;
+			}
+			HENT_COPY(addr_ptrs[naddrs], addr, addrlen, info->buf, info->buflen);
+			++naddrs;
+			if (naddrs >= MAXADDRS)
+				goto nospc;
+		}
+
+		if (cur + 1 >= hcfile.c_data + hcfile.c_len)
+			break;
+		cmp = cmp_hcent_name(cur, cur + 1);
+		++cur;
+	} while (!cmp);
+
+	if (naddrs == 0) {
+		ret = HOST_NOT_FOUND;
+		goto out;
+	}
+
+	addr_ptrs[naddrs++] = NULL;
+	aliases[naliases++] = NULL;
+
+	/* hp->h_name already populated */
+	HENT_ARRAY(info->hp->h_aliases, naliases, info->buf, info->buflen);
+	for (n = 0; n < naliases; ++n) {
+		info->hp->h_aliases[n] = aliases[n];
+	}
+	info->hp->h_addrtype = af;
+	info->hp->h_length = addrlen;
+	HENT_ARRAY(info->hp->h_addr_list, naddrs, info->buf, info->buflen);
+	for (n = 0; n < naddrs; ++n) {
+		info->hp->h_addr_list[n] = addr_ptrs[n];
+	}
+
+out:
+	pthread_mutex_unlock(&hclock);
+	*info->he = ret;
+	return ret;
+
+nospc:
+	ret = NETDB_INTERNAL;
+	goto out;
+}
diff --git a/libc/dns/net/hosts_cache.h b/libc/dns/net/hosts_cache.h
new file mode 100644
index 0000000..fa5488f
--- /dev/null
+++ b/libc/dns/net/hosts_cache.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (C) 2016 The CyanogenMod Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+struct getnamaddr;
+
+int hc_getaddrinfo(const char *host, const char *service,
+		   const struct addrinfo *hints,
+		   struct addrinfo **result);
+
+int hc_gethtbyname(const char *host, int af, struct getnamaddr *info);
diff --git a/libc/dns/net/sethostent.c b/libc/dns/net/sethostent.c
index 916421e..be29621 100644
--- a/libc/dns/net/sethostent.c
+++ b/libc/dns/net/sethostent.c
@@ -55,6 +55,8 @@
 #include "hostent.h"
 #include "resolv_private.h"
 
+#include "hosts_cache.h"
+
 #define ALIGNBYTES (sizeof(uintptr_t) - 1)
 #define ALIGN(p) (((uintptr_t)(p) + ALIGNBYTES) &~ ALIGNBYTES)
 
@@ -99,6 +101,11 @@
 	/* NOSTRICT skip string len */(void)va_arg(ap, int);
 	af = va_arg(ap, int);
 
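+	/* Consult the hosts cache first; NETDB_INTERNAL means the cache is
+	 * not usable, so fall through to the regular lookup below. */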
+	int rc = hc_gethtbyname(name, af, info);
+	if (rc != NETDB_INTERNAL) {
+		return (rc == NETDB_SUCCESS ? NS_SUCCESS : NS_NOTFOUND);
+	}
+
 #if 0
 	{
 		res_state res = __res_get_state();
diff --git a/libc/include/paths.h b/libc/include/paths.h
index 82c2804..7700cdd 100644
--- a/libc/include/paths.h
+++ b/libc/include/paths.h
@@ -33,6 +33,7 @@
 #define	_PATHS_H_
 
 #define	_PATH_BSHELL	"/system/bin/sh"
+#define	_PATH_BSHELL2	"/sbin/sh"
 #define	_PATH_CONSOLE	"/dev/console"
 #define	_PATH_DEFPATH	"/sbin:/vendor/bin:/system/sbin:/system/bin:/system/xbin"
 #define	_PATH_DEV	"/dev/"
diff --git a/libc/kernel/uapi/linux/android_alarm.h b/libc/kernel/uapi/linux/android_alarm.h
index 801a01e..9f2de28 100644
--- a/libc/kernel/uapi/linux/android_alarm.h
+++ b/libc/kernel/uapi/linux/android_alarm.h
@@ -28,28 +28,31 @@
 /* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
   ANDROID_ALARM_ELAPSED_REALTIME,
   ANDROID_ALARM_SYSTEMTIME,
+  ANDROID_ALARM_RTC_POWEROFF_WAKEUP,
   ANDROID_ALARM_TYPE_COUNT,
-};
 /* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
+};
 enum android_alarm_return_flags {
   ANDROID_ALARM_RTC_WAKEUP_MASK = 1U << ANDROID_ALARM_RTC_WAKEUP,
   ANDROID_ALARM_RTC_MASK = 1U << ANDROID_ALARM_RTC,
-  ANDROID_ALARM_ELAPSED_REALTIME_WAKEUP_MASK = 1U << ANDROID_ALARM_ELAPSED_REALTIME_WAKEUP,
 /* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
+  ANDROID_ALARM_ELAPSED_REALTIME_WAKEUP_MASK = 1U << ANDROID_ALARM_ELAPSED_REALTIME_WAKEUP,
   ANDROID_ALARM_ELAPSED_REALTIME_MASK = 1U << ANDROID_ALARM_ELAPSED_REALTIME,
   ANDROID_ALARM_SYSTEMTIME_MASK = 1U << ANDROID_ALARM_SYSTEMTIME,
+  ANDROID_ALARM_RTC_POWEROFF_WAKEUP_MASK = 1U << ANDROID_ALARM_RTC_POWEROFF_WAKEUP,
+/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
   ANDROID_ALARM_TIME_CHANGE_MASK = 1U << 16
 };
-/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
 #define ANDROID_ALARM_CLEAR(type) _IO('a', 0 | ((type) << 4))
 #define ANDROID_ALARM_WAIT _IO('a', 1)
+/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
 #define ALARM_IOW(c,type,size) _IOW('a', (c) | ((type) << 4), size)
 #define ANDROID_ALARM_SET(type) ALARM_IOW(2, type, struct timespec)
-/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
 #define ANDROID_ALARM_SET_AND_WAIT(type) ALARM_IOW(3, type, struct timespec)
 #define ANDROID_ALARM_GET_TIME(type) ALARM_IOW(4, type, struct timespec)
+/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
 #define ANDROID_ALARM_SET_RTC _IOW('a', 5, struct timespec)
 #define ANDROID_ALARM_BASE_CMD(cmd) (cmd & ~(_IOC(0, 0, 0xf0, 0)))
-/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
 #define ANDROID_ALARM_IOCTL_TO_TYPE(cmd) (_IOC_NR(cmd) >> 4)
 #endif
+/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
diff --git a/libc/kernel/uapi/linux/time.h b/libc/kernel/uapi/linux/time.h
index bf245fc..5690d27 100644
--- a/libc/kernel/uapi/linux/time.h
+++ b/libc/kernel/uapi/linux/time.h
@@ -67,9 +67,10 @@
 #define CLOCK_SGI_CYCLE 10
 /* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
 #define CLOCK_TAI 11
+#define CLOCK_POWEROFF_ALARM 12
 #define MAX_CLOCKS 16
 #define CLOCKS_MASK (CLOCK_REALTIME | CLOCK_MONOTONIC)
-#define CLOCKS_MONO CLOCK_MONOTONIC
 /* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
+#define CLOCKS_MONO CLOCK_MONOTONIC
 #define TIMER_ABSTIME 0x01
 #endif
diff --git a/libc/upstream-netbsd/lib/libc/gen/popen.c b/libc/upstream-netbsd/lib/libc/gen/popen.c
index 593e346..b6ce47c 100644
--- a/libc/upstream-netbsd/lib/libc/gen/popen.c
+++ b/libc/upstream-netbsd/lib/libc/gen/popen.c
@@ -152,6 +152,8 @@
 		}
 
 		execl(_PATH_BSHELL, "sh", "-c", command, NULL);
+		if (errno == ENOENT)
+			execl(_PATH_BSHELL2, "sh", "-c", command, NULL);
 		_exit(127);
 		/* NOTREACHED */
 	}
diff --git a/libc/upstream-openbsd/lib/libc/gen/fnmatch.c b/libc/upstream-openbsd/lib/libc/gen/fnmatch.c
index 0d0f18f..ac6280e33 100644
--- a/libc/upstream-openbsd/lib/libc/gen/fnmatch.c
+++ b/libc/upstream-openbsd/lib/libc/gen/fnmatch.c
@@ -89,12 +89,72 @@
 #include <string.h>
 #include <ctype.h>
 
+#include <stdint.h>
+
 #include "charclass.h"
 
 #define	RANGE_MATCH	1
 #define	RANGE_NOMATCH	0
 #define	RANGE_ERROR	(-1)
 
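+/*
+ * Byte length of the UTF-8 sequence starting at s.  Returns 1 for plain
+ * ASCII bytes and for malformed sequences, so callers always advance.
+ */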
+static unsigned int
+utf8_len(const char *s)
+{
+    const unsigned char *b = (const unsigned char *)s;
+    unsigned int len = 1;
+    unsigned char c;
+
+    if ((b[0] & 0xc0) != 0xc0) {
+        return 1;
+    }
+    c = b[0] << 1;
+    while (len < 6 && (c & 0x80)) {
+        if ((b[len] & 0xc0) != 0x80) {
+            return 1;
+        }
+        c <<= 1;
+        ++len;
+    }
+
+    return len;
+}
+
+static void
+utf8_inc(const char **s)
+{
+    *s += utf8_len(*s);
+}
+
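+/*
+ * Decode the UTF-8 sequence at *s into a code point and advance *s past
+ * it; e.g. "\xc3\xa9" decodes to U+00E9.
+ */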
+static uint32_t
+utf8_get_inc(const char **s)
+{
+    unsigned int len = utf8_len(*s);
+    const unsigned char *b = (const unsigned char *)(*s);
+    unsigned int n;
+    uint32_t ch;
+
+    *s += len;
+
+    if (len == 1) {
+        return b[0];
+    }
+
+    ch = b[0] & (0xff >> len);
+    for (n = 1; n < len; ++n) {
+        ch <<= 6;
+        ch |= (b[n] & 0x3f);
+    }
+
+    return ch;
+}
+
+static uint32_t
+utf8_get(const char *s)
+{
+    const char *tmp = s;
+    return utf8_get_inc(&tmp);
+}
+
 static int
 classmatch(const char *pattern, char test, int foldcase, const char **ep)
 {
@@ -149,7 +209,7 @@
     const int escape = !(flags & FNM_NOESCAPE);
     const int slash = !!(flags & FNM_PATHNAME);
     int result = FNM_NOMATCH;
-    const char *startch;
+    uint32_t startch, endch, compch;
     int negate;
 
     if (**pattern == '[')
@@ -170,7 +230,7 @@
             if (**pattern == ']') {
                 ++*pattern;
                 /* XXX: Fix for MBCS character width */
-                ++*string;
+                utf8_inc(string);
                 return (result ^ negate);
             }
 
@@ -200,10 +260,13 @@
              * "x-]" is not allowed unless escaped ("x-\]")
              * XXX: Fix for locale/MBCS character width
              */
-            if (((*pattern)[1] == '-') && ((*pattern)[2] != ']'))
+            startch = utf8_get_inc(pattern);
+            compch = utf8_get(*string);
+            if (((*pattern)[0] == '-') && ((*pattern)[1] != ']'))
             {
-                startch = *pattern;
-                *pattern += (escape && ((*pattern)[2] == '\\')) ? 3 : 2;
+                *pattern += 1;
+                if (escape && **pattern == '\\')
+                    *pattern += 1;
 
                 /* NOT a properly balanced [expr] pattern, EOS terminated 
                  * or ranges containing a slash in FNM_PATHNAME mode pattern
@@ -212,32 +275,35 @@
                 if (!**pattern || (slash && (**pattern == '/')))
                     break;
 
+                endch = utf8_get_inc(pattern);
+
+                /* Refuse to attempt collation for non-ASCII chars */
+                if (startch >= 0x80 || endch >= 0x80)
+                    continue;
+
                 /* XXX: handle locale/MBCS comparison, advance by MBCS char width */
-                if ((**string >= *startch) && (**string <= **pattern))
+                if ((compch >= startch) && (compch <= endch))
                     result = 0;
-                else if (nocase && (isupper((unsigned char)**string) ||
-			    isupper((unsigned char)*startch) ||
-                            isupper((unsigned char)**pattern))
-                            && (tolower((unsigned char)**string) >=
-			        tolower((unsigned char)*startch)) 
-                            && (tolower((unsigned char)**string) <=
-				tolower((unsigned char)**pattern)))
+                /* ctype calls are only defined for ASCII */
+                else if (nocase && compch < 0x80 &&
+                            (isupper(compch) ||
+                            isupper(startch) ||
+                            isupper(endch))
+                            && (tolower(compch) >= tolower(startch))
+                            && (tolower(compch) <= tolower(endch)))
                     result = 0;
 
-                ++*pattern;
                 continue;
             }
 
             /* XXX: handle locale/MBCS comparison, advance by MBCS char width */
-            if ((**string == **pattern))
+            if (compch == startch)
                 result = 0;
-            else if (nocase && (isupper((unsigned char)**string) ||
-			    isupper((unsigned char)**pattern))
-                            && (tolower((unsigned char)**string) ==
-				tolower((unsigned char)**pattern)))
+            /* ctype calls are only defined for ASCII */
+            else if (nocase && compch < 0x80 && startch < 0x80 &&
+                            (isupper(compch) || isupper(startch))
+                            && (tolower(compch) == tolower(startch)))
                 result = 0;
-
-            ++*pattern;
         }
 
         /* NOT a properly balanced [expr] pattern; Rewind
@@ -258,7 +324,7 @@
     }
 
     /* XXX: handle locale/MBCS comparison, advance by the MBCS char width */
-    if (**string == **pattern)
+    if (utf8_get(*string) == utf8_get(*pattern))
         result = 0;
     else if (nocase && (isupper((unsigned char)**string) ||
 		    isupper((unsigned char)**pattern))
@@ -272,8 +338,8 @@
         return result;
 
 fnmatch_ch_success:
-    ++*pattern;
-    ++*string;
+    utf8_inc(pattern);
+    utf8_inc(string);
     return result;
 }
 
diff --git a/linker/Android.mk b/linker/Android.mk
index 4a4ca5c..65dd7dd 100644
--- a/linker/Android.mk
+++ b/linker/Android.mk
@@ -47,16 +47,32 @@
 LOCAL_CONLYFLAGS += \
     -std=gnu99 \
 
+ifneq ($(TARGET_NEEDS_PRELINK_SUPPORT),true)
 LOCAL_CPPFLAGS += \
-    -Wold-style-cast \
+    -Wold-style-cast
+else
+  LOCAL_CFLAGS += -DENABLE_PRELINK_SUPPORT
+endif
 
 ifeq ($(TARGET_IS_64_BIT),true)
 LOCAL_CPPFLAGS += -DTARGET_IS_64_BIT
 endif
 
+ifeq ($(TARGET_NEEDS_PLATFORM_TEXT_RELOCATIONS),true)
+ifeq ($(user_variant),user)
+$(error Do not enable text relocations on user builds)
+else
+LOCAL_CPPFLAGS += -DTARGET_NEEDS_PLATFORM_TEXT_RELOCATIONS
+endif
+endif
+
 # We need to access Bionic private headers in the linker.
 LOCAL_CFLAGS += -I$(LOCAL_PATH)/../libc/
 
+ifeq ($(TARGET_NEEDS_NON_PIE_SUPPORT),true)
+  LOCAL_CFLAGS += -DENABLE_NON_PIE_SUPPORT
+endif
+
 # we don't want crtbegin.o (because we have begin.o), so unset it
 # just for this module
 LOCAL_NO_CRT := true
diff --git a/linker/linker.cpp b/linker/linker.cpp
index 9dc928e..583dfd1 100644
--- a/linker/linker.cpp
+++ b/linker/linker.cpp
@@ -1230,6 +1230,67 @@
 typedef linked_list_t<const char> StringLinkedList;
 typedef std::vector<LoadTask*> LoadTaskList;
 
+static soinfo* find_library(android_namespace_t* ns,
+                           const char* name, int rtld_flags,
+                           const android_dlextinfo* extinfo,
+                           soinfo* needed_by);
+
+// g_ld_all_shim_libs owns the shim descriptor strings; the soinfo
+// structures and the g_active_shim_libs list reference this memory.
+
+typedef std::pair<std::string, std::string> ShimDescriptor;
+static std::vector<ShimDescriptor> g_ld_all_shim_libs;
+
+// g_active_shim_libs are all shim libs that are still eligible
+// to be loaded.  We must remove a shim lib from the list before
+// we load the library to avoid recursive loops (load shim libA
+// for libB where libA also links against libB).
+
+static linked_list_t<const ShimDescriptor> g_active_shim_libs;
+
+static void reset_g_active_shim_libs(void) {
+  g_active_shim_libs.clear();
+  for (const auto& pair : g_ld_all_shim_libs) {
+    g_active_shim_libs.push_back(&pair);
+  }
+}
+
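+// LD_SHIM_LIBS holds "<path>|<shim>" pairs separated by spaces or colons,
+// e.g. (hypothetical) LD_SHIM_LIBS="/system/lib/libfoo.so|libfoo_shim.so".
+// Whenever <path> is loaded, <shim> is injected as an additional DT_NEEDED
+// dependency (see for_each_matching_shim below).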
+static void parse_LD_SHIM_LIBS(const char* path) {
+  g_ld_all_shim_libs.clear();
+  if (path != nullptr) {
+    // We have historically supported ':' as well as ' ' in LD_SHIM_LIBS.
+    for (const auto& pair : android::base::Split(path, " :")) {
+      size_t pos = pair.find('|');
+      if (pos > 0 && pos < pair.length() - 1) {
+        auto desc = std::pair<std::string, std::string>(pair.substr(0, pos), pair.substr(pos + 1));
+        g_ld_all_shim_libs.push_back(desc);
+      }
+    }
+  }
+  reset_g_active_shim_libs();
+}
+
+template<typename F>
+static void for_each_matching_shim(const char *const path, F action) {
+  if (path == nullptr) return;
+  INFO("Finding shim libs for \"%s\"\n", path);
+  std::vector<const ShimDescriptor *> matched;
+
+  g_active_shim_libs.for_each([&](const ShimDescriptor *a_pair) {
+    if (a_pair->first == path) {
+      matched.push_back(a_pair);
+    }
+  });
+
+  g_active_shim_libs.remove_if([&](const ShimDescriptor *a_pair) {
+    return a_pair->first == path;
+  });
+
+  for (const auto& one_pair : matched) {
+    INFO("Injecting shim lib \"%s\" as needed for %s", one_pair->second.c_str(), path);
+    action(one_pair->second.c_str());
+  }
+}
 
 // This function walks down the tree of soinfo dependencies
 // in breadth-first order and
@@ -1668,6 +1729,7 @@
 
 template<typename F>
 static void for_each_dt_needed(const soinfo* si, F action) {
+  for_each_matching_shim(si->get_realpath(), action);
   for (const ElfW(Dyn)* d = si->dynamic; d->d_tag != DT_NULL; ++d) {
     if (d->d_tag == DT_NEEDED) {
       action(fix_dt_needed(si->get_string(d->d_un.d_val), si->get_realpath()));
@@ -1677,6 +1739,7 @@
 
 template<typename F>
 static void for_each_dt_needed(const ElfReader& elf_reader, F action) {
+  for_each_matching_shim(elf_reader.name(), action);
   for (const ElfW(Dyn)* d = elf_reader.dynamic(); d->d_tag != DT_NULL; ++d) {
     if (d->d_tag == DT_NEEDED) {
       action(fix_dt_needed(elf_reader.get_string(d->d_un.d_val), elf_reader.name()));
@@ -2397,6 +2460,7 @@
   }
 
   ProtectedDataGuard guard;
+  reset_g_active_shim_libs();
   soinfo* si = find_library(ns, translated_name, flags, extinfo, caller);
   if (si != nullptr) {
     si->call_constructors();
@@ -2763,10 +2827,10 @@
 
     const ElfW(Sym)* s = nullptr;
     soinfo* lsi = nullptr;
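+    // vi is declared at this scope (rather than inside the block below) so
+    // that the R_ARM_COPY handler can reuse the looked-up version info.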
+    const version_info* vi = nullptr;
 
     if (sym != 0) {
       sym_name = get_string(symtab_[sym].st_name);
-      const version_info* vi = nullptr;
 
       if (!lookup_version_info(version_tracker, sym, sym_name, &vi)) {
         return false;
@@ -3068,6 +3132,7 @@
         *reinterpret_cast<ElfW(Addr)*>(reloc) += sym_addr - rel->r_offset;
         break;
       case R_ARM_COPY:
+#ifndef ENABLE_NON_PIE_SUPPORT
         /*
          * ET_EXEC is not supported so this should not happen.
          *
@@ -3079,6 +3144,50 @@
          */
         DL_ERR("%s R_ARM_COPY relocations are not supported", get_realpath());
         return false;
+#else
+        if ((flags_ & FLAG_EXE) == 0) {
+            /*
+             * http://infocenter.arm.com/help/topic/com.arm.doc.ihi0044d/IHI0044D_aaelf.pdf
+             *
+             * Section 4.6.1.10 "Dynamic relocations"
+             * R_ARM_COPY may only appear in executable objects where e_type is
+             * set to ET_EXEC.
+             *
+             * TODO: FLAG_EXE is set for both ET_DYN and ET_EXEC executables.
+             * We should explicitly disallow ET_DYN executables from having
+             * R_ARM_COPY relocations.
+             */
+            DL_ERR("%s R_ARM_COPY relocations only supported for ET_EXEC", get_realpath());
+            return false;
+        }
+        count_relocation(kRelocCopy);
+        MARK(rel->r_offset);
+        TRACE_TYPE(RELO, "RELO %08x <- %d @ %08x %s", reloc, s->st_size, sym_addr, sym_name);
+        if (reloc == sym_addr) {
+            const ElfW(Sym)* src = nullptr;
+
+            if (!soinfo_do_lookup(NULL, sym_name, vi, &lsi, global_group, local_group, &src)) {
+                DL_ERR("%s R_ARM_COPY relocation source cannot be resolved", get_realpath());
+                return false;
+            }
+            if (lsi->has_DT_SYMBOLIC) {
+                DL_ERR("%s invalid R_ARM_COPY relocation against DT_SYMBOLIC shared "
+                       "library %s (built with -Bsymbolic?)", get_realpath(), lsi->soname_);
+                return false;
+            }
+            if (s->st_size < src->st_size) {
+                DL_ERR("%s R_ARM_COPY relocation size mismatch (%d < %d)",
+                       get_realpath(), s->st_size, src->st_size);
+                return false;
+            }
+            memcpy(reinterpret_cast<void*>(reloc),
+                   reinterpret_cast<void*>(src->st_value + lsi->load_bias), src->st_size);
+        } else {
+            DL_ERR("%s R_ARM_COPY relocation target cannot be resolved", get_realpath());
+            return false;
+        }
+        break;
+#endif
 #elif defined(__i386__)
       case R_386_32:
         count_relocation(kRelocRelative);
@@ -3950,11 +4059,18 @@
 #if !defined(__LP64__)
   if (has_text_relocations) {
     // Fail if app is targeting sdk version > 22
+#if !defined(__i386__) // ffmpeg requires text relocations on x86
+#if defined(TARGET_NEEDS_PLATFORM_TEXT_RELOCATIONS)
+    if (get_application_target_sdk_version() != __ANDROID_API__
+        && get_application_target_sdk_version() > 22) {
+#else
     if (get_application_target_sdk_version() > 22) {
+#endif
       PRINT("%s: has text relocations", get_realpath());
       DL_ERR("%s: has text relocations", get_realpath());
       return false;
     }
+#endif
     // Make segments writable to allow text relocations to work properly. We will later call
     // phdr_table_protect_segments() after all of them are applied.
     DL_WARN("%s has text relocations. This is wasting memory and prevents "
@@ -4217,6 +4333,7 @@
   // doesn't cost us anything.
   const char* ldpath_env = nullptr;
   const char* ldpreload_env = nullptr;
+  const char* ldshim_libs_env = nullptr;
   if (!getauxval(AT_SECURE)) {
     ldpath_env = getenv("LD_LIBRARY_PATH");
     if (ldpath_env != nullptr) {
@@ -4226,6 +4343,7 @@
     if (ldpreload_env != nullptr) {
       INFO("[ LD_PRELOAD set to \"%s\" ]", ldpreload_env);
     }
+    ldshim_libs_env = getenv("LD_SHIM_LIBS");
   }
 
   struct stat file_stat;
@@ -4275,15 +4393,18 @@
   }
   si->dynamic = nullptr;
 
+#ifndef ENABLE_NON_PIE_SUPPORT
   ElfW(Ehdr)* elf_hdr = reinterpret_cast<ElfW(Ehdr)*>(si->base);
   if (elf_hdr->e_type != ET_DYN) {
     __libc_fatal("\"%s\": error: only position independent executables (PIE) are supported.",
                  args.argv[0]);
   }
+#endif
 
   // Use LD_LIBRARY_PATH and LD_PRELOAD (but only if we aren't setuid/setgid).
   parse_LD_LIBRARY_PATH(ldpath_env);
   parse_LD_PRELOAD(ldpreload_env);
+  parse_LD_SHIM_LIBS(ldshim_libs_env);
 
   somain = si;
 
diff --git a/linker/linker_phdr.cpp b/linker/linker_phdr.cpp
index 9ed612f..5e20788 100644
--- a/linker/linker_phdr.cpp
+++ b/linker/linker_phdr.cpp
@@ -139,7 +139,11 @@
 ElfReader::ElfReader()
     : did_read_(false), did_load_(false), fd_(-1), file_offset_(0), file_size_(0), phdr_num_(0),
       phdr_table_(nullptr), shdr_table_(nullptr), shdr_num_(0), dynamic_(nullptr), strtab_(nullptr),
+#ifdef ENABLE_PRELINK_SUPPORT
+      strtab_size_(0), load_start_(nullptr), load_size_(0), load_bias_(0), required_base_(0), loaded_phdr_(nullptr),
+#else
       strtab_size_(0), load_start_(nullptr), load_size_(0), load_bias_(0), loaded_phdr_(nullptr),
+#endif
       mapped_by_caller_(false) {
 }
 
@@ -423,6 +427,38 @@
   return max_vaddr - min_vaddr;
 }
 
+#ifdef ENABLE_PRELINK_SUPPORT
+typedef struct {
+    long mmap_addr;
+    char tag[4]; /* 'P', 'R', 'E', ' ' */
+} prelink_info_t;
+
+/* A prelinked library ends with a prelink_info_t footer recording the
+ * address it must be mapped at.  Returns that base address if the
+ * library is prelinked, and 0 otherwise.  */
+static ElfW(Addr) is_prelinked(int fd, const char *name)
+{
+    off_t sz = lseek(fd, -(off_t)sizeof(prelink_info_t), SEEK_END);
+    if (sz < 0) {
+        DL_ERR("lseek() failed!");
+        return 0;
+    }
+
+    prelink_info_t info;
+    int rc = TEMP_FAILURE_RETRY(read(fd, &info, sizeof(info)));
+    if (rc != sizeof(info)) {
+        DL_ERR("Could not read prelink_info_t structure for `%s`\n", name);
+        return 0;
+    }
+
+    if (memcmp(info.tag, "PRE ", 4)) {
+        DL_ERR("`%s` is not a prelinked library\n", name);
+        return 0;
+    }
+
+    return (unsigned long)info.mmap_addr;
+}
+#endif
+
 // Reserve a virtual address range big enough to hold all loadable
 // segments of a program header table. This is done by creating a
 // private anonymous mmap() with PROT_NONE.
@@ -465,6 +501,13 @@
       return false;
     }
     int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS;
+#ifdef ENABLE_PRELINK_SUPPORT
+    required_base_ = is_prelinked(fd_, name_.c_str());
+    if (required_base_ != 0) {
+      mmap_flags |= MAP_FIXED;
+      mmap_hint = (uint8_t*) required_base_;
+    }
+#endif
     start = mmap(mmap_hint, load_size_, PROT_NONE, mmap_flags, -1, 0);
     if (start == MAP_FAILED) {
       DL_ERR("couldn't reserve %zd bytes of address space for \"%s\"", load_size_, name_.c_str());
diff --git a/linker/linker_phdr.h b/linker/linker_phdr.h
index d6276ed..94f669b 100644
--- a/linker/linker_phdr.h
+++ b/linker/linker_phdr.h
@@ -50,6 +50,9 @@
   ElfW(Addr) load_start() const { return reinterpret_cast<ElfW(Addr)>(load_start_); }
   size_t load_size() const { return load_size_; }
   ElfW(Addr) load_bias() const { return load_bias_; }
+#ifdef ENABLE_PRELINK_SUPPORT
+  ElfW(Addr) required_base() { return required_base_; }
+#endif
   const ElfW(Phdr)* loaded_phdr() const { return loaded_phdr_; }
   const ElfW(Dyn)* dynamic() const { return dynamic_; }
   const char* get_string(ElfW(Word) index) const;
@@ -97,6 +100,11 @@
   size_t load_size_;
   // Load bias.
   ElfW(Addr) load_bias_;
+#ifdef ENABLE_PRELINK_SUPPORT
+  // For prelinked libraries, mandatory load address of the first
+  // loadable segment. 0 otherwise.
+  ElfW(Addr) required_base_;
+#endif
 
   // Loaded phdr.
   const ElfW(Phdr)* loaded_phdr_;