bionic: squashed optimizations/fixes from Jim Huang

*Commit 1 of 9*
Use GCC's __attribute__((const)) to reduce code size

__attribute__((const)) tells the compiler that a function's return
value depends only on its arguments, so the compiler may optimize
away repeated calls to that function.

By adding __attribute__((const)), the compiler can choose to call the
function just once and reuse the cached return value.  This also
yields a code size reduction.
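
A minimal sketch of the effect (hypothetical function names; the real
call sites in bionic are __errno(), __res_state() and pthread_self()):

    /* thread_key() is declared const: its result depends only on its
     * (empty) argument list, so GCC may evaluate it just once per caller. */
    extern int thread_key(void) __attribute__((const));  /* hypothetical */

    int use_twice(void)
    {
        /* Without the attribute this emits two calls; with it, GCC can
         * keep the first result in a register and drop the second call,
         * which is where the text-size saving comes from. */
        return thread_key() + thread_key();
    }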

Here are the reference results from arm-eabi-size for the crespo device:

[before]
   text    data     bss     dec     hex filename
 267715   10132   45948  323795   4f0d3

[after]
   text    data     bss     dec     hex filename
 267387   10132   45948  323467   4ef8b

Change-Id: I1d80465c0f88158449702d4dc6398a130eb77195

*Commit 2 of 9*
res_send: Avoid spurious close()s and (rare) failure

When looping over the current list of sockets we are connected to,
use getpeername(), not getsockname(), to find out who the remote
end is.  This avoids spurious close() calls and a rare failure.

This is ISC bug #18625, fixed in libbind 6.0.
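
A minimal sketch of the idea (the helper name is hypothetical, not the
actual res_send code): decide whether an already-connected socket still
points at the nameserver we want by asking for the remote address.

    #include <string.h>
    #include <sys/socket.h>

    static int connected_to(int sock, const struct sockaddr *ns, socklen_t nslen)
    {
        struct sockaddr_storage peer;
        socklen_t peerlen = sizeof(peer);

        /* getsockname() returns our own local address, which never matches
         * the nameserver, so the old code closed and reconnected the socket
         * on every query. */
        if (getpeername(sock, (struct sockaddr *)(void *)&peer, &peerlen) < 0)
            return 0;
        return peerlen == nslen && memcmp(&peer, ns, nslen) == 0;
    }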

Change-Id: I5e85f9ff4b98c237978e4bf4bd85ba0a90d768e6

*Commit 3 of 9*
sha1: code cleanup and use modern C syntax

Apply the following changes:
- Remove out-of-date workaround (SPARC64_GCC_WORKAROUND)
- Use C99 prototypes and stdint types (see the sketch below)
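
The shape of the second change, shown on SHA1Update (both bodies elided;
only the definition style and the parameter types differ):

    /* old K&R-style definition */
    void SHA1Update(context, data, len)
        SHA1_CTX *context;
        const u_char *data;
        u_int len;
    {
        /* ... */
    }

    /* new C99 definition using <stdint.h> types */
    void SHA1Update(SHA1_CTX *context, const uint8_t *data, unsigned int len)
    {
        /* ... */
    }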

Change-Id: I630cf97f6824f72f4165e0fa9e5bfdad8edabe48

*Commit 4 of 9*
sha1: Use bswap* to optimize byte order

bionic libc already uses the ARMv6+ rev/rev16 instructions for endian
conversion.  This patch rewrites parts of the SHA1 implementation in
terms of the swap32 and swap64 routines, which is known to bring
performance improvements.
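
For illustration, the portable equivalent of the byte swap that the old
blk0() macro open-coded with two rotate-and-mask steps; swap32() computes
the same result and compiles down to a single "rev" instruction on ARMv6+:

    #include <stdint.h>

    static uint32_t byteswap32_generic(uint32_t x)   /* illustrative only */
    {
        return ((x << 24) & 0xff000000u) | ((x <<  8) & 0x00ff0000u) |
               ((x >>  8) & 0x0000ff00u) | ((x >> 24) & 0x000000ffu);
    }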

Reference sha1bench results on Nexus S:

[before]
Rounds: 100000, size: 6250K, time: 1.183s, speed: 5.16  MB/s

[after]
Rounds: 100000, size: 6250K, time: 1.025s, speed: 5.957 MB/s

Change-Id: Id04c0fa1467b3006b5a8736cbdd95855ed7c13e4

*Commit 5 of 9*
linker: optimize SysV ELF hash function

This change removes one operation from the inner loop by lifting the
final masking step out of it.  Inspired by glibc.
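
A sketch of the transformation, assuming the standard SysV ELF hash.  The
reference loop clears the top nibble on every iteration; since any stale
bits left there are shifted out of the word by the next "h << 4" anyway,
a single mask after the loop gives the same result:

    static unsigned elfhash(const unsigned char *name)
    {
        unsigned h = 0, g;

        while (*name) {
            h = (h << 4) + *name++;
            g = h & 0xf0000000;
            h ^= g >> 24;        /* was: if (g) h ^= g >> 24; h &= ~g; */
        }
        return h & 0x0fffffff;   /* mask lifted out of the inner loop */
    }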

Change-Id: I3f641c086654809574289fa6eba0ee1d32e79aa3

*Commit 6 of 9*
Add ARMv7 optimized strlen()

Merge the ARM-optimized strlen() routine from Linaro.  Although it is
tuned for ARM Cortex-A9, it is still noticeably faster than the
original on Cortex-A8 machines.

Reference benchmark on Nexus S (ARM Cortex-A8; 1 GHz):

[before]
             prc thr   usecs/call      samples   errors cnt/samp     size
strlen_1k      1   1      1.31712           97        0     1000     1024

[after]
             prc thr   usecs/call      samples   errors cnt/samp     size
strlen_1k      1   1      1.05855           96        0     1000     1024

Change-Id: I809928804726620f399510af1cd1c852ed754403

*Commit 7 of 9*
fix ARMv7 optimized strlen() usage condition (author: nadlabak)

Change-Id: Ia2ab059b092f80c02d95ca95d3062954c0ad1023

*Commit 8 of 9*
memmove: Fix the abuse of memcpy() for overlapping regions

memcpy is not defined for overlapping regions.
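
A sketch of the corrected dispatch (the wrapper name is made up; the real
change is inside bionic's memmove).  memcpy() is now taken only when the
two regions are fully disjoint; any overlap falls through to bcopy():

    #include <string.h>
    #include <strings.h>

    void *memmove_sketch(void *dst, const void *src, size_t n)
    {
        const char *p = src;
        char *q = dst;

        if (__builtin_expect((q + n < p) || (q >= p + n), 1))
            return memcpy(dst, src, n);   /* regions do not overlap */
        bcopy(src, dst, n);               /* bcopy handles overlap  */
        return dst;
    }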

Original author: Chris Dearman <chris@mips.com>

Change-Id: Icc2acc860c932eaf1df488630146f4e07388a444

*Commit 9 of 9*
memcmp: prefetch optimizing for ARM Cortex-A8/A9

The original memcmp() was tweaked for ARM9, which is not optimal for ARM
Cortex-A cores.  This patch merges the prefetch optimizations from
ST-Ericsson and removes NEON slowdowns.
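
A conceptual C sketch of the prefetch pattern (the real change is in ARM
assembly using pld; the distance of two cache lines matches the patch):

    #include <stddef.h>

    #define CACHE_LINE_SIZE 64   /* 32 on some Cortex-A8 configurations */

    static int memcmp_sketch(const unsigned char *a, const unsigned char *b,
                             size_t n)
    {
        size_t i;

        for (i = 0; i < n; i++) {
            if ((i & (CACHE_LINE_SIZE - 1)) == 0) {
                /* keep both streams ~2 cache lines ahead of the compare;
                 * pld never faults, so running past the end is harmless */
                __builtin_prefetch(a + i + 2 * CACHE_LINE_SIZE);
                __builtin_prefetch(b + i + 2 * CACHE_LINE_SIZE);
            }
            if (a[i] != b[i])
                return a[i] - b[i];
        }
        return 0;
    }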

Reference experiment results on Nexus S (ARM Cortex-A8; 1 GHz) using
the strbench program:
    http://pasky.or.cz//dev/glibc/strbench/

[before]
size, samples, TIMES[s] (user, system, total)
   4   262144         2.510000 0.000000 2.510000
   8   131072         1.570000 0.010000 1.590000
  32    32768         1.310000 0.000000 1.320000

[after]
size, samples, TIMES[s] (user, system, total)
   4   262144         2.280000 0.000000 2.290000
   8   131072         1.210000 0.000000 1.220000
  32    32768         1.040000 0.000000 1.050000

Change-Id: I961847da96d2025f7049773cd2ddaa08579e78d6
diff --git a/libc/Android.mk b/libc/Android.mk
index 59b2f0c..82ef6ec 100644
--- a/libc/Android.mk
+++ b/libc/Android.mk
@@ -359,7 +359,6 @@
 	arch-arm/bionic/memset.S \
 	arch-arm/bionic/setjmp.S \
 	arch-arm/bionic/sigsetjmp.S \
-	arch-arm/bionic/strlen.c.arm \
 	arch-arm/bionic/strcpy.S \
 	arch-arm/bionic/strcmp.S \
 	arch-arm/bionic/syscall.S \
@@ -367,6 +366,11 @@
 	string/bcopy.c \
 	string/strncmp.c \
 	unistd/socketcalls.c
+ifeq ($(ARCH_ARM_HAVE_ARMV7A),true)
+libc_common_src_files += arch-arm/bionic/strlen-armv7.S
+else
+libc_common_src_files += arch-arm/bionic/strlen.c.arm
+endif
 
 # These files need to be arm so that gdbserver
 # can set breakpoints in them without messing
diff --git a/libc/arch-arm/bionic/memcmp.S b/libc/arch-arm/bionic/memcmp.S
index c872a51..781c4f8 100644
--- a/libc/arch-arm/bionic/memcmp.S
+++ b/libc/arch-arm/bionic/memcmp.S
@@ -1,5 +1,6 @@
 /*
- * Copyright (C) 2008 The Android Open Source Project
+ * Copyright (C) 2008, 2011 The Android Open Source Project
+ * Copyright (C) 2010 ST-Ericsson SA
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -30,43 +31,71 @@
 #include <machine/asm.h>
 
 /*
- * Optimized memcmp() for ARM9.
- * This would not be optimal on XScale or ARM11, where more prefetching
- * and use of PLD will be needed.
- * The 2 major optimzations here are
- * (1) The main loop compares 16 bytes at a time
- * (2) The loads are scheduled in a way they won't stall
+ * Optimized memcmp() for ARM9 and Cortex-A9
  */
 
+#if __ARM_ARCH__ >= 7
+#define __ARM_CORTEX
+
+#if defined(CORTEX_CACHE_LINE_32)
+#define CACHE_LINE_SIZE     32
+#else
+#define CACHE_LINE_SIZE     64
+#endif
+
+#endif /* __ARM_ARCH__ */
+
 ENTRY(memcmp)
+#if defined(__ARM_CORTEX)
+        pld         [r0, #(CACHE_LINE_SIZE * 0)]
+        pld         [r0, #(CACHE_LINE_SIZE * 1)]
+#else
         PLD         (r0, #0)
         PLD         (r1, #0)
+#endif
 
         /* take of the case where length is 0 or the buffers are the same */
         cmp         r0, r1
+#if !defined(__ARM_CORTEX)
         cmpne       r2, #0
+#endif
         moveq       r0, #0
         bxeq        lr
 
+#if defined(__ARM_CORTEX)
+        pld         [r1, #(CACHE_LINE_SIZE * 0)]
+        pld         [r1, #(CACHE_LINE_SIZE * 1)]
+
+        /* make sure we have at least 8+4 bytes, this simplify things below
+         * and avoid some overhead for small blocks
+         */
+        cmp        r2, #(8+4)
+        bmi        10f
+#endif /* __ARM_CORTEX */
+
         .save {r4, lr}
         /* save registers */
         stmfd       sp!, {r4, lr}
-        
+
+#if !defined(__ARM_CORTEX)
         PLD         (r0, #32)
         PLD         (r1, #32)
+#endif
 
         /* since r0 hold the result, move the first source
          * pointer somewhere else
          */
          
          mov        r4, r0
-         
+
+#if !defined(__ARM_CORTEX)
          /* make sure we have at least 8+4 bytes, this simplify things below
           * and avoid some overhead for small blocks
           */
          cmp        r2, #(8+4)
          bmi        8f
-        
+#endif
+
         /* align first pointer to word boundary
          * offset = -src & 3
          */
@@ -103,8 +132,14 @@
         subs        r2, r2, #(32 + 4)
         bmi         1f
         
-0:      PLD         (r4, #64)
+0:
+#if defined(__ARM_CORTEX)
+        pld         [r4, #(CACHE_LINE_SIZE * 2)]
+        pld         [r1, #(CACHE_LINE_SIZE * 2)]
+#else
+        PLD         (r4, #64)
         PLD         (r1, #64)
+#endif
         ldr         r0, [r4], #4
         ldr         lr, [r1, #4]!
         eors        r0, r0, ip
@@ -170,6 +205,22 @@
 9:      /* restore registers and return */
         ldmfd       sp!, {r4, lr}
         bx          lr
+
+#if defined(__ARM_CORTEX)
+10:     /* process less than 12 bytes */
+        cmp         r2, #0
+        moveq       r0, #0
+        bxeq        lr
+        mov         r3, r0
+11:
+        ldrb        r0, [r3], #1
+        ldrb        ip, [r1], #1
+        subs        r0, ip
+        bxne        lr
+        subs        r2, r2, #1
+        bne         11b
+        bx          lr
+#endif /* __ARM_CORTEX */
 END(memcmp)
 
 
@@ -192,8 +243,14 @@
         bic         r1, r1, #3
         ldr         lr, [r1], #4
 
-6:      PLD         (r1, #64)
+6:
+#if defined(__ARM_CORTEX)
+        pld         [r1, #(CACHE_LINE_SIZE * 2)]
+        pld         [r4, #(CACHE_LINE_SIZE * 2)]
+#else
+        PLD         (r1, #64)
         PLD         (r4, #64)
+#endif
         mov         ip, lr, lsr #16
         ldr         lr, [r1], #4
         ldr         r0, [r4], #4
diff --git a/libc/arch-arm/bionic/strlen-armv7.S b/libc/arch-arm/bionic/strlen-armv7.S
new file mode 100644
index 0000000..125e92f
--- /dev/null
+++ b/libc/arch-arm/bionic/strlen-armv7.S
@@ -0,0 +1,111 @@
+/* Copyright (c) 2010-2011, Linaro Limited
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+      * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+      * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+      * Neither the name of Linaro Limited nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   Written by Dave Gilbert <david.gilbert@linaro.org>
+
+   This strlen routine is optimised on a Cortex-A9 and should work on
+   all ARMv7 processors.   This routine is reasonably fast for short
+   strings, but is probably slower than a simple implementation if all
+   your strings are very short */
+
+@ 2011-02-08 david.gilbert@linaro.org
+@    Extracted from local git 6848613a
+
+
+@ this lets us check a flag in a 00/ff byte easily in either endianness
+#ifdef __ARMEB__
+#define CHARTSTMASK(c) 1<<(31-(c*8))
+#else
+#define CHARTSTMASK(c) 1<<(c*8)
+#endif
+
+@-----------------------------------------------------------------------------------------------------------------------------
+	.syntax unified
+	.arch armv7-a
+
+	.thumb_func
+	.align 2
+	.p2align 4,,15
+	.global strlen
+	.type strlen,%function
+strlen:
+	@ r0 = string
+	@ returns count of bytes in string not including terminator
+	mov	r1, r0
+	push	{ r4,r6 }
+	mvns	r6, #0		@ all F
+	movs	r4, #0
+	tst	r0, #7
+	beq	2f
+
+1:
+	ldrb	r2, [r1], #1
+	tst	r1, #7		@ Hit alignment yet?
+	cbz	r2, 10f		@ Exit if we found the 0
+	bne	1b
+
+	@ So we're now aligned
+2:
+	ldmia	r1!,{r2,r3}
+	uadd8	r2, r2, r6	@ Parallel add 0xff - sets the GE bits for anything that wasn't 0
+	sel	r2, r4, r6	@ bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION
+	uadd8	r3, r3, r6	@ Parallel add 0xff - sets the GE bits for anything that wasn't 0
+	sel	r3, r2, r6	@ bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION
+	cmp	r3, #0
+	beq	2b
+
+strlenendtmp:
+	@ One (or more) of the bytes we loaded was 0 - but which one?
+	@ r2 has the mask corresponding to the first loaded word
+	@ r3 has a combined mask of the two words - but if r2 was all-non 0 
+	@ then it's just the 2nd words
+	cmp	r2, #0
+	itte	eq
+	moveq	r2, r3		@ the end is in the 2nd word
+	subeq	r1,r1,#3
+	subne	r1,r1,#7
+
+	@ r1 currently points to the 2nd byte of the word containing the 0
+	tst	r2, # CHARTSTMASK(0)	@ 1st character
+	bne	10f
+	adds	r1,r1,#1
+	tst	r2, # CHARTSTMASK(1)	@ 2nd character
+	ittt	eq
+	addeq	r1,r1,#1
+	tsteq	r2, # (3<<15)	@ 2nd & 3rd character
+	@ If not the 3rd must be the last one
+	addeq	r1,r1,#1
+
+10:
+	@ r0 is still at the beginning, r1 is pointing 1 byte after the terminator
+	sub	r0, r1, r0
+	subs	r0, r0, #1
+	pop	{ r4, r6 }
+	bx	lr
diff --git a/libc/bionic/sha1.c b/libc/bionic/sha1.c
index efa95a5..a4fbd67 100644
--- a/libc/bionic/sha1.c
+++ b/libc/bionic/sha1.c
@@ -22,10 +22,7 @@
 #include <assert.h>
 #include <sha1.h>
 #include <string.h>
-
-#if HAVE_NBTOOL_CONFIG_H
-#include "nbtool_config.h"
-#endif
+#include <endian.h>
 
 #if !HAVE_SHA1_H
 
@@ -36,8 +33,7 @@
  * I got the idea of expanding during the round function from SSLeay
  */
 #if BYTE_ORDER == LITTLE_ENDIAN
-# define blk0(i) (block->l[i] = (rol(block->l[i],24)&0xFF00FF00) \
-    |(rol(block->l[i],8)&0x00FF00FF))
+# define blk0(i) swap32(block->l[i])
 #else
 # define blk0(i) block->l[i]
 #endif
@@ -54,77 +50,17 @@
 #define R4(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0xCA62C1D6+rol(v,5);w=rol(w,30);
 
 typedef union {
-    u_char c[64];
-    u_int l[16];
+    uint8_t c[SHA1_BLOCK_SIZE];
+    uint32_t l[SHA1_BLOCK_SIZE/4];
 } CHAR64LONG16;
 
-/* old sparc64 gcc could not compile this */
-#undef SPARC64_GCC_WORKAROUND
-#if defined(__sparc64__) && defined(__GNUC__) && __GNUC__ < 3
-#define SPARC64_GCC_WORKAROUND
-#endif
-
-#ifdef SPARC64_GCC_WORKAROUND
-void do_R01(u_int32_t *a, u_int32_t *b, u_int32_t *c, u_int32_t *d, u_int32_t *e, CHAR64LONG16 *);
-void do_R2(u_int32_t *a, u_int32_t *b, u_int32_t *c, u_int32_t *d, u_int32_t *e, CHAR64LONG16 *);
-void do_R3(u_int32_t *a, u_int32_t *b, u_int32_t *c, u_int32_t *d, u_int32_t *e, CHAR64LONG16 *);
-void do_R4(u_int32_t *a, u_int32_t *b, u_int32_t *c, u_int32_t *d, u_int32_t *e, CHAR64LONG16 *);
-
-#define nR0(v,w,x,y,z,i) R0(*v,*w,*x,*y,*z,i)
-#define nR1(v,w,x,y,z,i) R1(*v,*w,*x,*y,*z,i)
-#define nR2(v,w,x,y,z,i) R2(*v,*w,*x,*y,*z,i)
-#define nR3(v,w,x,y,z,i) R3(*v,*w,*x,*y,*z,i)
-#define nR4(v,w,x,y,z,i) R4(*v,*w,*x,*y,*z,i)
-
-void
-do_R01(u_int32_t *a, u_int32_t *b, u_int32_t *c, u_int32_t *d, u_int32_t *e, CHAR64LONG16 *block)
-{
-    nR0(a,b,c,d,e, 0); nR0(e,a,b,c,d, 1); nR0(d,e,a,b,c, 2); nR0(c,d,e,a,b, 3);
-    nR0(b,c,d,e,a, 4); nR0(a,b,c,d,e, 5); nR0(e,a,b,c,d, 6); nR0(d,e,a,b,c, 7);
-    nR0(c,d,e,a,b, 8); nR0(b,c,d,e,a, 9); nR0(a,b,c,d,e,10); nR0(e,a,b,c,d,11);
-    nR0(d,e,a,b,c,12); nR0(c,d,e,a,b,13); nR0(b,c,d,e,a,14); nR0(a,b,c,d,e,15);
-    nR1(e,a,b,c,d,16); nR1(d,e,a,b,c,17); nR1(c,d,e,a,b,18); nR1(b,c,d,e,a,19);
-}
-
-void
-do_R2(u_int32_t *a, u_int32_t *b, u_int32_t *c, u_int32_t *d, u_int32_t *e, CHAR64LONG16 *block)
-{
-    nR2(a,b,c,d,e,20); nR2(e,a,b,c,d,21); nR2(d,e,a,b,c,22); nR2(c,d,e,a,b,23);
-    nR2(b,c,d,e,a,24); nR2(a,b,c,d,e,25); nR2(e,a,b,c,d,26); nR2(d,e,a,b,c,27);
-    nR2(c,d,e,a,b,28); nR2(b,c,d,e,a,29); nR2(a,b,c,d,e,30); nR2(e,a,b,c,d,31);
-    nR2(d,e,a,b,c,32); nR2(c,d,e,a,b,33); nR2(b,c,d,e,a,34); nR2(a,b,c,d,e,35);
-    nR2(e,a,b,c,d,36); nR2(d,e,a,b,c,37); nR2(c,d,e,a,b,38); nR2(b,c,d,e,a,39);
-}
-
-void
-do_R3(u_int32_t *a, u_int32_t *b, u_int32_t *c, u_int32_t *d, u_int32_t *e, CHAR64LONG16 *block)
-{
-    nR3(a,b,c,d,e,40); nR3(e,a,b,c,d,41); nR3(d,e,a,b,c,42); nR3(c,d,e,a,b,43);
-    nR3(b,c,d,e,a,44); nR3(a,b,c,d,e,45); nR3(e,a,b,c,d,46); nR3(d,e,a,b,c,47);
-    nR3(c,d,e,a,b,48); nR3(b,c,d,e,a,49); nR3(a,b,c,d,e,50); nR3(e,a,b,c,d,51);
-    nR3(d,e,a,b,c,52); nR3(c,d,e,a,b,53); nR3(b,c,d,e,a,54); nR3(a,b,c,d,e,55);
-    nR3(e,a,b,c,d,56); nR3(d,e,a,b,c,57); nR3(c,d,e,a,b,58); nR3(b,c,d,e,a,59);
-}
-
-void
-do_R4(u_int32_t *a, u_int32_t *b, u_int32_t *c, u_int32_t *d, u_int32_t *e, CHAR64LONG16 *block)
-{
-    nR4(a,b,c,d,e,60); nR4(e,a,b,c,d,61); nR4(d,e,a,b,c,62); nR4(c,d,e,a,b,63);
-    nR4(b,c,d,e,a,64); nR4(a,b,c,d,e,65); nR4(e,a,b,c,d,66); nR4(d,e,a,b,c,67);
-    nR4(c,d,e,a,b,68); nR4(b,c,d,e,a,69); nR4(a,b,c,d,e,70); nR4(e,a,b,c,d,71);
-    nR4(d,e,a,b,c,72); nR4(c,d,e,a,b,73); nR4(b,c,d,e,a,74); nR4(a,b,c,d,e,75);
-    nR4(e,a,b,c,d,76); nR4(d,e,a,b,c,77); nR4(c,d,e,a,b,78); nR4(b,c,d,e,a,79);
-}
-#endif
-
 /*
  * Hash a single 512-bit block. This is the core of the algorithm.
  */
-void SHA1Transform(state, buffer)
-    u_int32_t state[5];
-    const u_char buffer[64];
+void SHA1Transform(uint32_t state[SHA1_DIGEST_LENGTH/4],
+                   const uint8_t buffer[SHA1_BLOCK_SIZE])
 {
-    u_int32_t a, b, c, d, e;
+    uint32_t a, b, c, d, e;
     CHAR64LONG16 *block;
 
 #ifdef SHA1HANDSOFF
@@ -136,7 +72,7 @@
 
 #ifdef SHA1HANDSOFF
     block = &workspace;
-    (void)memcpy(block, buffer, 64);
+    (void)memcpy(block, buffer, SHA1_BLOCK_SIZE);
 #else
     block = (CHAR64LONG16 *)(void *)buffer;
 #endif
@@ -148,12 +84,6 @@
     d = state[3];
     e = state[4];
 
-#ifdef SPARC64_GCC_WORKAROUND
-    do_R01(&a, &b, &c, &d, &e, block);
-    do_R2(&a, &b, &c, &d, &e, block);
-    do_R3(&a, &b, &c, &d, &e, block);
-    do_R4(&a, &b, &c, &d, &e, block);
-#else
     /* 4 rounds of 20 operations each. Loop unrolled. */
     R0(a,b,c,d,e, 0); R0(e,a,b,c,d, 1); R0(d,e,a,b,c, 2); R0(c,d,e,a,b, 3);
     R0(b,c,d,e,a, 4); R0(a,b,c,d,e, 5); R0(e,a,b,c,d, 6); R0(d,e,a,b,c, 7);
@@ -175,7 +105,6 @@
     R4(c,d,e,a,b,68); R4(b,c,d,e,a,69); R4(a,b,c,d,e,70); R4(e,a,b,c,d,71);
     R4(d,e,a,b,c,72); R4(c,d,e,a,b,73); R4(b,c,d,e,a,74); R4(a,b,c,d,e,75);
     R4(e,a,b,c,d,76); R4(d,e,a,b,c,77); R4(c,d,e,a,b,78); R4(b,c,d,e,a,79);
-#endif
 
     /* Add the working vars back into context.state[] */
     state[0] += a;
@@ -192,78 +121,91 @@
 /*
  * SHA1Init - Initialize new context
  */
-void SHA1Init(context)
-    SHA1_CTX *context;
+void SHA1Init(SHA1_CTX *context)
 {
-
     assert(context != 0);
 
     /* SHA1 initialization constants */
-    context->state[0] = 0x67452301;
-    context->state[1] = 0xEFCDAB89;
-    context->state[2] = 0x98BADCFE;
-    context->state[3] = 0x10325476;
-    context->state[4] = 0xC3D2E1F0;
-    context->count[0] = context->count[1] = 0;
+    *context = (SHA1_CTX) {
+        .state = {
+            0x67452301,
+            0xEFCDAB89,
+            0x98BADCFE,
+            0x10325476,
+            0xC3D2E1F0,
+        },
+        .count = 0,
+    };
 }
 
 
 /*
  * Run your data through this.
  */
-void SHA1Update(context, data, len)
-    SHA1_CTX *context;
-    const u_char *data;
-    u_int len;
+void SHA1Update(SHA1_CTX *context, const uint8_t *data, unsigned int len)
 {
-    u_int i, j;
+    unsigned int i, j;
+    unsigned int partial, done;
+    const uint8_t *src;
 
     assert(context != 0);
     assert(data != 0);
 
-    j = context->count[0];
-    if ((context->count[0] += len << 3) < j)
-	context->count[1] += (len>>29)+1;
-    j = (j >> 3) & 63;
-    if ((j + len) > 63) {
-	(void)memcpy(&context->buffer[j], data, (i = 64-j));
-	SHA1Transform(context->state, context->buffer);
-	for ( ; i + 63 < len; i += 64)
-	    SHA1Transform(context->state, &data[i]);
-	j = 0;
-    } else {
-	i = 0;
+    partial = context->count % SHA1_BLOCK_SIZE;
+    context->count += len;
+    done = 0;
+    src = data;
+
+    if ((partial + len) >= SHA1_BLOCK_SIZE) {
+        if (partial) {
+            done = -partial;
+            memcpy(context->buffer + partial, data, done + SHA1_BLOCK_SIZE);
+            src = context->buffer;
+        }
+        do {
+            SHA1Transform(context->state, src);
+            done += SHA1_BLOCK_SIZE;
+            src = data + done;
+        } while (done + SHA1_BLOCK_SIZE <= len);
+        partial = 0;
     }
-    (void)memcpy(&context->buffer[j], &data[i], len - i);
+    memcpy(context->buffer + partial, src, len - done);
 }
 
 
 /*
  * Add padding and return the message digest.
  */
-void SHA1Final(digest, context)
-    u_char digest[20];
-    SHA1_CTX* context;
+void SHA1Final(uint8_t digest[SHA1_DIGEST_LENGTH], SHA1_CTX *context)
 {
-    u_int i;
-    u_char finalcount[8];
+    uint32_t i, index, pad_len;
+    uint64_t bits;
+    static const uint8_t padding[SHA1_BLOCK_SIZE] = { 0x80, };
 
     assert(digest != 0);
     assert(context != 0);
 
-    for (i = 0; i < 8; i++) {
-	finalcount[i] = (u_char)((context->count[(i >= 4 ? 0 : 1)]
-	 >> ((3-(i & 3)) * 8) ) & 255);	 /* Endian independent */
-    }
-    SHA1Update(context, (const u_char *)"\200", 1);
-    while ((context->count[0] & 504) != 448)
-	SHA1Update(context, (const u_char *)"\0", 1);
-    SHA1Update(context, finalcount, 8);  /* Should cause a SHA1Transform() */
+#if BYTE_ORDER == LITTLE_ENDIAN
+    bits = swap64(context->count << 3);
+#else
+    bits = context->count << 3;
+#endif
+
+    /* Pad out to 56 mod 64 */
+    index = context->count & 0x3f;
+    pad_len = (index < 56) ? (56 - index) : ((64 + 56) - index);
+    SHA1Update(context, padding, pad_len);
+
+    /* Append length */
+    SHA1Update(context, (const uint8_t *)&bits, sizeof(bits));
 
     if (digest) {
-	for (i = 0; i < 20; i++)
-	    digest[i] = (u_char)
-		((context->state[i>>2] >> ((3-(i & 3)) * 8) ) & 255);
+        for (i = 0; i < SHA1_DIGEST_LENGTH/4; i++)
+#if BYTE_ORDER == LITTLE_ENDIAN
+            ((uint32_t *)digest)[i] = swap32(context->state[i]);
+#else
+            ((uint32_t *)digest)[i] = context->state[i];
+#endif
     }
 }
 
diff --git a/libc/include/errno.h b/libc/include/errno.h
index 2b2685a..cae0e3b 100644
--- a/libc/include/errno.h
+++ b/libc/include/errno.h
@@ -45,6 +45,7 @@
 extern int    __set_errno(int  error);
 
 /* internal function returning the address of the thread-specific errno */
+__attribute__((const))
 extern volatile int*   __errno(void);
 
 /* a macro expanding to the errno l-value */
diff --git a/libc/include/pthread.h b/libc/include/pthread.h
index 9d05769..56c48ea 100644
--- a/libc/include/pthread.h
+++ b/libc/include/pthread.h
@@ -142,6 +142,7 @@
 int pthread_join(pthread_t thid, void ** ret_val);
 int pthread_detach(pthread_t  thid);
 
+__attribute__((const))
 pthread_t pthread_self(void);
 int pthread_equal(pthread_t one, pthread_t two);
 
diff --git a/libc/include/resolv.h b/libc/include/resolv.h
index bb21c23..7257d34 100644
--- a/libc/include/resolv.h
+++ b/libc/include/resolv.h
@@ -40,7 +40,7 @@
 
 struct res_state;
 
-extern struct __res_state *__res_state(void);
+extern struct __res_state *__res_state(void) __attribute__((const));
 #define _res (*__res_state())
 
 /* Base-64 functions - because some code expects it there */
diff --git a/libc/include/sha1.h b/libc/include/sha1.h
index f7ada46..bc51ac0 100644
--- a/libc/include/sha1.h
+++ b/libc/include/sha1.h
@@ -13,19 +13,20 @@
 #include <sys/types.h>
 
 #define SHA1_DIGEST_LENGTH		20
-#define SHA1_DIGEST_STRING_LENGTH	41
+#define SHA1_BLOCK_SIZE 		64
 
 typedef struct {
-	uint32_t state[5];
-	uint32_t count[2];
-	u_char buffer[64];
+    uint64_t count;
+    uint32_t state[SHA1_DIGEST_LENGTH / 4];
+    uint8_t buffer[SHA1_BLOCK_SIZE];
 } SHA1_CTX;
 
 __BEGIN_DECLS
-void	SHA1Transform(uint32_t[5], const u_char[64]);
+void	SHA1Transform(uint32_t[SHA1_DIGEST_LENGTH/4],
+	              const uint8_t[SHA1_BLOCK_SIZE]);
 void	SHA1Init(SHA1_CTX *);
-void	SHA1Update(SHA1_CTX *, const u_char *, u_int);
-void	SHA1Final(u_char[SHA1_DIGEST_LENGTH], SHA1_CTX *);
+void	SHA1Update(SHA1_CTX *, const uint8_t *, unsigned int);
+void	SHA1Final(uint8_t[SHA1_DIGEST_LENGTH], SHA1_CTX *);
 __END_DECLS
 
 #endif /* _SYS_SHA1_H_ */
diff --git a/libc/netbsd/resolv/res_send.c b/libc/netbsd/resolv/res_send.c
index b118956..1a28d9e 100644
--- a/libc/netbsd/resolv/res_send.c
+++ b/libc/netbsd/resolv/res_send.c
@@ -413,7 +413,7 @@
 				if (EXT(statp).nssocks[ns] == -1)
 					continue;
 				peerlen = sizeof(peer);
-				if (getsockname(EXT(statp).nssocks[ns],
+				if (getpeername(EXT(statp).nssocks[ns],
 				    (struct sockaddr *)(void *)&peer, &peerlen) < 0) {
 					needclose++;
 					break;
diff --git a/libc/string/memmove.c b/libc/string/memmove.c
index 072104b..7c1e9b2 100644
--- a/libc/string/memmove.c
+++ b/libc/string/memmove.c
@@ -32,10 +32,10 @@
 {
   const char *p = src;
   char *q = dst;
-  /* We can use the optimized memcpy if the destination is below the
-   * source (i.e. q < p), or if it is completely over it (i.e. q >= p+n).
+  /* We can use the optimized memcpy if the destination is completely below the
+   * source (i.e. q+n <= p), or if it is completely over it (i.e. q >= p+n).
    */
-  if (__builtin_expect((q < p) || ((size_t)(q - p) >= n), 1)) {
+  if (__builtin_expect((q + n < p) || (q >= p + n), 1)) {
     return memcpy(dst, src, n);
   } else {
     bcopy(src, dst, n);
diff --git a/linker/linker.c b/linker/linker.c
index e0a8a18..503a192 100644
--- a/linker/linker.c
+++ b/linker/linker.c
@@ -428,9 +428,16 @@
     while(*name) {
         h = (h << 4) + *name++;
         g = h & 0xf0000000;
-        h ^= g;
+        /* The hash algorithm in the ELF ABI is as follows:
+         *   if (g != 0)
+         *       h ^=g >> 24;
+         *   h &= ~g;
+         * But we can use the equivalent and faster implementation:
+         */
         h ^= g >> 24;
     }
+    /* Lift the operation out of the inner loop */
+    h &= 0x0fffffff;
     return h;
 }