Added _memmove_words
Added a memmove() variant for Dalvik's System.arraycopy()
implementation. It guarantees 16-bit or 32-bit atomicity depending
on the alignment of the arguments.
Bug 3398352
Change-Id: Ie7bd246305ef0ff8290513663327c5b81680368d
diff --git a/libc/bionic/memmove_words.c b/libc/bionic/memmove_words.c
new file mode 100644
index 0000000..22058bc
--- /dev/null
+++ b/libc/bionic/memmove_words.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (C) 2011 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdlib.h>
+#include <assert.h>
+
+/*
+ * Works like memmove() (overlapping buffers are handled), except:
+ * - if all arguments are at least 32-bit aligned, we guarantee that we
+ * will use operations that preserve atomicity of 32-bit values
+ * - if not, we guarantee atomicity of 16-bit values
+ * In practice: dest and src congruent modulo 4 => aligned 32-bit accesses
+ * plus at most one 16-bit access at each end; otherwise 16-bit accesses only.
+ * dest, src and n must all be even. Only the assert() below checks this, so
+ * with assertions compiled out odd values give undefined behavior.
+ * NOTE(review): uintptr_t, uint16_t and uint32_t are normally declared by
+ * <stdint.h>, which is not among this file's includes -- confirm it builds.
+ * TODO: add loop for 64-bit alignment
+ * TODO: use __builtin_prefetch
+ * TODO: write an ARM-optimized version
+ */
+void _memmove_words(void* dest, const void* src, size_t n)
+{
+ assert((((uintptr_t) dest | (uintptr_t) src | n) & 0x01) == 0);
+
+ char* d = (char*) dest;
+ const char* s = (const char*) src;
+ size_t copyCount;
+
+ /*
+ * Nothing to move, or source and destination are the same buffer.
+ * Past this point n is non-zero and (per the precondition) even, so
+ * n >= 2, which the single 16-bit alignment copy below relies on.
+ */
+ if (n == 0 || d == s)
+ return;
+
+ /*
+ * Decide the copy direction. A forward copy (*dest++ = *src++) is safe
+ * when dest starts below src, because the reader then stays ahead of
+ * the writer even if the buffers overlap, and when dest starts at or
+ * past the end of the source (d - s >= n), because they do not overlap.
+ * Otherwise dest begins inside the source buffer and we must copy
+ * backward. Forward is marked as the expected case for the compiler.
+ */
+ if (__builtin_expect((d < s) || ((size_t)(d - s) >= n), 1)) {
+ /*
+ * Copy forward. We prefer 32-bit loads and stores even for 16-bit
+ * data, so first check whether either pointer is off a 4-byte boundary.
+ */
+ if ((((uintptr_t) d | (uintptr_t) s) & 0x03) != 0) {
+ /*
+ * Not 32-bit aligned. Two possibilities:
+ * (1) Congruent mod 4: one 16-bit copy aligns both pointers to 32 bits
+ * (2) Non-congruent, we can do one of:
+ * a. copy whole buffer as a series of 16-bit values
+ * b. load/store 32 bits, using shifts to ensure alignment
+ * c. just copy the data as 32-bit values and assume the CPU
+ * will do a reasonable job
+ * We're currently using (a), which is suboptimal. copyCount is in bytes
+ * here; (a) leaves n == 0, so the 32-bit loop and leftover check do nothing.
+ */
+ if ((((uintptr_t) d ^ (uintptr_t) s) & 0x03) != 0) {
+ copyCount = n;
+ } else {
+ copyCount = 2;
+ }
+ n -= copyCount;
+ copyCount /= sizeof(uint16_t);
+
+ while (copyCount--) {
+ *(uint16_t*)d = *(uint16_t*)s;
+ d += sizeof(uint16_t);
+ s += sizeof(uint16_t);
+ }
+ }
+
+ /*
+ * Copy 32-bit words; if n is still non-zero, d and s are 4-byte aligned.
+ */
+ copyCount = n / sizeof(uint32_t);
+ while (copyCount--) {
+ *(uint32_t*)d = *(uint32_t*)s;
+ d += sizeof(uint32_t);
+ s += sizeof(uint32_t);
+ }
+
+ /*
+ * Check for leftovers. Either we finished exactly, or bit 1 of n is
+ * set and one 16-bit chunk remains at the end of the buffers.
+ */
+ if ((n & 0x02) != 0) {
+ *(uint16_t*)d = *(uint16_t*)s;
+ }
+ } else {
+ /*
+ * Copy backward from the end; alignment is tested on the end pointers.
+ */
+ d += n;
+ s += n;
+
+ if ((((uintptr_t) d | (uintptr_t) s) & 0x03) != 0) {
+ /* congruent: one 16-bit copy aligns; otherwise copy all as 16-bit */
+ if ((((uintptr_t) d ^ (uintptr_t) s) & 0x03) != 0) {
+ copyCount = n;
+ } else {
+ copyCount = 2;
+ }
+ n -= copyCount;
+ copyCount /= sizeof(uint16_t);
+
+ while (copyCount--) {
+ d -= sizeof(uint16_t);
+ s -= sizeof(uint16_t);
+ *(uint16_t*)d = *(uint16_t*)s;
+ }
+ }
+
+ /* copy 32-bit aligned words, pre-decrementing the pointers */
+ copyCount = n / sizeof(uint32_t);
+ while (copyCount--) {
+ d -= sizeof(uint32_t);
+ s -= sizeof(uint32_t);
+ *(uint32_t*)d = *(uint32_t*)s;
+ }
+
+ /* copy leftovers: at most one 16-bit value at the start of the buffers */
+ if ((n & 0x02) != 0) {
+ d -= sizeof(uint16_t);
+ s -= sizeof(uint16_t);
+ *(uint16_t*)d = *(uint16_t*)s;
+ }
+ }
+}