Reorganize static TLS memory for ELF TLS

For ELF TLS "local-exec" accesses, the static linker assumes that an
executable's TLS segment is located at a statically-known offset from the
thread pointer (i.e. "variant 1" for ARM and "variant 2" for x86).
Because these layouts are incompatible, Bionic generally needs to allocate
its TLS slots differently between different architectures.

To allow per-architecture TLS slots:
 - Replace the TLS_SLOT_xxx enumerators with macros. New ARM slots are
   generally negative, while new x86 slots are generally positive.
 - Define a bionic_tcb struct that provides two things:
    - a void* raw_slots_storage[BIONIC_TLS_SLOTS] field
    - an inline accessor function: void*& tls_slot(size_t tpindex);

For ELF TLS, it's necessary to allocate a temporary TCB (i.e. TLS slots),
because the runtime linker doesn't know how large the static TLS area is
until after it has loaded all of the initial solibs.

To accommodate Golang, it's necessary to allocate the pthread keys at a
fixed, small, positive offset from the thread pointer.

This CL moves the pthread keys into bionic_tls, then allocates a single
mapping per thread that looks like so:
 - stack guard
 - stack [omitted for main thread and with pthread_attr_setstack]
 - static TLS:
    - bionic_tcb [exec TLS will either precede or succeed the TCB]
    - bionic_tls [prefixed by the pthread keys]
    - [solib TLS segments will be placed here]
 - guard page

As before, if the new mapping includes a stack, the pthread_internal_t
is allocated on it.

At startup, Bionic allocates a temporary bionic_tcb object on the stack,
then allocates a temporary bionic_tls object using mmap. This mmap is
delayed because the linker can't currently call async_safe_fatal() before
relocating itself.

Later, Bionic allocates a stack-less thread mapping for the main thread,
and copies slots from the temporary TCB to the new TCB.
(See *::copy_from_bootstrap methods.)

Bug: http://b/78026329
Test: bionic unit tests
Test: verify that a Golang app still works
Test: verify that a Golang app crashes if bionic_{tls,tcb} are swapped
Merged-In: I6543063752f4ec8ef6dc9c7f2a06ce2a18fc5af3
Change-Id: I6543063752f4ec8ef6dc9c7f2a06ce2a18fc5af3
(cherry picked from commit 1e660b70da625fcbf1e43dfae09b7b4817fa1660)
diff --git a/libc/private/bionic_tls.h b/libc/private/bionic_tls.h
index 4749cee..90914c3 100644
--- a/libc/private/bionic_tls.h
+++ b/libc/private/bionic_tls.h
@@ -26,8 +26,7 @@
  * SUCH DAMAGE.
  */
 
-#ifndef __BIONIC_PRIVATE_BIONIC_TLS_H_
-#define __BIONIC_PRIVATE_BIONIC_TLS_H_
+#pragma once
 
 #include <locale.h>
 #include <mntent.h>
@@ -35,73 +34,48 @@
 #include <sys/cdefs.h>
 #include <sys/param.h>
 
+#include "bionic_asm_tls.h"
 #include "bionic_macros.h"
 #include "__get_tls.h"
 #include "grp_pwd.h"
 
-__BEGIN_DECLS
-
 /** WARNING WARNING WARNING
  **
- ** This header file is *NOT* part of the public Bionic ABI/API
- ** and should not be used/included by user-serviceable parts of
- ** the system (e.g. applications).
- **
- ** It is only provided here for the benefit of the system dynamic
- ** linker and the OpenGL sub-system (which needs to access the
- ** pre-allocated slot directly for performance reason).
+ ** This header file is *NOT* part of the public Bionic ABI/API and should not
+ ** be used/included by user-serviceable parts of the system (e.g.
+ ** applications).
  **/
 
-// Well-known TLS slots. What data goes in which slot is arbitrary unless otherwise noted.
-enum {
-  TLS_SLOT_SELF = 0, // The kernel requires this specific slot for x86.
-  TLS_SLOT_THREAD_ID,
+class pthread_internal_t;
 
-  // TLS slot 2 was used for errno but is now free.
+// This struct is small, so the linker can allocate a temporary copy on its
+// stack. It can't be combined with pthread_internal_t because:
+//  - native bridge requires pthread_internal_t to have the same layout across
+//    architectures, and
+//  - On x86, this struct would have to be placed at the front of
+//    pthread_internal_t, moving fields like `tid`.
+//  - We'd like to avoid having a temporary pthread_internal_t object that
+//    needs to be transferred once the final size of static TLS is known.
+struct bionic_tcb {
+  void* raw_slots_storage[BIONIC_TLS_SLOTS];
 
-  // These two aren't used by bionic itself, but allow the graphics code to
-  // access TLS directly rather than using the pthread API.
-  TLS_SLOT_OPENGL_API = 3,
-  TLS_SLOT_OPENGL = 4,
+  // Return a reference to a slot given its TP-relative TLS_SLOT_xxx index.
+  // The thread pointer (i.e. __get_tls()) points at &tls_slot(0).
+  void*& tls_slot(size_t tpindex) {
+    return raw_slots_storage[tpindex - MIN_TLS_SLOT];
+  }
 
-  TLS_SLOT_STACK_GUARD = 5, // GCC requires this specific slot for x86.
+  // Initialize the main thread's final object using its bootstrap object.
+  void copy_from_bootstrap(const bionic_tcb* boot) {
+    // Copy everything. Problematic slots will be reinitialized.
+    *this = *boot;
+  }
 
-  // Lets sanitizers avoid using pthread_getspecific for finding the current
-  // thread state. (Slot 6 was historically used for dlerror instead.)
-  TLS_SLOT_SANITIZER = 6,
-
-  // Fast storage for Thread::Current() in ART.
-  TLS_SLOT_ART_THREAD_SELF = 7,
-
-  // Lets TSAN avoid using pthread_getspecific for finding the current thread
-  // state.
-  TLS_SLOT_TSAN = 8,
-
-  BIONIC_TLS_SLOTS // Must come last!
+  pthread_internal_t* thread() {
+    return static_cast<pthread_internal_t*>(tls_slot(TLS_SLOT_THREAD_ID));
+  }
 };
 
-// ~3 pages.
-struct bionic_tls {
-  locale_t locale;
-
-  char basename_buf[MAXPATHLEN];
-  char dirname_buf[MAXPATHLEN];
-
-  mntent mntent_buf;
-  char mntent_strings[BUFSIZ];
-
-  char ptsname_buf[32];
-  char ttyname_buf[64];
-
-  char strerror_buf[NL_TEXTMAX];
-  char strsignal_buf[NL_TEXTMAX];
-
-  group_state_t group;
-  passwd_state_t passwd;
-};
-
-#define BIONIC_TLS_SIZE (__BIONIC_ALIGN(sizeof(bionic_tls), PAGE_SIZE))
-
 /*
  * Bionic uses some pthread keys internally. All pthread keys used internally
  * should be created in constructors, except for keys that may be used in or
@@ -126,12 +100,42 @@
  */
 #define BIONIC_PTHREAD_KEY_COUNT (BIONIC_PTHREAD_KEY_RESERVED_COUNT + PTHREAD_KEYS_MAX)
 
-__END_DECLS
+class pthread_key_data_t {
+ public:
+  uintptr_t seq; // Use uintptr_t just for alignment, as we use pointer below.
+  void* data;
+};
 
-#if defined(__cplusplus)
+// ~3 pages. This struct is allocated as static TLS memory (i.e. at a fixed
+// offset from the thread pointer).
+struct bionic_tls {
+  pthread_key_data_t key_data[BIONIC_PTHREAD_KEY_COUNT];
+
+  locale_t locale;
+
+  char basename_buf[MAXPATHLEN];
+  char dirname_buf[MAXPATHLEN];
+
+  mntent mntent_buf;
+  char mntent_strings[BUFSIZ];
+
+  char ptsname_buf[32];
+  char ttyname_buf[64];
+
+  char strerror_buf[NL_TEXTMAX];
+  char strsignal_buf[NL_TEXTMAX];
+
+  group_state_t group;
+  passwd_state_t passwd;
+
+  // Initialize the main thread's final object using its bootstrap object.
+  void copy_from_bootstrap(const bionic_tls* boot __attribute__((unused))) {
+    // Nothing in bionic_tls needs to be preserved in the transition to the
+    // final TLS objects, so don't copy anything.
+  }
+};
+
 class KernelArgumentBlock;
-extern void __libc_init_main_thread_early(KernelArgumentBlock& args);
-extern void __libc_init_main_thread_late();
-#endif
-
-#endif /* __BIONIC_PRIVATE_BIONIC_TLS_H_ */
+extern "C" void __libc_init_main_thread_early(const KernelArgumentBlock& args, bionic_tcb* temp_tcb);
+extern "C" void __libc_init_main_thread_late();
+extern "C" void __libc_init_main_thread_final();