| /* | 
 |  * Copyright (C) 2017 The Android Open Source Project | 
 |  * All rights reserved. | 
 |  * | 
 |  * Redistribution and use in source and binary forms, with or without | 
 |  * modification, are permitted provided that the following conditions | 
 |  * are met: | 
 |  *  * Redistributions of source code must retain the above copyright | 
 |  *    notice, this list of conditions and the following disclaimer. | 
 |  *  * Redistributions in binary form must reproduce the above copyright | 
 |  *    notice, this list of conditions and the following disclaimer in | 
 |  *    the documentation and/or other materials provided with the | 
 |  *    distribution. | 
 |  * | 
 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | 
 |  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | 
 |  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS | 
 |  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE | 
 |  * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, | 
 |  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, | 
 |  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS | 
 |  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED | 
 |  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | 
 |  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT | 
 |  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | 
 |  * SUCH DAMAGE. | 
 |  */ | 
 |  | 
 | #include <iconv.h> | 
 |  | 
 | #include <ctype.h> | 
 | #include <endian.h> | 
 | #include <errno.h> | 
 | #include <stdlib.h> | 
 | #include <string.h> | 
 | #include <uchar.h> | 
 |  | 
 | #include "private/bionic_mbstate.h" | 
 |  | 
 | #define INVALID_ICONV_T reinterpret_cast<iconv_t>(-1) | 
 |  | 
 | // Ideally we'd use icu4c but the API mismatch seems too great. So we just offer something | 
 | // equivalent to (but slightly easier to use for runs of text than) <uchar.h>. If you're | 
 | // here to add more encodings, consider working on finishing the icu4c NDK wrappers instead. | 
 | enum Encoding { | 
 |   US_ASCII, | 
 |   UTF_8, | 
 |   UTF_16_LE, | 
 |   UTF_16_BE, | 
 |   UTF_32_LE, | 
 |   UTF_32_BE, | 
 |   WCHAR_T, | 
 | }; | 
 |  | 
 | enum Mode { | 
 |   ERROR, | 
 |   IGNORE, | 
 |   TRANSLIT, | 
 | }; | 
 |  | 
 | // This matching is strange but true. | 
 | // See http://www.unicode.org/reports/tr22/#Charset_Alias_Matching. | 
 | static bool __match_encoding(const char* lhs, const char* rhs) { | 
 |   while (*lhs && *rhs) { | 
 |     // Skip non-alnum in lhs; "UTF-8", "UTF_8", "UTF8", "UTF 8" are all equivalent. | 
 |     // Also implement the "delete each 0 that is not preceded by a digit" rule. | 
 |     for (; *lhs; ++lhs) { | 
 |       if (isalnum(*lhs) && (*lhs != '0' || !isdigit(*(lhs + 1)))) break; | 
 |     } | 
 |     // Case doesn't matter either. | 
 |     if (tolower(*lhs) != tolower(*rhs)) break; | 
 |     ++lhs; | 
 |     ++rhs; | 
 |   } | 
 |   // As a special case we treat the GNU "//" extensions as end of string. | 
 |   if ((*lhs == '\0' || strstr(lhs, "//") == lhs) && *rhs == '\0') return true; | 
 |   return false; | 
 | } | 
 |  | 
 | static bool __parse_encoding(const char* s, Encoding* encoding, Mode* mode) { | 
 |   const char* suffix = strstr(s, "//"); | 
 |   if (suffix) { | 
 |     if (!mode) return false; | 
 |     if (strcmp(suffix, "//IGNORE") == 0) { | 
 |       *mode = IGNORE; | 
 |     } else if (strcmp(suffix, "//TRANSLIT") == 0) { | 
 |       *mode = TRANSLIT; | 
 |     } else { | 
 |       return false; | 
 |     } | 
 |   } | 
 |   if (__match_encoding(s, "utf8")) { | 
 |     *encoding = UTF_8; | 
 |   } else if (__match_encoding(s, "ascii") || __match_encoding(s, "usascii")) { | 
 |     *encoding = US_ASCII; | 
 |   } else if (__match_encoding(s, "utf16le")) { | 
 |     *encoding = UTF_16_LE; | 
 |   } else if (__match_encoding(s, "utf16be")) { | 
 |     *encoding = UTF_16_BE; | 
 |   } else if (__match_encoding(s, "utf32le")) { | 
 |     *encoding = UTF_32_LE; | 
 |   } else if (__match_encoding(s, "utf32be")) { | 
 |     *encoding = UTF_32_BE; | 
 |   } else if (__match_encoding(s, "wchart")) { | 
 |     *encoding = WCHAR_T; | 
 |   } else { | 
 |     return false; | 
 |   } | 
 |   return true; | 
 | } | 
 |  | 
 | struct __iconv_t { | 
 |   Encoding src_encoding; | 
 |   Encoding dst_encoding; | 
 |   Mode mode; | 
 |  | 
 |   __iconv_t() : mode(ERROR) { | 
 |   } | 
 |  | 
 |   int Convert(char** src_buf0, size_t* src_bytes_left0, char** dst_buf0, size_t* dst_bytes_left0) { | 
 |     // Reset state. | 
 |     wc = 0; | 
 |     memset(&ps, 0, sizeof(ps)); | 
 |     replacement_count = 0; | 
 |     ignored = false; | 
 |     src_buf = src_buf0; | 
 |     src_bytes_left = src_bytes_left0; | 
 |     dst_buf = dst_buf0; | 
 |     dst_bytes_left = dst_bytes_left0; | 
 |  | 
 |     while (*src_bytes_left > 0) { | 
 |       if (!GetNext() || !Convert()) return -1; | 
 |     } | 
 |     return Done(); | 
 |   } | 
 |  | 
 |  private: | 
 |   char32_t wc; | 
 |   char buf[16]; | 
 |   size_t src_bytes_used; | 
 |   size_t dst_bytes_used; | 
 |   mbstate_t ps; | 
 |  | 
 |   size_t replacement_count; | 
 |   bool ignored; | 
 |  | 
 |   char** src_buf; | 
 |   size_t* src_bytes_left; | 
 |   char** dst_buf; | 
 |   size_t* dst_bytes_left; | 
 |  | 
 |   bool GetNext() { | 
 |     errno = 0; | 
 |     switch (src_encoding) { | 
 |       case US_ASCII: | 
 |         wc = **src_buf; | 
 |         src_bytes_used = 1; | 
 |         if (wc > 0x7f) errno = EILSEQ; | 
 |         break; | 
 |  | 
 |       case UTF_8: | 
 |         src_bytes_used = mbrtoc32(&wc, *src_buf, *src_bytes_left, &ps); | 
 |         if (src_bytes_used == __MB_ERR_ILLEGAL_SEQUENCE) { | 
 |           break;  // EILSEQ already set. | 
 |         } else if (src_bytes_used == __MB_ERR_INCOMPLETE_SEQUENCE) { | 
 |           errno = EINVAL; | 
 |           return false; | 
 |         } | 
 |         break; | 
 |  | 
 |       case UTF_16_BE: | 
 |       case UTF_16_LE: { | 
 |         if (*src_bytes_left < 2) { | 
 |           errno = EINVAL; | 
 |           return false; | 
 |         } | 
 |         bool swap = (src_encoding == UTF_16_BE); | 
 |         wc = In16(*src_buf, swap); | 
 |         // 0xd800-0xdbff: high surrogates | 
 |         // 0xdc00-0xdfff: low surrogates | 
 |         if (wc >= 0xd800 && wc <= 0xdfff) { | 
 |           if (wc >= 0xdc00) {  // Low surrogate before high surrogate. | 
 |             errno = EILSEQ; | 
 |             return false; | 
 |           } | 
 |           if (*src_bytes_left < 4) { | 
 |             errno = EINVAL; | 
 |             return false; | 
 |           } | 
 |           uint16_t hi = wc; | 
 |           uint16_t lo = In16(*src_buf + 2, swap); | 
 |           wc = 0x10000 + ((hi - 0xd800) << 10) + (lo - 0xdc00); | 
 |           src_bytes_used = 4; | 
 |         } | 
 |         break; | 
 |       } | 
 |  | 
 |       case UTF_32_BE: | 
 |       case UTF_32_LE: | 
 |       case WCHAR_T: | 
 |         if (*src_bytes_left < 4) { | 
 |           errno = EINVAL; | 
 |           return false; | 
 |         } | 
 |         wc = In32(*src_buf, (src_encoding == UTF_32_BE)); | 
 |         break; | 
 |     } | 
 |  | 
 |     if (errno == EILSEQ) { | 
 |       switch (mode) { | 
 |         case ERROR: | 
 |           return false; | 
 |         case IGNORE: | 
 |           *src_buf += src_bytes_used; | 
 |           *src_bytes_left -= src_bytes_used; | 
 |           ignored = true; | 
 |           return GetNext(); | 
 |         case TRANSLIT: | 
 |           wc = '?'; | 
 |           ++replacement_count; | 
 |           return true; | 
 |       } | 
 |     } | 
 |     return true; | 
 |   } | 
 |  | 
 |   bool Convert() { | 
 |     errno = 0; | 
 |     switch (dst_encoding) { | 
 |       case US_ASCII: | 
 |         buf[0] = wc; | 
 |         dst_bytes_used = 1; | 
 |         if (wc > 0x7f) errno = EILSEQ; | 
 |         break; | 
 |  | 
 |       case UTF_8: | 
 |         dst_bytes_used = c32rtomb(buf, wc, &ps); | 
 |         if (dst_bytes_used == __MB_ERR_ILLEGAL_SEQUENCE) { | 
 |           break;  // EILSEQ already set. | 
 |         } else if (dst_bytes_used == __MB_ERR_INCOMPLETE_SEQUENCE) { | 
 |           errno = EINVAL; | 
 |           return false; | 
 |         } | 
 |         break; | 
 |  | 
 |       case UTF_16_BE: | 
 |       case UTF_16_LE: { | 
 |         bool swap = (dst_encoding == UTF_16_BE); | 
 |         if (wc < 0x10000) {  // BMP. | 
 |           Out16(buf, wc, swap); | 
 |         } else {  // Supplementary plane; output surrogate pair. | 
 |           wc -= 0x10000; | 
 |           char16_t hi = 0xd800 | (wc >> 10); | 
 |           char16_t lo = 0xdc00 | (wc & 0x3ff); | 
 |           Out16(buf + 0, hi, swap); | 
 |           Out16(buf + 2, lo, swap); | 
 |           dst_bytes_used = 4; | 
 |         } | 
 |       } break; | 
 |  | 
 |       case UTF_32_BE: | 
 |       case UTF_32_LE: | 
 |       case WCHAR_T: | 
 |         Out32(wc, (dst_encoding == UTF_32_BE)); | 
 |         break; | 
 |     } | 
 |  | 
 |     if (errno == EILSEQ) { | 
 |       if (mode == IGNORE) { | 
 |         *src_buf += src_bytes_used; | 
 |         *src_bytes_left -= src_bytes_used; | 
 |         ignored = true; | 
 |         return true; | 
 |       } else if (mode == TRANSLIT) { | 
 |         wc = '?'; | 
 |         ++replacement_count; | 
 |         return Convert(); | 
 |       } | 
 |       return false; | 
 |     } | 
 |  | 
 |     return Emit(); | 
 |   } | 
 |  | 
 |   uint16_t In16(const char* buf, bool swap) { | 
 |     const uint8_t* src = reinterpret_cast<const uint8_t*>(buf); | 
 |     uint16_t wc = (src[0]) | (src[1] << 8); | 
 |     if (swap) wc = __swap16(wc); | 
 |     src_bytes_used = 2; | 
 |     return wc; | 
 |   } | 
 |  | 
 |   uint32_t In32(const char* buf, bool swap) { | 
 |     const uint8_t* src = reinterpret_cast<const uint8_t*>(buf); | 
 |     uint32_t wc = (src[0]) | (src[1] << 8) | (src[2] << 16) | (src[3] << 24); | 
 |     if (swap) wc = __swap32(wc); | 
 |     src_bytes_used = 4; | 
 |     return wc; | 
 |   } | 
 |  | 
 |   void Out16(char* dst, char16_t ch, bool swap) { | 
 |     if (swap) ch = __swap16(ch); | 
 |     dst[0] = ch; | 
 |     dst[1] = ch >> 8; | 
 |     dst_bytes_used = 2; | 
 |   } | 
 |  | 
 |   void Out32(char32_t ch, bool swap) { | 
 |     if (swap) ch = __swap32(ch); | 
 |     buf[0] = ch; | 
 |     buf[1] = ch >> 8; | 
 |     buf[2] = ch >> 16; | 
 |     buf[3] = ch >> 24; | 
 |     dst_bytes_used = 4; | 
 |   } | 
 |  | 
 |   bool Emit() { | 
 |     if (dst_bytes_used > *dst_bytes_left) { | 
 |       errno = E2BIG; | 
 |       return false; | 
 |     } | 
 |  | 
 |     memcpy(*dst_buf, buf, dst_bytes_used); | 
 |     *src_buf += src_bytes_used; | 
 |     *src_bytes_left -= src_bytes_used; | 
 |     *dst_buf += dst_bytes_used; | 
 |     *dst_bytes_left -= dst_bytes_used; | 
 |     return true; | 
 |   } | 
 |  | 
 |   int Done() { | 
 |     if (mode == TRANSLIT) return replacement_count; | 
 |     if (ignored) { | 
 |       errno = EILSEQ; | 
 |       return -1; | 
 |     } | 
 |     return 0; | 
 |   } | 
 | }; | 
 |  | 
 | iconv_t iconv_open(const char* __dst_encoding, const char* __src_encoding) { | 
 |   iconv_t result = new __iconv_t; | 
 |   if (!__parse_encoding(__src_encoding, &result->src_encoding, nullptr) || | 
 |       !__parse_encoding(__dst_encoding, &result->dst_encoding, &result->mode)) { | 
 |     delete result; | 
 |     errno = EINVAL; | 
 |     return INVALID_ICONV_T; | 
 |   } | 
 |   return result; | 
 | } | 
 |  | 
 | size_t iconv(iconv_t __converter, | 
 |              char** __src_buf, size_t* __src_bytes_left, | 
 |              char** __dst_buf, size_t* __dst_bytes_left) { | 
 |   if (__converter == INVALID_ICONV_T) { | 
 |     errno = EBADF; | 
 |     return -1; | 
 |   } | 
 |   return __converter->Convert(__src_buf, __src_bytes_left, __dst_buf, __dst_bytes_left); | 
 | } | 
 |  | 
 | int iconv_close(iconv_t __converter) { | 
 |   if (__converter == INVALID_ICONV_T) { | 
 |     errno = EBADF; | 
 |     return -1; | 
 |   } | 
 |   delete __converter; | 
 |   return 0; | 
 | } |