complus/utf8.c

   1 /*
   2  * UTF-8 support routines
   3  *
   4  * Copyright 2000 Alexandre Julliard
   5  *
   6  * Taken from WINE, so the usual WINE copyright applies:
   7 Permission is hereby granted, free of charge, to any person obtaining a copy
   8 of this software and associated documentation files (the "Software"), to deal
   9 in the Software without restriction, including without limitation the rights
  10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11 copies of the Software, and to permit persons to whom the Software is
  12 furnished to do so, subject to the following conditions:
  13
  14 The above copyright notice and this permission notice shall be included in
  15 all copies or substantial portions of the Software.
  16
  17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
  20 COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
  21 IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 #include <stdio.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include <windows.h>
  29
  30 #include <ole2.h>
  31
/*
 * Number of trailing (continuation) bytes that follow a given first byte.
 * The table only covers bytes above 0x7f and is indexed by (byte - 0x80).
 * An entry of 0 marks a byte that cannot start a multi-byte sequence:
 * the continuation range 0x80-0xbf and the bytes 0xfe/0xff.
 */
static const char utf8_length[128] =
{
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80-0x8f */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x90-0x9f */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xa0-0xaf */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xb0-0xbf */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xc0-0xcf */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xd0-0xdf */
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xe0-0xef */
    3,3,3,3,3,3,3,3,4,4,4,4,5,5,0,0  /* 0xf0-0xff */
};
  44
/*
 * Mask selecting the payload bits of the first byte of a sequence,
 * indexed by the number of trailing bytes (0-5) taken from utf8_length.
 */
static const unsigned char utf8_mask[6] = { 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
  47
/*
 * Minimum Unicode value that a sequence may decode to, indexed by the number
 * of trailing bytes (0-5). utf8_mbstowcs treats a decoded value below this
 * minimum as an invalid (overlong) encoding.
 */
static const unsigned int utf8_minval[6] = { 0x0, 0x80, 0x800, 0x10000, 0x200000, 0x4000000 };
  50
  51
  52 /* query necessary dst length for src string */
  53 inline static int get_length_wcs_utf8( const WCHAR *src, unsigned int srclen )
  54 {
  55     int len;
  56
  57     for (len = 0; srclen; srclen--, src++, len++)
  58     {
  59         if (*src >= 0x80)
  60         {
  61             len++;
  62             if (*src >= 0x800) len++;
  63         }
  64     }
  65     return len;
  66 }
  67
  68 /* wide char to UTF-8 string conversion */
  69 /* return -1 on dst buffer overflow */
  70 int utf8_wcstombs( const WCHAR *src, int srclen, char *dst, int dstlen )
  71 {
  72     char *orig_dst = dst;
  73
  74     if (!dstlen) return get_length_wcs_utf8( src, srclen );
  75
  76     for (; srclen; srclen--, src++)
  77     {
  78         WCHAR ch = *src;
  79
  80         if (ch < 0x80)  /* 0x00-0x7f: 1 byte */
  81         {
  82             if (!dstlen--) return -1;  /* overflow */
  83             *dst++ = ch;
  84             continue;
  85         }
  86
  87         if (ch < 0x800)  /* 0x80-0x7ff: 2 bytes */
  88         {
  89             if ((dstlen -= 2) < 0) return -1;  /* overflow */
  90             dst[1] = 0x80 | (ch & 0x3f);
  91             ch >>= 6;
  92             dst[0] = 0xc0 | ch;
  93             dst += 2;
  94             continue;
  95         }
  96
  97         /* 0x800-0xffff: 3 bytes */
  98
  99         if ((dstlen -= 3) < 0) return -1;  /* overflow */
 100         dst[2] = 0x80 | (ch & 0x3f);
 101         ch >>= 6;
 102         dst[1] = 0x80 | (ch & 0x3f);
 103         ch >>= 6;
 104         dst[0] = 0xe0 | ch;
 105         dst += 3;
 106     }
 107     return dst - orig_dst;
 108 }
 109
 110 /* query necessary dst length for src string */
 111 inline static int get_length_mbs_utf8( const unsigned char *src, int srclen )
 112 {
 113     int ret;
 114     const unsigned char *srcend = src + srclen;
 115
 116     for (ret = 0; src < srcend; ret++)
 117     {
 118         unsigned char ch = *src++;
 119         if (ch < 0xc0) continue;
 120
 121         switch(utf8_length[ch-0x80])
 122         {
 123         case 5:
 124             if (src >= srcend) return ret;  /* ignore partial char */
 125             if ((ch = *src ^ 0x80) >= 0x40) continue;
 126             src++;
 127         case 4:
 128             if (src >= srcend) return ret;  /* ignore partial char */
 129             if ((ch = *src ^ 0x80) >= 0x40) continue;
 130             src++;
 131         case 3:
 132             if (src >= srcend) return ret;  /* ignore partial char */
 133             if ((ch = *src ^ 0x80) >= 0x40) continue;
 134             src++;
 135         case 2:
 136             if (src >= srcend) return ret;  /* ignore partial char */
 137             if ((ch = *src ^ 0x80) >= 0x40) continue;
 138             src++;
 139         case 1:
 140             if (src >= srcend) return ret;  /* ignore partial char */
 141             if ((ch = *src ^ 0x80) >= 0x40) continue;
 142             src++;
 143         }
 144     }
 145     return ret;
 146 }
 147
 148 /* UTF-8 to wide char string conversion */
 149 /* return -1 on dst buffer overflow, -2 on invalid input char */
 150 int utf8_mbstowcs( int flags, const char *src, int srclen, WCHAR *dst, int dstlen )
 151 {
 152     int len, count;
 153     unsigned int res;
 154     const char *srcend = src + srclen;
 155
 156     if (!dstlen) return get_length_mbs_utf8( src, srclen );
 157
 158     for (count = dstlen; count && (src < srcend); count--, dst++)
 159     {
 160         unsigned char ch = *src++;
 161         if (ch < 0x80)  /* special fast case for 7-bit ASCII */
 162         {
 163             *dst = ch;
 164             continue;
 165         }
 166         len = utf8_length[ch-0x80];
 167         res = ch & utf8_mask[len];
 168
 169         switch(len)
 170         {
 171         case 5:
 172             if (src >= srcend) goto done;  /* ignore partial char */
 173             if ((ch = *src ^ 0x80) >= 0x40) goto bad;
 174             res = (res << 6) | ch;
 175             src++;
 176         case 4:
 177             if (src >= srcend) goto done;  /* ignore partial char */
 178             if ((ch = *src ^ 0x80) >= 0x40) goto bad;
 179             res = (res << 6) | ch;
 180             src++;
 181         case 3:
 182             if (src >= srcend) goto done;  /* ignore partial char */
 183             if ((ch = *src ^ 0x80) >= 0x40) goto bad;
 184             res = (res << 6) | ch;
 185             src++;
 186         case 2:
 187             if (src >= srcend) goto done;  /* ignore partial char */
 188             if ((ch = *src ^ 0x80) >= 0x40) goto bad;
 189             res = (res << 6) | ch;
 190             src++;
 191         case 1:
 192             if (src >= srcend) goto done;  /* ignore partial char */
 193             if ((ch = *src ^ 0x80) >= 0x40) goto bad;
 194             res = (res << 6) | ch;
 195             src++;
 196             if (res < utf8_minval[len]) goto bad;
 197             if (res >= 0x10000) goto bad;  /* FIXME: maybe we should do surrogates here */
 198             *dst = res;
 199             continue;
 200         }
 201     bad:
 202         if (flags & MB_ERR_INVALID_CHARS) return -2;  /* bad char */
 203         *dst = (WCHAR)'?';
 204     }
 205     if (src < srcend) return -1;  /* overflow */
 206 done:
 207     return dstlen - count;
 208 }
 209
 210
 211 int
 212 bstrtoutf8 ( BSTR src, char *dst, size_t dstlen )
 213 {
 214     size_t srclen, needed;
 215     int n;
 216
 217     srclen = src? SysStringLen (src): 0;
 218
 219     needed = srclen? (utf8_wcstombs (src, srclen, NULL, 0) + 1) : 1;
 220     if (!dst || !dstlen)
 221         return needed;
 222     if (dstlen < needed)
 223         return -1;
 224     if (srclen) {
 225         n = utf8_wcstombs (src, srclen, dst, dstlen);
 226         if (n < 0)
 227             return -1;
 228     }
 229     else
 230         n = 0;
 231     dst[n] = 0;
 232     return n;
 233 }
 234
 235
 236