2 * UTF-8 support routines
4 * Copyright 2000 Alexandre Julliard
6 * Taken from WINE, so the usual WINE copyright applies:
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
21 IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
/*
 * Number of continuation bytes that follow a given first byte of a UTF-8
 * sequence.  The table only covers bytes above 0x7f and is indexed by
 * (byte - 0x80).  Entries for 0x80-0xbf and 0xfe-0xff are 0: those bytes
 * do not start a multi-byte sequence.
 */
static const char utf8_length[128] =
{
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80-0x8f */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x90-0x9f */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xa0-0xaf */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xb0-0xbf */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xc0-0xcf */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xd0-0xdf */
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xe0-0xef */
    3,3,3,3,3,3,3,3,4,4,4,4,5,5,0,0  /* 0xf0-0xff */
};
/*
 * Mask that extracts the payload bits from the first byte of a UTF-8
 * sequence, indexed by the number of continuation bytes that follow it
 * (0 to 5).
 */
static const unsigned char utf8_mask[6] =
{
    0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01
};
/*
 * Smallest Unicode value that may legitimately be encoded with a given
 * number of continuation bytes (index 0 to 5).  A decoded value below this
 * bound means the sequence was longer than necessary (overlong encoding).
 */
static const unsigned int utf8_minval[6] =
{
    0x0, 0x80, 0x800, 0x10000, 0x200000, 0x4000000
};
52 /* query necessary dst length for src string */
53 inline static int get_length_wcs_utf8( const WCHAR *src, unsigned int srclen )
57 for (len = 0; srclen; srclen--, src++, len++)
62 if (*src >= 0x800) len++;
68 /* wide char to UTF-8 string conversion */
69 /* return -1 on dst buffer overflow */
70 int utf8_wcstombs( const WCHAR *src, int srclen, char *dst, int dstlen )
74 if (!dstlen) return get_length_wcs_utf8( src, srclen );
76 for (; srclen; srclen--, src++)
80 if (ch < 0x80) /* 0x00-0x7f: 1 byte */
82 if (!dstlen--) return -1; /* overflow */
87 if (ch < 0x800) /* 0x80-0x7ff: 2 bytes */
89 if ((dstlen -= 2) < 0) return -1; /* overflow */
90 dst[1] = 0x80 | (ch & 0x3f);
97 /* 0x800-0xffff: 3 bytes */
99 if ((dstlen -= 3) < 0) return -1; /* overflow */
100 dst[2] = 0x80 | (ch & 0x3f);
102 dst[1] = 0x80 | (ch & 0x3f);
107 return dst - orig_dst;
110 /* query necessary dst length for src string */
111 inline static int get_length_mbs_utf8( const unsigned char *src, int srclen )
114 const unsigned char *srcend = src + srclen;
116 for (ret = 0; src < srcend; ret++)
118 unsigned char ch = *src++;
119 if (ch < 0xc0) continue;
121 switch(utf8_length[ch-0x80])
124 if (src >= srcend) return ret; /* ignore partial char */
125 if ((ch = *src ^ 0x80) >= 0x40) continue;
128 if (src >= srcend) return ret; /* ignore partial char */
129 if ((ch = *src ^ 0x80) >= 0x40) continue;
132 if (src >= srcend) return ret; /* ignore partial char */
133 if ((ch = *src ^ 0x80) >= 0x40) continue;
136 if (src >= srcend) return ret; /* ignore partial char */
137 if ((ch = *src ^ 0x80) >= 0x40) continue;
140 if (src >= srcend) return ret; /* ignore partial char */
141 if ((ch = *src ^ 0x80) >= 0x40) continue;
148 /* UTF-8 to wide char string conversion */
149 /* return -1 on dst buffer overflow, -2 on invalid input char */
150 int utf8_mbstowcs( int flags, const char *src, int srclen, WCHAR *dst, int dstlen )
154 const char *srcend = src + srclen;
156 if (!dstlen) return get_length_mbs_utf8( src, srclen );
158 for (count = dstlen; count && (src < srcend); count--, dst++)
160 unsigned char ch = *src++;
161 if (ch < 0x80) /* special fast case for 7-bit ASCII */
166 len = utf8_length[ch-0x80];
167 res = ch & utf8_mask[len];
172 if (src >= srcend) goto done; /* ignore partial char */
173 if ((ch = *src ^ 0x80) >= 0x40) goto bad;
174 res = (res << 6) | ch;
177 if (src >= srcend) goto done; /* ignore partial char */
178 if ((ch = *src ^ 0x80) >= 0x40) goto bad;
179 res = (res << 6) | ch;
182 if (src >= srcend) goto done; /* ignore partial char */
183 if ((ch = *src ^ 0x80) >= 0x40) goto bad;
184 res = (res << 6) | ch;
187 if (src >= srcend) goto done; /* ignore partial char */
188 if ((ch = *src ^ 0x80) >= 0x40) goto bad;
189 res = (res << 6) | ch;
192 if (src >= srcend) goto done; /* ignore partial char */
193 if ((ch = *src ^ 0x80) >= 0x40) goto bad;
194 res = (res << 6) | ch;
196 if (res < utf8_minval[len]) goto bad;
197 if (res >= 0x10000) goto bad; /* FIXME: maybe we should do surrogates here */
202 if (flags & MB_ERR_INVALID_CHARS) return -2; /* bad char */
205 if (src < srcend) return -1; /* overflow */
207 return dstlen - count;
212 bstrtoutf8 ( BSTR src, char *dst, size_t dstlen )
214 size_t srclen, needed;
217 srclen = src? SysStringLen (src): 0;
219 needed = srclen? (utf8_wcstombs (src, srclen, NULL, 0) + 1) : 1;
225 n = utf8_wcstombs (src, srclen, dst, dstlen);