From: Greg Hudson Date: Thu, 14 May 2009 16:16:32 +0000 (+0000) Subject: UCS2 support doesn't handle upper half of BMP X-Git-Tag: krb5-1.8-alpha1~476 X-Git-Url: http://git.tremily.us/?a=commitdiff_plain;h=ee699ef91ba36719e50ce9dc5d54dd3896740917;p=krb5.git UCS2 support doesn't handle upper half of BMP Make krb5_ucs2 an unsigned type. Eliminate the need for distinguished values for ucs2 and ucs4 characters by changing the API of the single-character conversion routines. ticket: 6489 tags: pullup target_version: 1.7 git-svn-id: svn://anonsvn.mit.edu/krb5/trunk@22350 dc483132-0cff-0310-8789-dd5450dbe970 --- diff --git a/src/include/k5-utf8.h b/src/include/k5-utf8.h index b5a394559..e3f134b56 100644 --- a/src/include/k5-utf8.h +++ b/src/include/k5-utf8.h @@ -84,9 +84,9 @@ #endif #if INT_MAX == 0x7fff -typedef int krb5_ucs2; +typedef unsigned int krb5_ucs2; #elif SHRT_MAX == 0x7fff -typedef short krb5_ucs2; +typedef unsigned short krb5_ucs2; #else #error undefined 16 bit type #endif @@ -101,15 +101,12 @@ typedef short krb5_ucs4; #error: undefined 32 bit type #endif -#define KRB5_UCS2_INVALID ((krb5_ucs2)0x8000) -#define KRB5_UCS4_INVALID ((krb5_ucs4)0x80000000) - #define KRB5_MAX_UTF8_LEN (sizeof(krb5_ucs2) * 3/2) -krb5_ucs2 krb5int_utf8_to_ucs2(const char *p); +int krb5int_utf8_to_ucs2(const char *p, krb5_ucs2 *out); size_t krb5int_ucs2_to_utf8(krb5_ucs2 c, char *buf); -krb5_ucs4 krb5int_utf8_to_ucs4(const char *p); +int krb5int_utf8_to_ucs4(const char *p, krb5_ucs4 *out); size_t krb5int_ucs4_to_utf8(krb5_ucs4 c, char *buf); int diff --git a/src/lib/krb5/unicode/ucstr.c b/src/lib/krb5/unicode/ucstr.c index e3c3c800e..ec2368820 100644 --- a/src/lib/krb5/unicode/ucstr.c +++ b/src/lib/krb5/unicode/ucstr.c @@ -397,8 +397,7 @@ krb5int_utf8_normcmp( /* convert and normalize 1st string */ for (i = 0, ulen = 0; i < l1; i += len, ulen++) { - ucs[ulen] = krb5int_utf8_to_ucs4(s1 + i); - if (ucs[ulen] == KRB5_UCS4_INVALID) { + if (krb5int_utf8_to_ucs4(s1 + i, &ucs[ulen]) == -1) { 
free(ucs); return -1; /* what to do??? */ } @@ -420,8 +419,7 @@ krb5int_utf8_normcmp( /* convert and normalize 2nd string */ for (i = 0, ulen = 0; i < l2; i += len, ulen++) { - ucs[ulen] = krb5int_utf8_to_ucs4(s2 + i); - if (ucs[ulen] == KRB5_UCS4_INVALID) { + if (krb5int_utf8_to_ucs4(s2 + i, &ucs[ulen]) == -1) { free(ucsout1); free(ucs); return 1; /* what to do??? */ diff --git a/src/util/support/utf8.c b/src/util/support/utf8.c index f0d764e4a..4468673dc 100644 --- a/src/util/support/utf8.c +++ b/src/util/support/utf8.c @@ -159,7 +159,11 @@ int krb5int_utf8_charlen2(const char *p) return i; } -krb5_ucs4 krb5int_utf8_to_ucs4(const char *p) +/* + * Convert a UTF8 character to a UCS4 character. Return 0 on success, + * -1 on failure. + */ +int krb5int_utf8_to_ucs4(const char *p, krb5_ucs4 *out) { const unsigned char *c = (const unsigned char *) p; krb5_ucs4 ch; @@ -167,33 +171,35 @@ krb5_ucs4 krb5int_utf8_to_ucs4(const char *p) static unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 }; + *out = 0; len = KRB5_UTF8_CHARLEN2(p, len); if (len == 0) - return KRB5_UCS4_INVALID; + return -1; ch = c[0] & mask[len]; for (i = 1; i < len; i++) { - if ((c[i] & 0xc0) != 0x80) { - return KRB5_UCS4_INVALID; - } + if ((c[i] & 0xc0) != 0x80) + return -1; ch <<= 6; ch |= c[i] & 0x3f; } - return ch; + *out = ch; + return 0; } -krb5_ucs2 krb5int_utf8_to_ucs2(const char *p) +int krb5int_utf8_to_ucs2(const char *p, krb5_ucs2 *out) { - krb5_ucs4 ch = krb5int_utf8_to_ucs4(p); - - if (ch == KRB5_UCS4_INVALID || ch > SHRT_MAX) - return KRB5_UCS2_INVALID; + krb5_ucs4 ch; - return (krb5_ucs2)ch; + *out = 0; + if (krb5int_utf8_to_ucs4(p, &ch) == -1 || ch > 0xFFFF) + return -1; + *out = (krb5_ucs2) ch; + return 0; } /* conv UCS-2 to UTF-8, not used */ @@ -446,10 +452,13 @@ int krb5int_utf8_isupper(const char * p) /* like strchr() */ char *krb5int_utf8_strchr(const char *str, const char *chr) { + krb5_ucs4 chs, ch; + + if (krb5int_utf8_to_ucs4(chr, &ch) == -1) + return NULL; for ( ; 
*str != '\0'; KRB5_UTF8_INCR(str)) { - if (krb5int_utf8_to_ucs4(str) == krb5int_utf8_to_ucs4(chr)) { + if (krb5int_utf8_to_ucs4(str, &chs) == 0 && chs == ch) return (char *)str; - } } return NULL; @@ -458,14 +467,14 @@ char *krb5int_utf8_strchr(const char *str, const char *chr) /* like strcspn() but returns number of bytes, not characters */ size_t krb5int_utf8_strcspn(const char *str, const char *set) { - const char *cstr; - const char *cset; + const char *cstr, *cset; + krb5_ucs4 chstr, chset; for (cstr = str; *cstr != '\0'; KRB5_UTF8_INCR(cstr)) { for (cset = set; *cset != '\0'; KRB5_UTF8_INCR(cset)) { - if (krb5int_utf8_to_ucs4(cstr) == krb5int_utf8_to_ucs4(cset)) { + if (krb5int_utf8_to_ucs4(cstr, &chstr) == 0 + && krb5int_utf8_to_ucs4(cset, &chset) == 0 && chstr == chset) return cstr - str; - } } } @@ -475,18 +484,16 @@ size_t krb5int_utf8_strcspn(const char *str, const char *set) /* like strspn() but returns number of bytes, not characters */ size_t krb5int_utf8_strspn(const char *str, const char *set) { - const char *cstr; - const char *cset; + const char *cstr, *cset; + krb5_ucs4 chstr, chset; for (cstr = str; *cstr != '\0'; KRB5_UTF8_INCR(cstr)) { for (cset = set; ; KRB5_UTF8_INCR(cset)) { - if (*cset == '\0') { + if (*cset == '\0') return cstr - str; - } - - if (krb5int_utf8_to_ucs4(cstr) == krb5int_utf8_to_ucs4(cset)) { + if (krb5int_utf8_to_ucs4(cstr, &chstr) == 0 + && krb5int_utf8_to_ucs4(cset, &chset) == 0 && chstr == chset) break; - } } } @@ -496,13 +503,14 @@ size_t krb5int_utf8_strspn(const char *str, const char *set) /* like strpbrk(), replaces strchr() as well */ char *krb5int_utf8_strpbrk(const char *str, const char *set) { - for ( ; *str != '\0'; KRB5_UTF8_INCR(str)) { - const char *cset; + const char *cset; + krb5_ucs4 chstr, chset; + for ( ; *str != '\0'; KRB5_UTF8_INCR(str)) { for (cset = set; *cset != '\0'; KRB5_UTF8_INCR(cset)) { - if (krb5int_utf8_to_ucs4(str) == krb5int_utf8_to_ucs4(cset)) { + if (krb5int_utf8_to_ucs4(str, &chstr) == 0 
+ && krb5int_utf8_to_ucs4(cset, &chset) == 0 && chstr == chset) return (char *)str; - } } }