1 /* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil -*- */ 2 /* util/support/utf8.c */ 3 /* 4 * Copyright 2008 by the Massachusetts Institute of Technology. 5 * All Rights Reserved. 6 * 7 * Export of this software from the United States of America may 8 * require a specific license from the United States Government. 9 * It is the responsibility of any person or organization contemplating 10 * export to obtain such a license before exporting. 11 * 12 * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and 13 * distribute this software and its documentation for any purpose and 14 * without fee is hereby granted, provided that the above copyright 15 * notice appear in all copies and that both that copyright notice and 16 * this permission notice appear in supporting documentation, and that 17 * the name of M.I.T. not be used in advertising or publicity pertaining 18 * to distribution of the software without specific, written prior 19 * permission. Furthermore if you modify this software you must label 20 * your software as modified software and not distribute it in such a 21 * fashion that it might be confused with the original M.I.T. software. 22 * M.I.T. makes no representations about the suitability of 23 * this software for any purpose. It is provided "as is" without express 24 * or implied warranty. 25 */ 26 /* 27 * Copyright 1998-2008 The OpenLDAP Foundation. 28 * All rights reserved. 29 * 30 * Redistribution and use in source and binary forms, with or without 31 * modification, are permitted only as authorized by the OpenLDAP 32 * Public License. 33 * 34 * A copy of this license is available in the file LICENSE in the 35 * top-level directory of the distribution or, alternatively, at 36 * <https://www.OpenLDAP.org/license.html>. 37 */ 38 39 /* This work is part of OpenLDAP Software <https://www.openldap.org/>. */ 40 41 /* Basic UTF-8 routines 42 * 43 * These routines are "dumb". Though they understand UTF-8, 44 * they don't grok Unicode. That is, they can push bits, 45 * but don't have a clue what the bits represent. That's 46 * good enough for use with the KRB5 Client SDK. 47 * 48 * These routines are not optimized. 49 */ 50 51 #include "k5-platform.h" 52 #include "k5-utf8.h" 53 #include "supp-int.h" 54 55 /* 56 * Returns length indicated by first byte. 57 */ 58 const char krb5int_utf8_lentab[] = { 59 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 61 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 62 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 63 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 64 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 65 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 66 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; 67 68 /* 69 * Make sure the UTF-8 char used the shortest possible encoding 70 * returns charlen if valid, 0 if not. 71 * 72 * Here are the valid UTF-8 encodings, taken from RFC 3629 page 4. 73 * The table is slightly modified from that of the RFC. 74 * 75 * UCS-4 range (hex) UTF-8 sequence (binary) 76 * 0000 0000-0000 007F 0....... 77 * 0000 0080-0000 07FF 110++++. 10...... 78 * 0000 0800-0000 FFFF 1110++++ 10+..... 10...... 79 * 0001 0000-0010 FFFF 11110+++ 10++.... 10...... 10...... 80 * 81 * The '.' bits are "don't cares". When validating a UTF-8 sequence, 82 * at least one of the '+' bits must be set, otherwise the character 83 * should have been encoded in fewer octets. Note that in the two-octet 84 * case, only the first octet needs to be validated, and this is done 85 * in the krb5int_utf8_lentab[] above. 86 */ 87 88 /* mask of required bits in second octet */ 89 #undef c 90 #define c const char 91 c krb5int_utf8_mintab[] = { 92 (c)0x20, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, 93 (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, 94 (c)0x30, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x00, (c)0x00, (c)0x00, 95 (c)0x00, (c)0x00, (c)0x00, (c)0x00, (c)0x00, (c)0x00, (c)0x00, (c)0x00 }; 96 #undef c 97 98 /* 99 * Convert a UTF8 character to a UCS4 character. Return 0 on success, 100 * -1 on failure. 101 */ 102 int krb5int_utf8_to_ucs4(const char *p, krb5_ucs4 *out) 103 { 104 const unsigned char *c = (const unsigned char *) p; 105 krb5_ucs4 ch; 106 int len, i; 107 static unsigned char mask[] = { 108 0, 0x7f, 0x1f, 0x0f, 0x07 }; 109 110 *out = 0; 111 len = KRB5_UTF8_CHARLEN2(p, len); 112 113 if (len == 0) 114 return -1; 115 116 ch = c[0] & mask[len]; 117 118 for (i = 1; i < len; i++) { 119 if ((c[i] & 0xc0) != 0x80) 120 return -1; 121 122 ch <<= 6; 123 ch |= c[i] & 0x3f; 124 } 125 126 if (ch > 0x10ffff) 127 return -1; 128 129 *out = ch; 130 return 0; 131 } 132 133 /* conv UCS-4 to UTF-8 */ 134 size_t krb5int_ucs4_to_utf8(krb5_ucs4 c, char *buf) 135 { 136 size_t len = 0; 137 unsigned char *p = (unsigned char *) buf; 138 139 /* not a valid Unicode character */ 140 if (c > 0x10ffff) 141 return 0; 142 143 /* Just return length, don't convert */ 144 if (buf == NULL) { 145 if (c < 0x80) return 1; 146 else if (c < 0x800) return 2; 147 else if (c < 0x10000) return 3; 148 else return 4; 149 } 150 151 if (c < 0x80) { 152 p[len++] = c; 153 } else if (c < 0x800) { 154 p[len++] = 0xc0 | ( c >> 6 ); 155 p[len++] = 0x80 | ( c & 0x3f ); 156 } else if (c < 0x10000) { 157 p[len++] = 0xe0 | ( c >> 12 ); 158 p[len++] = 0x80 | ( (c >> 6) & 0x3f ); 159 p[len++] = 0x80 | ( c & 0x3f ); 160 } else /* if (c < 0x110000) */ { 161 p[len++] = 0xf0 | ( c >> 18 ); 162 p[len++] = 0x80 | ( (c >> 12) & 0x3f ); 163 p[len++] = 0x80 | ( (c >> 6) & 0x3f ); 164 p[len++] = 0x80 | ( c & 0x3f ); 165 } 166 167 return len; 168 } 169