/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2020 Robert Mustacchi
 */

#ifndef _UNICODE_H
#define	_UNICODE_H

/*
 * Common definitions for dealing with Unicode.
 *
 * UTF-16 encodes data as a series of two byte values. However, there are more
 * code points than fit in 16 bits. Code points that fit in the first 16 bits
 * are referred to as existing in the 'basic multilingual plane' (BMP). Those
 * outside of it are in the 'supplementary planes'. When such a code point is
 * encountered, it is encoded as a series of two uint16_t values, known as a
 * surrogate pair.
 *
 * Supplementary code points run from UNICODE_SUP_START (0x10000) through
 * UNICODE_SUP_MAX (0x10ffff). Subtracting UNICODE_SUP_START from one leaves a
 * value of at most 20 bits, which is encoded by splitting it into two 10-bit
 * values. The upper 10 bits are ORed with 0xd800 and the lower 10 bits are
 * ORed with 0xdc00.
 */

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Range of Unicode code points reserved for surrogate characters.
 */
#define	UNICODE_SUR_MIN		0xd800
#define	UNICODE_SUR_MAX		0xdfff

/*
 * Range of Unicode code points in supplementary planes.
 */
#define	UNICODE_SUP_START	0x10000
#define	UNICODE_SUP_MAX		0x10ffff

/*
 * Starting constants for surrogate pairs.
 */
#define	UNICODE_SUR_UPPER	0xd800
#define	UNICODE_SUR_LOWER	0xdc00

/*
 * Macros to extract the value from a surrogate pair and to take a code point
 * and transform it into the surrogate version.
 *
 * UNICODE_SUR_UVALUE() and UNICODE_SUR_LVALUE() take the upper and lower
 * surrogate respectively and yield that surrogate's 10 bits of payload, already
 * shifted into position, so the two results may be ORed together.
 *
 * UNICODE_SUR_UMASK() and UNICODE_SUR_LMASK() go the other way: each yields the
 * 10 bits of its argument that are to be ORed with UNICODE_SUR_UPPER or
 * UNICODE_SUR_LOWER respectively.
 *
 * None of these macros applies the UNICODE_SUP_START bias. The caller must add
 * it to the combined *VALUE() result to obtain a code point, and must subtract
 * it from a code point before handing it to the *MASK() macros.
 */
#define	UNICODE_SUR_UVALUE(x)	(((x) & 0x3ff) << 10)
#define	UNICODE_SUR_LVALUE(x)	((x) & 0x3ff)
#define	UNICODE_SUR_UMASK(x)	(((x) >> 10) & 0x3ff)
#define	UNICODE_SUR_LMASK(x)	((x) & 0x3ff)

#ifdef __cplusplus
}
#endif

#endif /* _UNICODE_H */