archive_string.c (ae5876ea1970031e0c41afb2c85dda9dc2095197) | archive_string.c (a2e802b76bf44204b2ada3935e96cc46d8176c08) |
---|---|
1/*- 2 * Copyright (c) 2003-2011 Tim Kientzle 3 * Copyright (c) 2011-2012 Michihiro NAKAJIMA 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: --- 2284 unchanged lines hidden (view full) --- 2293 /* 2294 * Decode 1-4 bytes depending on the value of the first byte. 2295 */ 2296 ch = (unsigned char)*s; 2297 if (ch == 0) 2298 return (0); /* Standard: return 0 for end-of-string. */ 2299 cnt = utf8_count[ch]; 2300 | 1/*- 2 * Copyright (c) 2003-2011 Tim Kientzle 3 * Copyright (c) 2011-2012 Michihiro NAKAJIMA 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: --- 2284 unchanged lines hidden (view full) --- 2293 /* 2294 * Decode 1-4 bytes depending on the value of the first byte. 2295 */ 2296 ch = (unsigned char)*s; 2297 if (ch == 0) 2298 return (0); /* Standard: return 0 for end-of-string. */ 2299 cnt = utf8_count[ch]; 2300 |
2301 /* Invalide sequence or there are not plenty bytes. */ | 2301 /* Invalid sequence or there are not plenty bytes. */ |
2302 if ((int)n < cnt) { 2303 cnt = (int)n; 2304 for (i = 1; i < cnt; i++) { 2305 if ((s[i] & 0xc0) != 0x80) { 2306 cnt = i; 2307 break; 2308 } 2309 } --- 64 unchanged lines hidden (view full) --- 2374 if ((s[i] & 0xc0) != 0x80) { 2375 cnt = i; 2376 break; 2377 } 2378 } 2379 goto invalid_sequence; 2380 } 2381 | 2302 if ((int)n < cnt) { 2303 cnt = (int)n; 2304 for (i = 1; i < cnt; i++) { 2305 if ((s[i] & 0xc0) != 0x80) { 2306 cnt = i; 2307 break; 2308 } 2309 } --- 64 unchanged lines hidden (view full) --- 2374 if ((s[i] & 0xc0) != 0x80) { 2375 cnt = i; 2376 break; 2377 } 2378 } 2379 goto invalid_sequence; 2380 } 2381 |
2382 /* The code point larger than 0x10FFFF is not leagal | 2382 /* The code point larger than 0x10FFFF is not legal |
2383 * Unicode values. */ 2384 if (wc > UNICODE_MAX) 2385 goto invalid_sequence; 2386 /* Correctly gets a Unicode, returns used bytes. */ 2387 *pwc = wc; 2388 return (cnt); 2389invalid_sequence: 2390 *pwc = UNICODE_R_CHAR;/* set the Replacement Character instead. */ 2391 return (cnt * -1); 2392} 2393 2394static int 2395utf8_to_unicode(uint32_t *pwc, const char *s, size_t n) 2396{ 2397 int cnt; 2398 2399 cnt = _utf8_to_unicode(pwc, s, n); | 2383 * Unicode values. */ 2384 if (wc > UNICODE_MAX) 2385 goto invalid_sequence; 2386 /* Correctly gets a Unicode, returns used bytes. */ 2387 *pwc = wc; 2388 return (cnt); 2389invalid_sequence: 2390 *pwc = UNICODE_R_CHAR;/* set the Replacement Character instead. */ 2391 return (cnt * -1); 2392} 2393 2394static int 2395utf8_to_unicode(uint32_t *pwc, const char *s, size_t n) 2396{ 2397 int cnt; 2398 2399 cnt = _utf8_to_unicode(pwc, s, n); |
2400 /* Any of Surrogate pair is not leagal Unicode values. */ | 2400 /* Any of Surrogate pair is not legal Unicode values. */ |
2401 if (cnt == 3 && IS_SURROGATE_PAIR_LA(*pwc)) 2402 return (-3); 2403 return (cnt); 2404} 2405 2406static inline uint32_t 2407combine_surrogate_pair(uint32_t uc, uint32_t uc2) 2408{ --- 44 unchanged lines hidden (view full) --- 2453 if (cnt > 0) 2454 cnt *= -1; 2455 return (cnt); 2456} 2457 2458/* 2459 * Convert a Unicode code point to a single UTF-8 sequence. 2460 * | 2401 if (cnt == 3 && IS_SURROGATE_PAIR_LA(*pwc)) 2402 return (-3); 2403 return (cnt); 2404} 2405 2406static inline uint32_t 2407combine_surrogate_pair(uint32_t uc, uint32_t uc2) 2408{ --- 44 unchanged lines hidden (view full) --- 2453 if (cnt > 0) 2454 cnt *= -1; 2455 return (cnt); 2456} 2457 2458/* 2459 * Convert a Unicode code point to a single UTF-8 sequence. 2460 * |
2461 * NOTE:This function does not check if the Unicode is leagal or not. | 2461 * NOTE:This function does not check if the Unicode is legal or not. |
2462 * Please you definitely check it before calling this. 2463 */ 2464static size_t 2465unicode_to_utf8(char *p, size_t remaining, uint32_t uc) 2466{ 2467 char *_p = p; 2468 2469 /* Invalid Unicode char maps to Replacement character */ --- 79 unchanged lines hidden (view full) --- 2549 return (-2); 2550 } 2551 } 2552 2553 /* 2554 * Surrogate pair values(0xd800 through 0xdfff) are only 2555 * used by UTF-16, so, after the above calculation, the code 2556 * must not be surrogate values, and Unicode has no codes | 2462 * Please you definitely check it before calling this. 2463 */ 2464static size_t 2465unicode_to_utf8(char *p, size_t remaining, uint32_t uc) 2466{ 2467 char *_p = p; 2468 2469 /* Invalid Unicode char maps to Replacement character */ --- 79 unchanged lines hidden (view full) --- 2549 return (-2); 2550 } 2551 } 2552 2553 /* 2554 * Surrogate pair values(0xd800 through 0xdfff) are only 2555 * used by UTF-16, so, after the above calculation, the code 2556 * must not be surrogate values, and Unicode has no codes |
2557 * larger than 0x10ffff. Thus, those are not leagal Unicode | 2557 * larger than 0x10ffff. Thus, those are not legal Unicode |
2558 * values. 2559 */ 2560 if (IS_SURROGATE_PAIR_LA(uc) || uc > UNICODE_MAX) { 2561 /* Undescribed code point should be U+FFFD 2562 * (replacement character). */ 2563 *pwc = UNICODE_R_CHAR; 2564 return (((int)(utf16 - s)) * -1); 2565 } --- 1633 unchanged lines hidden --- | 2558 * values. 2559 */ 2560 if (IS_SURROGATE_PAIR_LA(uc) || uc > UNICODE_MAX) { 2561 /* Undescribed code point should be U+FFFD 2562 * (replacement character). */ 2563 *pwc = UNICODE_R_CHAR; 2564 return (((int)(utf16 - s)) * -1); 2565 } --- 1633 unchanged lines hidden --- |