archive_string.c (ae5876ea1970031e0c41afb2c85dda9dc2095197) archive_string.c (a2e802b76bf44204b2ada3935e96cc46d8176c08)
1/*-
2 * Copyright (c) 2003-2011 Tim Kientzle
3 * Copyright (c) 2011-2012 Michihiro NAKAJIMA
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:

--- 2284 unchanged lines hidden (view full) ---

2293 /*
2294 * Decode 1-4 bytes depending on the value of the first byte.
2295 */
2296 ch = (unsigned char)*s;
2297 if (ch == 0)
2298 return (0); /* Standard: return 0 for end-of-string. */
2299 cnt = utf8_count[ch];
2300
1/*-
2 * Copyright (c) 2003-2011 Tim Kientzle
3 * Copyright (c) 2011-2012 Michihiro NAKAJIMA
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:

--- 2284 unchanged lines hidden (view full) ---

2293 /*
2294 * Decode 1-4 bytes depending on the value of the first byte.
2295 */
2296 ch = (unsigned char)*s;
2297 if (ch == 0)
2298 return (0); /* Standard: return 0 for end-of-string. */
2299 cnt = utf8_count[ch];
2300
2301 /* Invalide sequence or there are not plenty bytes. */
2301 /* Invalid sequence or there are not plenty bytes. */
2302 if ((int)n < cnt) {
2303 cnt = (int)n;
2304 for (i = 1; i < cnt; i++) {
2305 if ((s[i] & 0xc0) != 0x80) {
2306 cnt = i;
2307 break;
2308 }
2309 }

--- 64 unchanged lines hidden (view full) ---

2374 if ((s[i] & 0xc0) != 0x80) {
2375 cnt = i;
2376 break;
2377 }
2378 }
2379 goto invalid_sequence;
2380 }
2381
2302 if ((int)n < cnt) {
2303 cnt = (int)n;
2304 for (i = 1; i < cnt; i++) {
2305 if ((s[i] & 0xc0) != 0x80) {
2306 cnt = i;
2307 break;
2308 }
2309 }

--- 64 unchanged lines hidden (view full) ---

2374 if ((s[i] & 0xc0) != 0x80) {
2375 cnt = i;
2376 break;
2377 }
2378 }
2379 goto invalid_sequence;
2380 }
2381
2382 /* The code point larger than 0x10FFFF is not leagal
2382 /* The code point larger than 0x10FFFF is not legal
2383 * Unicode values. */
2384 if (wc > UNICODE_MAX)
2385 goto invalid_sequence;
2386 /* Correctly gets a Unicode, returns used bytes. */
2387 *pwc = wc;
2388 return (cnt);
2389invalid_sequence:
2390 *pwc = UNICODE_R_CHAR;/* set the Replacement Character instead. */
2391 return (cnt * -1);
2392}
2393
2394static int
2395utf8_to_unicode(uint32_t *pwc, const char *s, size_t n)
2396{
2397 int cnt;
2398
2399 cnt = _utf8_to_unicode(pwc, s, n);
2383 * Unicode values. */
2384 if (wc > UNICODE_MAX)
2385 goto invalid_sequence;
2386 /* Correctly gets a Unicode, returns used bytes. */
2387 *pwc = wc;
2388 return (cnt);
2389invalid_sequence:
2390 *pwc = UNICODE_R_CHAR;/* set the Replacement Character instead. */
2391 return (cnt * -1);
2392}
2393
2394static int
2395utf8_to_unicode(uint32_t *pwc, const char *s, size_t n)
2396{
2397 int cnt;
2398
2399 cnt = _utf8_to_unicode(pwc, s, n);
2400 /* Any of Surrogate pair is not leagal Unicode values. */
2400 /* Any of Surrogate pair is not legal Unicode values. */
2401 if (cnt == 3 && IS_SURROGATE_PAIR_LA(*pwc))
2402 return (-3);
2403 return (cnt);
2404}
2405
2406static inline uint32_t
2407combine_surrogate_pair(uint32_t uc, uint32_t uc2)
2408{

--- 44 unchanged lines hidden (view full) ---

2453 if (cnt > 0)
2454 cnt *= -1;
2455 return (cnt);
2456}
2457
2458/*
2459 * Convert a Unicode code point to a single UTF-8 sequence.
2460 *
2401 if (cnt == 3 && IS_SURROGATE_PAIR_LA(*pwc))
2402 return (-3);
2403 return (cnt);
2404}
2405
2406static inline uint32_t
2407combine_surrogate_pair(uint32_t uc, uint32_t uc2)
2408{

--- 44 unchanged lines hidden (view full) ---

2453 if (cnt > 0)
2454 cnt *= -1;
2455 return (cnt);
2456}
2457
2458/*
2459 * Convert a Unicode code point to a single UTF-8 sequence.
2460 *
2461 * NOTE:This function does not check if the Unicode is leagal or not.
2461 * NOTE:This function does not check if the Unicode is legal or not.
2462 * Be sure to check it yourself before calling this.
2463 */
2464static size_t
2465unicode_to_utf8(char *p, size_t remaining, uint32_t uc)
2466{
2467 char *_p = p;
2468
2469 /* Invalid Unicode char maps to Replacement character */

--- 79 unchanged lines hidden (view full) ---

2549 return (-2);
2550 }
2551 }
2552
2553 /*
2554 * Surrogate pair values(0xd800 through 0xdfff) are only
2555 * used by UTF-16, so, after above calculation, the code
2556 * must not be surrogate values, and Unicode has no codes
2462 * Be sure to check it yourself before calling this.
2463 */
2464static size_t
2465unicode_to_utf8(char *p, size_t remaining, uint32_t uc)
2466{
2467 char *_p = p;
2468
2469 /* Invalid Unicode char maps to Replacement character */

--- 79 unchanged lines hidden (view full) ---

2549 return (-2);
2550 }
2551 }
2552
2553 /*
2554 * Surrogate pair values(0xd800 through 0xdfff) are only
2555 * used by UTF-16, so, after above calculation, the code
2556 * must not be surrogate values, and Unicode has no codes
2557 * larger than 0x10ffff. Thus, those are not leagal Unicode
2557 * larger than 0x10ffff. Thus, those are not legal Unicode
2558 * values.
2559 */
2560 if (IS_SURROGATE_PAIR_LA(uc) || uc > UNICODE_MAX) {
2561 /* Undescribed code point should be U+FFFD
2562 * (replacement character). */
2563 *pwc = UNICODE_R_CHAR;
2564 return (((int)(utf16 - s)) * -1);
2565 }

--- 1633 unchanged lines hidden ---
2558 * values.
2559 */
2560 if (IS_SURROGATE_PAIR_LA(uc) || uc > UNICODE_MAX) {
2561 /* Undescribed code point should be U+FFFD
2562 * (replacement character). */
2563 *pwc = UNICODE_R_CHAR;
2564 return (((int)(utf16 - s)) * -1);
2565 }

--- 1633 unchanged lines hidden ---