C++-ification of src/lib/util/unicode.[cpp|h]

This commit is contained in:
Nathan Woods 2016-07-02 10:36:39 -04:00
parent 32a38d3f78
commit facb76a669
2 changed files with 81 additions and 81 deletions

View File

@ -11,21 +11,21 @@
#include "unicode.h" #include "unicode.h"
/*------------------------------------------------- //-------------------------------------------------
uchar_isvalid - return true if a given // uchar_isvalid - return true if a given
character is a legitimate unicode character // character is a legitimate unicode character
-------------------------------------------------*/ //-------------------------------------------------
int uchar_isvalid(unicode_char uchar) bool uchar_isvalid(unicode_char uchar)
{ {
return (uchar < 0x110000) && !((uchar >= 0xd800) && (uchar <= 0xdfff)); return (uchar < 0x110000) && !((uchar >= 0xd800) && (uchar <= 0xdfff));
} }
/*------------------------------------------------- //-------------------------------------------------
uchar_from_utf8 - convert a UTF-8 sequence // uchar_from_utf8 - convert a UTF-8 sequence
into a unicode character // into a unicode character
-------------------------------------------------*/ //-------------------------------------------------
int uchar_from_utf8(unicode_char *uchar, const char *utf8char, size_t count) int uchar_from_utf8(unicode_char *uchar, const char *utf8char, size_t count)
{ {
@ -33,74 +33,74 @@ int uchar_from_utf8(unicode_char *uchar, const char *utf8char, size_t count)
int auxlen, i; int auxlen, i;
char auxchar; char auxchar;
/* validate parameters */ // validate parameters
if (utf8char == nullptr || count == 0) if (utf8char == nullptr || count == 0)
return 0; return 0;
/* start with the first byte */ // start with the first byte
c = (unsigned char) *utf8char; c = (unsigned char) *utf8char;
count--; count--;
utf8char++; utf8char++;
/* based on that, determine how many additional bytes we need */ // based on that, determine how many additional bytes we need
if (c < 0x80) if (c < 0x80)
{ {
/* unicode char 0x00000000 - 0x0000007F */ // unicode char 0x00000000 - 0x0000007F
c &= 0x7f; c &= 0x7f;
auxlen = 0; auxlen = 0;
minchar = 0x00000000; minchar = 0x00000000;
} }
else if (c >= 0xc0 && c < 0xe0) else if (c >= 0xc0 && c < 0xe0)
{ {
/* unicode char 0x00000080 - 0x000007FF */ // unicode char 0x00000080 - 0x000007FF
c &= 0x1f; c &= 0x1f;
auxlen = 1; auxlen = 1;
minchar = 0x00000080; minchar = 0x00000080;
} }
else if (c >= 0xe0 && c < 0xf0) else if (c >= 0xe0 && c < 0xf0)
{ {
/* unicode char 0x00000800 - 0x0000FFFF */ // unicode char 0x00000800 - 0x0000FFFF
c &= 0x0f; c &= 0x0f;
auxlen = 2; auxlen = 2;
minchar = 0x00000800; minchar = 0x00000800;
} }
else if (c >= 0xf0 && c < 0xf8) else if (c >= 0xf0 && c < 0xf8)
{ {
/* unicode char 0x00010000 - 0x001FFFFF */ // unicode char 0x00010000 - 0x001FFFFF
c &= 0x07; c &= 0x07;
auxlen = 3; auxlen = 3;
minchar = 0x00010000; minchar = 0x00010000;
} }
else if (c >= 0xf8 && c < 0xfc) else if (c >= 0xf8 && c < 0xfc)
{ {
/* unicode char 0x00200000 - 0x03FFFFFF */ // unicode char 0x00200000 - 0x03FFFFFF
c &= 0x03; c &= 0x03;
auxlen = 4; auxlen = 4;
minchar = 0x00200000; minchar = 0x00200000;
} }
else if (c >= 0xfc && c < 0xfe) else if (c >= 0xfc && c < 0xfe)
{ {
/* unicode char 0x04000000 - 0x7FFFFFFF */ // unicode char 0x04000000 - 0x7FFFFFFF
c &= 0x01; c &= 0x01;
auxlen = 5; auxlen = 5;
minchar = 0x04000000; minchar = 0x04000000;
} }
else else
{ {
/* invalid */ // invalid
return -1; return -1;
} }
/* exceeds the count? */ // exceeds the count?
if (auxlen > count) if (auxlen > count)
return -1; return -1;
/* we now know how long the char is, now compute it */ // we now know how long the char is, now compute it
for (i = 0; i < auxlen; i++) for (i = 0; i < auxlen; i++)
{ {
auxchar = utf8char[i]; auxchar = utf8char[i];
/* all auxiliary chars must be between 0x80-0xbf */ // all auxiliary chars must be between 0x80-0xbf
if ((auxchar & 0xc0) != 0x80) if ((auxchar & 0xc0) != 0x80)
return -1; return -1;
@ -108,7 +108,7 @@ int uchar_from_utf8(unicode_char *uchar, const char *utf8char, size_t count)
c |= auxchar & 0x3f; c |= auxchar & 0x3f;
} }
/* make sure that this char is above the minimum */ // make sure that this char is above the minimum
if (c < minchar) if (c < minchar)
return -1; return -1;
@ -117,20 +117,20 @@ int uchar_from_utf8(unicode_char *uchar, const char *utf8char, size_t count)
} }
/*------------------------------------------------- //-------------------------------------------------
uchar_from_utf16 - convert a UTF-16 sequence // uchar_from_utf16 - convert a UTF-16 sequence
into a unicode character // into a unicode character
-------------------------------------------------*/ //-------------------------------------------------
int uchar_from_utf16(unicode_char *uchar, const utf16_char *utf16char, size_t count) int uchar_from_utf16(unicode_char *uchar, const utf16_char *utf16char, size_t count)
{ {
int rc = -1; int rc = -1;
/* validate parameters */ // validate parameters
if (utf16char == nullptr || count == 0) if (utf16char == nullptr || count == 0)
return 0; return 0;
/* handle the two-word (surrogate pair) case */ // handle the two-word (surrogate pair) case
if (utf16char[0] >= 0xd800 && utf16char[0] <= 0xdbff) if (utf16char[0] >= 0xd800 && utf16char[0] <= 0xdbff)
{ {
if (count > 1 && utf16char[1] >= 0xdc00 && utf16char[1] <= 0xdfff) if (count > 1 && utf16char[1] >= 0xdc00 && utf16char[1] <= 0xdfff)
@ -140,7 +140,7 @@ int uchar_from_utf16(unicode_char *uchar, const utf16_char *utf16char, size_t co
} }
} }
/* handle the one-word case */ // handle the one-word case
else if (utf16char[0] < 0xdc00 || utf16char[0] > 0xdfff) else if (utf16char[0] < 0xdc00 || utf16char[0] > 0xdfff)
{ {
*uchar = utf16char[0]; *uchar = utf16char[0];
@ -151,11 +151,11 @@ int uchar_from_utf16(unicode_char *uchar, const utf16_char *utf16char, size_t co
} }
/*------------------------------------------------- //-------------------------------------------------
uchar_from_utf16f - convert a UTF-16 sequence // uchar_from_utf16f - convert a UTF-16 sequence
into a unicode character from a flipped // into a unicode character from a flipped
byte order // byte order
-------------------------------------------------*/ //-------------------------------------------------
int uchar_from_utf16f(unicode_char *uchar, const utf16_char *utf16char, size_t count) int uchar_from_utf16f(unicode_char *uchar, const utf16_char *utf16char, size_t count)
{ {
@ -168,30 +168,30 @@ int uchar_from_utf16f(unicode_char *uchar, const utf16_char *utf16char, size_t c
} }
/*------------------------------------------------- //-------------------------------------------------
utf8_from_uchar - convert a unicode character // utf8_from_uchar - convert a unicode character
into a UTF-8 sequence // into a UTF-8 sequence
-------------------------------------------------*/ //-------------------------------------------------
int utf8_from_uchar(char *utf8string, size_t count, unicode_char uchar) int utf8_from_uchar(char *utf8string, size_t count, unicode_char uchar)
{ {
int rc = 0; int rc = 0;
/* error on invalid characters */ // error on invalid characters
if (!uchar_isvalid(uchar)) if (!uchar_isvalid(uchar))
return -1; return -1;
/* based on the value, output the appropriate number of bytes */ // based on the value, output the appropriate number of bytes
if (uchar < 0x80) if (uchar < 0x80)
{ {
/* unicode char 0x00000000 - 0x0000007F */ // unicode char 0x00000000 - 0x0000007F
if (count < 1) if (count < 1)
return -1; return -1;
utf8string[rc++] = (char) uchar; utf8string[rc++] = (char) uchar;
} }
else if (uchar < 0x800) else if (uchar < 0x800)
{ {
/* unicode char 0x00000080 - 0x000007FF */ // unicode char 0x00000080 - 0x000007FF
if (count < 2) if (count < 2)
return -1; return -1;
utf8string[rc++] = ((char) (uchar >> 6)) | 0xC0; utf8string[rc++] = ((char) (uchar >> 6)) | 0xC0;
@ -199,7 +199,7 @@ int utf8_from_uchar(char *utf8string, size_t count, unicode_char uchar)
} }
else if (uchar < 0x10000) else if (uchar < 0x10000)
{ {
/* unicode char 0x00000800 - 0x0000FFFF */ // unicode char 0x00000800 - 0x0000FFFF
if (count < 3) if (count < 3)
return -1; return -1;
utf8string[rc++] = ((char) (uchar >> 12)) | 0xE0; utf8string[rc++] = ((char) (uchar >> 12)) | 0xE0;
@ -208,7 +208,7 @@ int utf8_from_uchar(char *utf8string, size_t count, unicode_char uchar)
} }
else if (uchar < 0x00200000) else if (uchar < 0x00200000)
{ {
/* unicode char 0x00010000 - 0x001FFFFF */ // unicode char 0x00010000 - 0x001FFFFF
if (count < 4) if (count < 4)
return -1; return -1;
utf8string[rc++] = ((char) (uchar >> 18)) | 0xF0; utf8string[rc++] = ((char) (uchar >> 18)) | 0xF0;
@ -218,7 +218,7 @@ int utf8_from_uchar(char *utf8string, size_t count, unicode_char uchar)
} }
else if (uchar < 0x04000000) else if (uchar < 0x04000000)
{ {
/* unicode char 0x00200000 - 0x03FFFFFF */ // unicode char 0x00200000 - 0x03FFFFFF
if (count < 5) if (count < 5)
return -1; return -1;
utf8string[rc++] = ((char) (uchar >> 24)) | 0xF8; utf8string[rc++] = ((char) (uchar >> 24)) | 0xF8;
@ -229,7 +229,7 @@ int utf8_from_uchar(char *utf8string, size_t count, unicode_char uchar)
} }
else if (uchar < 0x80000000) else if (uchar < 0x80000000)
{ {
/* unicode char 0x04000000 - 0x7FFFFFFF */ // unicode char 0x04000000 - 0x7FFFFFFF
if (count < 6) if (count < 6)
return -1; return -1;
utf8string[rc++] = ((char) (uchar >> 30)) | 0xFC; utf8string[rc++] = ((char) (uchar >> 30)) | 0xFC;
@ -246,20 +246,20 @@ int utf8_from_uchar(char *utf8string, size_t count, unicode_char uchar)
} }
/*------------------------------------------------- //-------------------------------------------------
utf16_from_uchar - convert a unicode character // utf16_from_uchar - convert a unicode character
into a UTF-16 sequence // into a UTF-16 sequence
-------------------------------------------------*/ //-------------------------------------------------
int utf16_from_uchar(utf16_char *utf16string, size_t count, unicode_char uchar) int utf16_from_uchar(utf16_char *utf16string, size_t count, unicode_char uchar)
{ {
int rc; int rc;
/* error on invalid characters */ // error on invalid characters
if (!uchar_isvalid(uchar)) if (!uchar_isvalid(uchar))
return -1; return -1;
/* single word case */ // single word case
if (uchar < 0x10000) if (uchar < 0x10000)
{ {
if (count < 1) if (count < 1)
@ -268,7 +268,7 @@ int utf16_from_uchar(utf16_char *utf16string, size_t count, unicode_char uchar)
rc = 1; rc = 1;
} }
/* double word case */ // double word case
else if (uchar < 0x100000) else if (uchar < 0x100000)
{ {
if (count < 2) if (count < 2)
@ -283,10 +283,10 @@ int utf16_from_uchar(utf16_char *utf16string, size_t count, unicode_char uchar)
} }
/*------------------------------------------------- //-------------------------------------------------
utf16f_from_uchar - convert a unicode character // utf16f_from_uchar - convert a unicode character
into a UTF-16 sequence with flipped endianness // into a UTF-16 sequence with flipped endianness
-------------------------------------------------*/ //-------------------------------------------------
int utf16f_from_uchar(utf16_char *utf16string, size_t count, unicode_char uchar) int utf16f_from_uchar(utf16_char *utf16string, size_t count, unicode_char uchar)
{ {
@ -303,10 +303,10 @@ int utf16f_from_uchar(utf16_char *utf16string, size_t count, unicode_char uchar)
} }
/*------------------------------------------------- //-------------------------------------------------
utf8_previous_char - return a pointer to the // utf8_previous_char - return a pointer to the
previous character in a string // previous character in a string
-------------------------------------------------*/ //-------------------------------------------------
/** /**
* @fn const char *utf8_previous_char(const char *utf8string) * @fn const char *utf8_previous_char(const char *utf8string)
@ -326,11 +326,11 @@ const char *utf8_previous_char(const char *utf8string)
} }
/*------------------------------------------------- //-------------------------------------------------
utf8_is_valid_string - return true if the // utf8_is_valid_string - return true if the
given string is a properly formed sequence of // given string is a properly formed sequence of
UTF-8 characters // UTF-8 characters
-------------------------------------------------*/ //-------------------------------------------------
/** /**
* @fn int utf8_is_valid_string(const char *utf8string) * @fn bool utf8_is_valid_string(const char *utf8string)
@ -342,7 +342,7 @@ const char *utf8_previous_char(const char *utf8string)
* @return An int. * @return true if the string is a properly formed UTF-8 sequence, false otherwise.
*/ */
int utf8_is_valid_string(const char *utf8string) bool utf8_is_valid_string(const char *utf8string)
{ {
int remaining_length = strlen(utf8string); int remaining_length = strlen(utf8string);
@ -351,15 +351,15 @@ int utf8_is_valid_string(const char *utf8string)
unicode_char uchar = 0; unicode_char uchar = 0;
int charlen; int charlen;
/* extract the current character and verify it */ // extract the current character and verify it
charlen = uchar_from_utf8(&uchar, utf8string, remaining_length); charlen = uchar_from_utf8(&uchar, utf8string, remaining_length);
if (charlen <= 0 || uchar == 0 || !uchar_isvalid(uchar)) if (charlen <= 0 || uchar == 0 || !uchar_isvalid(uchar))
return FALSE; return false;
/* advance */ // advance
utf8string += charlen; utf8string += charlen;
remaining_length -= charlen; remaining_length -= charlen;
} }
return TRUE; return true;
} }

View File

@ -29,12 +29,12 @@
CONSTANTS CONSTANTS
***************************************************************************/ ***************************************************************************/
/* these defines specify the maximum size of different types of Unicode // these defines specify the maximum size of different types of Unicode
* character encodings */ // character encodings
#define UTF8_CHAR_MAX 6 #define UTF8_CHAR_MAX 6
#define UTF16_CHAR_MAX 2 #define UTF16_CHAR_MAX 2
/* these are UTF-8 encoded strings for common characters */ // these are UTF-8 encoded strings for common characters
#define UTF8_NBSP "\xc2\xa0" /* non-breaking space */ #define UTF8_NBSP "\xc2\xa0" /* non-breaking space */
#define UTF8_MULTIPLY "\xc3\x97" /* multiplication sign */ #define UTF8_MULTIPLY "\xc3\x97" /* multiplication sign */
@ -87,22 +87,22 @@ typedef UINT32 unicode_char;
FUNCTION PROTOTYPES FUNCTION PROTOTYPES
***************************************************************************/ ***************************************************************************/
/* tests to see if a unicode char is a valid code point */ // tests to see if a unicode char is a valid code point
int uchar_isvalid(unicode_char uchar); bool uchar_isvalid(unicode_char uchar);
/* converting strings to 32-bit Unicode chars */ // converting strings to 32-bit Unicode chars
int uchar_from_utf8(unicode_char *uchar, const char *utf8char, size_t count); int uchar_from_utf8(unicode_char *uchar, const char *utf8char, size_t count);
int uchar_from_utf16(unicode_char *uchar, const utf16_char *utf16char, size_t count); int uchar_from_utf16(unicode_char *uchar, const utf16_char *utf16char, size_t count);
int uchar_from_utf16f(unicode_char *uchar, const utf16_char *utf16char, size_t count); int uchar_from_utf16f(unicode_char *uchar, const utf16_char *utf16char, size_t count);
/* converting 32-bit Unicode chars to strings */ // converting 32-bit Unicode chars to strings
int utf8_from_uchar(char *utf8string, size_t count, unicode_char uchar); int utf8_from_uchar(char *utf8string, size_t count, unicode_char uchar);
int utf16_from_uchar(utf16_char *utf16string, size_t count, unicode_char uchar); int utf16_from_uchar(utf16_char *utf16string, size_t count, unicode_char uchar);
int utf16f_from_uchar(utf16_char *utf16string, size_t count, unicode_char uchar); int utf16f_from_uchar(utf16_char *utf16string, size_t count, unicode_char uchar);
/* misc UTF-8 helpers */ // misc UTF-8 helpers
const char *utf8_previous_char(const char *utf8string); const char *utf8_previous_char(const char *utf8string);
int utf8_is_valid_string(const char *utf8string); bool utf8_is_valid_string(const char *utf8string);