diff --git a/README.md b/README.md index af6d18b..a77ba33 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,7 @@ Among the features achieved are: expect '\0' terminated char buffers - Improved overall performance of common string operations - Functional equivalency with other more modern languages +- Optional API for manipulating UTF-8 encoded strings ## bstring fork @@ -27,8 +28,7 @@ features (or mis-features, depending on your point of view) are included: 2. Improved test suite using the [Check][] library 3. Continuous integration via GitHub Actions, including memory profiling with [Valgrind][] 4. Remove C++ wrapper code, returning this to a pure C library -5. No UTF8 string manipulation support -6. Documentation generation with [Doxygen][] +5. Documentation generation with [Doxygen][] Currently this fork should be binary-compatible with the original code. The only source incompatibility is the removal of the `const_bstring` type. diff --git a/bstring/buniutil.c b/bstring/buniutil.c new file mode 100644 index 0000000..fa6d37e --- /dev/null +++ b/bstring/buniutil.c @@ -0,0 +1,339 @@ +/* Copyright 2002-2015 Paul Hsieh + * This file is part of Bstrlib. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of bstrlib nor the names of its contributors may be + * used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Alternatively, the contents of this file may be used under the terms of + * GNU General Public License Version 2 (the "GPL"). + */ + +/* + * buniutil.c + * + * This file is not necessarily part of the core bstring library itself, but + * is an implementation of basic UTF-8 processing for bstrings. This module + * depends on bstrlib.c and utf8util.c. + */ + +#include "bstrlib.h" +#include "buniutil.h" + +#define UNICODE__CODE_POINT__REPLACEMENT_CHARACTER (0xFFFDL) + +/* int buIsUTF8Content (const bstring bu) + * + * Scan string and return 1 if its entire contents is entirely UTF-8 code + * points. Otherwise return 0. + */ +int +buIsUTF8Content(const bstring bu) +{ + struct utf8Iterator iter; + + if (NULL == bdata(bu)) return 0; + for (utf8IteratorInit(&iter, bu->data, bu->slen); + iter.next < iter.slen;) { + if (0 >= utf8IteratorGetNextCodePoint(&iter, -1)) return 0; + } + return 1; +} + +/* int buGetBlkUTF16 (cpUcs2 *ucs2, int len, cpUcs4 errCh, + * const bstring bu, int pos) + * + * Convert a string of UTF-8 code points (bu) skipping the first pos code + * points, into a sequence of UTF-16 encoded code points. 
Returns the + * number of UCS-2 16-bit words written to the output. No more than len + * words are written to the target array ucs2. If any code point in bu is + * unparsable, it will be translated to errCh. + */ +int +buGetBlkUTF16(/* @out */ cpUcs2 *ucs2, int len, cpUcs4 errCh, + const bstring bu, int pos) +{ + struct tagbstring t; + struct utf8Iterator iter; + cpUcs4 ucs4; + int i; + int j; + + if (!isLegalUnicodeCodePoint(errCh)) + errCh = UNICODE__CODE_POINT__REPLACEMENT_CHARACTER; + if (NULL == ucs2 || 0 >= len || NULL == bdata(bu) || 0 > pos) + return BSTR_ERR; + + for (j=0, i=0; j < bu->slen; j++) { + if (0x80 != (0xC0 & bu->data[j])) { + if (i >= pos) break; + i++; + } + } + + t.mlen = -1; + t.data = bu->data + j; + t.slen = bu->slen - j; + + utf8IteratorInit(&iter, t.data, t.slen); + + ucs4 = BSTR_ERR; + for (i=0; 0 < len && iter.next < iter.slen; i++) { + ucs4 = utf8IteratorGetNextCodePoint(&iter, errCh); + if (0 > ucs4) break; + + if (ucs4 < 0x10000) { + *ucs2++ = (cpUcs2) ucs4; + len--; + } else { + if (len < 2) { + *ucs2++ = UNICODE__CODE_POINT__REPLACEMENT_CHARACTER; + len--; + } else { + long y = ucs4 - 0x10000; + ucs2[0] = (cpUcs2) (0xD800 | (y >> 10)); + ucs2[1] = (cpUcs2) (0xDC00 | (y & 0x03FF)); + len -= 2; + ucs2 += 2; + i++; + } + } + } + while (0 < len) { + *ucs2++ = 0; + len--; + } + + utf8IteratorUninit(&iter); + if (0 > ucs4) return BSTR_ERR; + return i; +} + +/* + +Unicode UTF-8 +------- ----- +U-00000000 - U-0000007F: 0xxxxxxx +U-00000080 - U-000007FF: 110xxxxx 10xxxxxx +U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx +U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + +U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx +U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + +UTF-32: U-000000 - U-10FFFF + +*/ + +/* int buAppendBlkUcs4 (bstring b, const cpUcs4 *bu, int len, cpUcs4 errCh) + * + * Convert an array of UCS-4 code points (bu) to UTF-8 code points and + * append to b. 
Any invalid code point is replaced by errCh. If errCh is + * itself not a valid code point, then this translation will halt upon the + * first error and return BSTR_ERR. Otherwise BSTR_OK is returned. + */ +int +buAppendBlkUcs4(bstring b, const cpUcs4 *bu, int len, cpUcs4 errCh) +{ + int oldSlen; + + if (NULL == bu || NULL == b || 0 > len) return BSTR_ERR; + oldSlen = blengthe(b, -1); + if (0 > oldSlen) return BSTR_ERR; + if (!isLegalUnicodeCodePoint(errCh)) errCh = ~0; + + for (int i=0; i < len; i++) { + unsigned char c[6]; + cpUcs4 v = bu[i]; + + if (!isLegalUnicodeCodePoint(v)) { + if (~0 == errCh) { + b->slen = oldSlen; + return BSTR_ERR; + } + v = errCh; + } + + if (v < 0x80) { + if (BSTR_OK != bconchar(b, (char) v)) { + b->slen = oldSlen; + return BSTR_ERR; + } + } else if (v < 0x800) { + c[0] = (unsigned char) ( (v >> 6) + 0xc0); + c[1] = (unsigned char) (( v & 0x3f) + 0x80); + if (BSTR_OK != bcatblk(b, c, 2)) { + b->slen = oldSlen; + return BSTR_ERR; + } + } else if (v < 0x10000) { + c[0] = (unsigned char) ( (v >> 12) + 0xe0); + c[1] = (unsigned char) (((v >> 6) & 0x3f) + 0x80); + c[2] = (unsigned char) (( v & 0x3f) + 0x80); + if (BSTR_OK != bcatblk(b, c, 3)) { + b->slen = oldSlen; + return BSTR_ERR; + } + } else +#if 0 + if (v < 0x200000) +#endif + { + c[0] = (unsigned char) ( (v >> 18) + 0xf0); + c[1] = (unsigned char) (((v >> 12) & 0x3f) + 0x80); + c[2] = (unsigned char) (((v >> 6) & 0x3f) + 0x80); + c[3] = (unsigned char) (( v & 0x3f) + 0x80); + if (BSTR_OK != bcatblk(b, c, 4)) { + b->slen = oldSlen; + return BSTR_ERR; + } + } +#if 0 + else if (v < 0x4000000) { + c[0] = (unsigned char) ( (v >> 24) + 0xf8); + c[1] = (unsigned char) (((v >> 18) & 0x3f) + 0x80); + c[2] = (unsigned char) (((v >> 12) & 0x3f) + 0x80); + c[3] = (unsigned char) (((v >> 6) & 0x3f) + 0x80); + c[4] = (unsigned char) (( v & 0x3f) + 0x80); + if (BSTR_OK != bcatblk(b, c, 5)) { + b->slen = oldSlen; + return BSTR_ERR; + } + } else { + c[0] = (unsigned char) ( (v >> 30) + 0xfc); + c[1] 
= (unsigned char) (((v >> 24) & 0x3f) + 0x80); + c[2] = (unsigned char) (((v >> 18) & 0x3f) + 0x80); + c[3] = (unsigned char) (((v >> 12) & 0x3f) + 0x80); + c[4] = (unsigned char) (((v >> 6) & 0x3f) + 0x80); + c[5] = (unsigned char) (( v & 0x3f) + 0x80); + if (BSTR_OK != bcatblk(b, c, 6)) { + b->slen = oldSlen; + return BSTR_ERR; + } + } +#endif + } + return BSTR_OK; +} + +#define endSwap(cs, mode) \ + ((mode) ? ((((cs) & 0xFF) << 8) | (((cs) >> 8) & 0xFF)) : (cs)) +#define TEMP_UCS4_BUFFER_SIZE (64) + +/* int buAppendBlkUTF16 (bstring bu, const cpUcs2 *utf16, int len, + * cpUcs2 *bom, cpUcs4 errCh) + * + * Append an array of UCS-2 code units (utf16) as UTF-8 to bstring bu. + * Any invalid code point is replaced by errCh. If errCh is itself not a + * valid code point, then this translation will halt upon the first error + * and return BSTR_ERR. Otherwise BSTR_OK is returned. If a byte order + * mark has been previously read, it may be passed in as bom, otherwise if + * *bom is set to 0, it will be filled in with the BOM as read from the + * first character if it is a BOM. + */ +int +buAppendBlkUTF16(bstring bu, const cpUcs2 *utf16, int len, cpUcs2 *bom, + cpUcs4 errCh) +{ + cpUcs4 buff[TEMP_UCS4_BUFFER_SIZE]; + int cc; + int i; + int sm; + int oldSlen; + + if (NULL == bdata(bu) || NULL == utf16 || len < 0) return BSTR_ERR; + if (!isLegalUnicodeCodePoint(errCh)) errCh = ~0; + if (len == 0) return BSTR_OK; + + oldSlen = bu->slen; + i = 0; + + /* Check for BOM character and select endianness. Also remove the + BOM from the stream, since there is no need for it in UTF-8. */ + if (bom && (cpUcs2) 0xFFFE == *bom) { + sm = 8; + } else if (bom && (cpUcs2) 0xFEFF == *bom) { + sm = 0; + } else if (utf16[i] == (cpUcs2) 0xFFFE) { + if (bom) *bom = utf16[i]; + sm = 8; + i++; + } else if (utf16[i] == (cpUcs2) 0xFEFF) { + if (bom) *bom = utf16[i]; + sm = 0; + i++; + } else { + sm = 0; /* Assume local endianness. 
*/ + } + + cc = 0; + while (i < len) { + cpUcs4 v; + int invalid = 0; + + v = endSwap(utf16[i], sm); + i++; + + if ((v | 0x7FF) == 0xDFFF) { /* Deal with surrogate pairs */ + if (v >= 0xDC00) { + invalid = 1; /* Isolated low surrogate */ + } else if (i >= len) { + invalid = 1; /* Unterminated high surrogate */ + } else { + cpUcs4 c = endSwap(utf16[i], sm); + if (c < 0xDC00 || c > 0xDFFF) { + invalid = 1; + } else { + i++; + v = ((v - 0xD800) << 10) + (c - 0xDC00) + 0x10000; + } + } + } + + if (invalid) { + if (~0 == errCh) { + bu->slen = oldSlen; + return BSTR_ERR; + } + v = errCh; + } + + buff[cc] = v; + cc++; + if (cc >= TEMP_UCS4_BUFFER_SIZE) { + if (0 > buAppendBlkUcs4(bu, buff, cc, errCh)) { + bu->slen = oldSlen; + return BSTR_ERR; + } + cc = 0; + } + } + if (cc > 0 && 0 > buAppendBlkUcs4(bu, buff, cc, errCh)) { + bu->slen = oldSlen; + return BSTR_ERR; + } + + return BSTR_OK; +} diff --git a/bstring/buniutil.h b/bstring/buniutil.h new file mode 100644 index 0000000..7114991 --- /dev/null +++ b/bstring/buniutil.h @@ -0,0 +1,100 @@ +/* Copyright 2002-2015 Paul Hsieh + * This file is part of Bstrlib. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of bstrlib nor the names of its contributors may be + * used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Alternatively, the contents of this file may be used under the terms of + * GNU General Public License Version 2 (the "GPL"). + */ + +/** + * \file + * \brief Interface for basic Unicode utility functions for bstrings. + * + * Depends on bstrlib.h and utf8util.h. + */ + +#ifndef BSTRLIB_UNICODE_UTILITIES +#define BSTRLIB_UNICODE_UTILITIES + +#include "bstrlib.h" +#include "utf8util.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Scan a bstring and return 1 if its entire content consists of valid UTF-8 + * encoded code points, otherwise return 0. + */ +BSTR_PUBLIC int +buIsUTF8Content(const bstring bu); + +/** + * Convert an array of UCS-4 code points (bu, len elements) to UTF-8 and + * append the result to the bstring b. + * + * Any invalid code point is replaced by errCh. If errCh is itself not a + * valid code point, translation halts on the first error and BSTR_ERR is + * returned. Otherwise BSTR_OK is returned. + */ +BSTR_PUBLIC int +buAppendBlkUcs4(bstring b, const cpUcs4 *bu, int len, cpUcs4 errCh); + +/* For those unfortunate enough to be stuck supporting UTF-16. 
*/ + +/** + * Convert the UTF-8 bstring bu (starting at code-point offset pos) to a + * sequence of UTF-16 encoded code units written to ucs2 (at most len units). + * + * Returns the number of UCS-2 16-bit words written. Any unparsable code + * point is translated to errCh. + */ +BSTR_PUBLIC int +buGetBlkUTF16(/* @out */ cpUcs2 *ucs2, int len, cpUcs4 errCh, + const bstring bu, int pos); + +/** + * Append an array of UTF-16 code units (utf16, len elements) to the UTF-8 + * bstring bu. + * + * Any invalid code point is replaced by errCh. If errCh is itself not a + * valid code point, translation halts on the first error and BSTR_ERR is + * returned. Otherwise BSTR_OK is returned. If a byte order mark has been + * previously read it may be passed in via bom; if *bom is 0 it will be + * filled in from the first character if it is a BOM. + */ +BSTR_PUBLIC int +buAppendBlkUTF16(bstring bu, const cpUcs2 *utf16, int len, cpUcs2 *bom, + cpUcs4 errCh); + +#ifdef __cplusplus +} +#endif + +#endif /* BSTRLIB_UNICODE_UTILITIES */ diff --git a/bstring/meson.build b/bstring/meson.build index 01a8043..bb46e8c 100644 --- a/bstring/meson.build +++ b/bstring/meson.build @@ -1,8 +1,16 @@ -install_headers(['bstraux.h', 'bstrlib.h']) +bstring_sources = ['bstraux.c', 'bstrlib.c'] +bstring_headers = ['bstraux.h', 'bstrlib.h'] + +if get_option('enable-utf8') + bstring_sources += ['buniutil.c', 'utf8util.c'] + bstring_headers += ['buniutil.h', 'utf8util.h'] +endif + +install_headers(bstring_headers) libbstring = library( meson.project_name(), - ['bstraux.c', 'bstrlib.c'], + bstring_sources, version: meson.project_version(), soversion: '1', include_directories: bstring_inc, diff --git a/bstring/utf8util.c b/bstring/utf8util.c new file mode 100644 index 0000000..8362d25 --- /dev/null +++ b/bstring/utf8util.c @@ -0,0 +1,357 @@ +/* Copyright 2002-2015 Paul Hsieh + * This file is part of Bstrlib. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of bstrlib nor the names of its contributors may be + * used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Alternatively, the contents of this file may be used under the terms of + * GNU General Public License Version 2 (the "GPL"). + */ + +/* + * utf8util.c + * + * This file is not necessarily part of the core bstring library itself, but + * is a generic module for implementing UTF-8 utility functions. 
+ */ + +#include "utf8util.h" + +#ifndef NULL +#ifdef __cplusplus +#define NULL 0 +#else +#define NULL ((void *)0) +#endif +#endif + +/* Surrogate range is wrong, there is a maximum, the BOM alias is illegal + and 0xFFFF is illegal */ +#define isLegalUnicodeCodePoint(v) \ + ((((v) < 0xD800L) || ((v) > 0xDFFFL)) && \ + (((unsigned long)(v)) <= 0x0010FFFFL) && \ + (((v)|0x1F0001) != 0x1FFFFFL)) + +void +utf8IteratorInit(struct utf8Iterator *iter, unsigned char *data, int slen) +{ + if (iter) { + iter->data = data; + iter->slen = (iter->data && slen >= 0) ? slen : -1; + iter->start = -1; + iter->next = (iter->slen >= 0) ? 0 : -1; + iter->error = (iter->slen >= 0) ? 0 : 1; + } +} + +void +utf8IteratorUninit(struct utf8Iterator *iter) +{ + if (iter) { + iter->data = NULL; + iter->slen = -1; + iter->start = iter->next = -1; + } +} + +int +utf8ScanBackwardsForCodePoint(const unsigned char *msg, int len, int pos, + cpUcs4 *out) +{ + cpUcs4 v1; + cpUcs4 v2; + cpUcs4 v3; + cpUcs4 v4; + cpUcs4 x; + int ret; + if (NULL == msg || len < 0 || (unsigned) pos >= (unsigned) len) { + return -__LINE__; + } + if (!out) out = &x; + ret = 0; + if (msg[pos] < 0x80) { + *out = msg[pos]; + return 0; + } else if (msg[pos] < 0xC0) { + if (0 == pos) return -__LINE__; + if (msg[pos-1] >= 0xC1 && msg[pos-1] < 0xF8) { + pos--; + ret = 1; + } else { + if (1 == pos) return -__LINE__; + if ((msg[pos-1] | 0x3F) != 0xBF) return -__LINE__; + if (msg[pos-2] >= 0xE0 && msg[pos-2] < 0xF8) { + pos -= 2; + ret = 2; + } else { + if (2 == pos) return -__LINE__; + if ((msg[pos-2] | 0x3F) != 0xBF) return -__LINE__; + if ((msg[pos-3]|0x07) == 0xF7) { + pos -= 3; + ret = 3; + } else return -__LINE__; + } + } + } + if (msg[pos] < 0xE0) { + if (pos + 1 >= len) return -__LINE__; + if ((msg[pos+1] & 0xC0) != 0x80) return -__LINE__; + v1 = msg[pos] & ~0xE0; + v2 = msg[pos+1] & ~0xC0; + v1 = (v1 << 6) + v2; + if (v1 < 0x80) return -__LINE__; + *out = v1; + return ret; + } + if (msg[pos] < 0xF0) { + if (pos + 2 >= len) 
return -__LINE__; + if ((msg[pos+1] & 0xC0) != 0x80) return -__LINE__; + if ((msg[pos+2] & 0xC0) != 0x80) return -__LINE__; + v1 = msg[pos] & ~0xF0; + v2 = msg[pos+1] & ~0xC0; + v3 = msg[pos+2] & ~0xC0; + v1 = (v1 << 12) + (v2 << 6) + v3; + if (v1 < 0x800) return -__LINE__; + if (!isLegalUnicodeCodePoint(v1)) return -__LINE__; + *out = v1; + return ret; + } + + if (msg[pos] >= 0xF8) return -__LINE__; + + if (pos + 3 >= len) return -__LINE__; + if ((msg[pos+1] & 0xC0) != 0x80) return -__LINE__; + if ((msg[pos+2] & 0xC0) != 0x80) return -__LINE__; + if ((msg[pos+3] & 0xC0) != 0x80) return -__LINE__; + v1 = msg[pos] & ~0xF8; + v2 = msg[pos+1] & ~0xC0; + v3 = msg[pos+2] & ~0xC0; + v4 = msg[pos+3] & ~0xC0; + v1 = (v1 << 18) + (v2 << 12) + (v3 << 6) + v4; + if (v1 < 0x10000) return -__LINE__; + if (!isLegalUnicodeCodePoint(v1)) return -__LINE__; + *out = v1; + return ret; +} + +/* +Code point UTF-8 +---------- ----- +U-00000000 - U-0000007F: 0xxxxxxx +U-00000080 - U-000007FF: 110xxxxx 10xxxxxx +U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx +U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + +U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx +U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx +*/ + +/* + * Returns next read code point for iterator. + * + * iter->data + iter->start points at the characters just read. + * + * iter->data + iter->next points at the characters that will be read next. + * + * iter->error is boolean indicating whether or not last read contained + * an error. 
+ */ +cpUcs4 +utf8IteratorGetNextCodePoint(struct utf8Iterator *iter, cpUcs4 errCh) +{ + const unsigned char *chrs; + unsigned char c; + unsigned char d; + unsigned char e; + long v; + int i; + int ofs; + int invalid; + + if (NULL == iter || iter->next < 0) return errCh; + if (iter->next >= iter->slen) { + iter->start = iter->slen; + return errCh; + } + if (NULL == iter->data || iter->next < 0 || + utf8IteratorNoMore(iter)) return errCh; + chrs = iter->data + iter->next; + + iter->error = 0; + c = chrs[0]; + ofs = 0; + invalid = 0; + + if (c < 0xC0 || c > 0xFD) { + if (c >= 0x80) { + invalid = 1; + } else { + v = c; + ofs = 1; + } + } else if (c < 0xE0) { + if (iter->next + 1 >= iter->slen) { + invalid = 1; + } else { + v = (c << 6u) - (0x0C0 << 6u); + c = (unsigned char) ((unsigned) chrs[1] - 0x080); + v += c; + if (c >= 0x40 || v < 0x80) { + invalid = 1; + } else { + ofs = 2; + } + } + } else if (c < 0xF0) { + if (iter->next + 2 >= iter->slen) { + invalid = 1; + } else { + v = (c << 12) - (0x0E0 << 12u); + c = (unsigned char) ((unsigned) chrs[1] - 0x080); + d = (unsigned char) ((unsigned) chrs[2] - 0x080); + v += (c << 6u) + d; + if ((c|d) >= 0x40 || v < 0x800 || + !isLegalUnicodeCodePoint(v)) { + invalid = 1; + } else { + ofs = 3; + } + } + } else if (c < 0xF8) { + if (iter->next + 3 >= iter->slen) { + invalid = 1; + } else { + v = (c << 18) - (0x0F0 << 18u); + c = (unsigned char) ((unsigned) chrs[1] - 0x080); + d = (unsigned char) ((unsigned) chrs[2] - 0x080); + e = (unsigned char) ((unsigned) chrs[3] - 0x080); + v += (c << 12u) + (d << 6u) + e; + if ((c|d|e) >= 0x40 || v < 0x10000 || + !isLegalUnicodeCodePoint(v)) { + invalid = 1; + } else { + ofs = 4; + } + } + } else { /* 5 and 6 byte encodings are invalid */ + invalid = 1; + } + + if (invalid) { + iter->error = 1; + v = errCh; + for (i = iter->next+1; i < iter->slen; i++) { + if ((iter->data[i] & 0xC0) != 0x80) break; + } + ofs = i - iter->next; + } + + iter->start = iter->next; + iter->next += ofs; + 
return v; +} + +/* + * Returns current code point for iterator without advancing. + * + * iter->data + iter->start points at the characters to be read. + * + * iter->data + iter->next points at the characters that will be read next. + * + * iter->error is boolean indicating whether or not last read contained + * an error. + */ +cpUcs4 +utf8IteratorGetCurrCodePoint(struct utf8Iterator *iter, cpUcs4 errCh) +{ + const unsigned char *chrs; + unsigned char c; + unsigned char d; + unsigned char e; + long v; + int invalid; + + if (NULL == iter || iter->next < 0) return errCh; + if (iter->next >= iter->slen) { + iter->start = iter->slen; + return errCh; + } + if (NULL == iter->data || iter->next < 0 || + utf8IteratorNoMore(iter)) return errCh; + chrs = iter->data + iter->next; + + iter->error = 0; + c = chrs[0]; + invalid = 0; + + if (c < 0xC0 || c > 0xFD) { + if (c >= 0x80) { + invalid = 1; + } else { + v = c; + } + } else if (c < 0xE0) { + if (iter->next + 1 >= iter->slen) { + invalid = 1; + } else { + v = (c << 6u) - (0x0C0 << 6u); + c = (unsigned char) ((unsigned) chrs[1] - 0x080); + v += c; + if (c >= 0x40 || v < 0x80) invalid = 1; + } + } else if (c < 0xF0) { + if (iter->next + 2 >= iter->slen) { + invalid = 1; + } else { + v = (c << 12UL) - (0x0E0 << 12u); + c = (unsigned char) ((unsigned) chrs[1] - 0x080); + d = (unsigned char) ((unsigned) chrs[2] - 0x080); + v += (c << 6u) + d; + if ((c|d) >= 0x40 || v < 0x800 || + !isLegalUnicodeCodePoint(v)) invalid = 1; + } + } else if (c < 0xF8) { + if (iter->next + 3 >= iter->slen) { + invalid = 1; + } else { + v = (c << 18UL) - (0x0F0 << 18u); + c = (unsigned char) ((unsigned) chrs[1] - 0x080); + d = (unsigned char) ((unsigned) chrs[2] - 0x080); + e = (unsigned char) ((unsigned) chrs[3] - 0x080); + v += (c << 12UL) + (d << 6u) + e; + if ((c|d|e) >= 0x40 || v < 0x10000 || + !isLegalUnicodeCodePoint(v)) invalid = 1; + } + } else { /* 5 and 6 byte encodings are invalid */ + invalid = 1; + } + + if (invalid) { + iter->error = 1; 
+ v = errCh; + } + return v; +} diff --git a/bstring/utf8util.h b/bstring/utf8util.h new file mode 100644 index 0000000..84aeda0 --- /dev/null +++ b/bstring/utf8util.h @@ -0,0 +1,107 @@ +/* Copyright 2002-2015 Paul Hsieh + * This file is part of Bstrlib. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of bstrlib nor the names of its contributors may be + * used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Alternatively, the contents of this file may be used under the terms of + * GNU General Public License Version 2 (the "GPL"). + */ + +/** + * \file + * \brief Interface for low-level UTF-8 utility functions. 
+ * + * This module is standalone and does not depend on bstrlib. + */ + +#ifndef UTF8_UNICODE_UTILITIES +#define UTF8_UNICODE_UTILITIES + +#include <limits.h> + +/* If bstrlib.h has not been included, define the visibility attribute here. + The #ifndef guard ensures we don't conflict if bstrlib.h came first. */ +#ifndef BSTR_PUBLIC +# if __GNUC__ >= 4 +# define BSTR_PUBLIC __attribute__ ((visibility ("default"))) +# else +# define BSTR_PUBLIC +# endif +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#if INT_MAX >= 0x7fffffffUL +typedef int cpUcs4; +#elif LONG_MAX >= 0x7fffffffUL +typedef long cpUcs4; +#else +#error This compiler is not supported +#endif + +#if UINT_MAX == 0xFFFF +typedef unsigned int cpUcs2; +#elif USHRT_MAX == 0xFFFF +typedef unsigned short cpUcs2; +#elif UCHAR_MAX == 0xFFFF +typedef unsigned char cpUcs2; +#else +#error This compiler is not supported +#endif + +#define isLegalUnicodeCodePoint(v) \ + ((((v) < 0xD800L) || ((v) > 0xDFFFL)) && \ + (((unsigned long)(v)) <= 0x0010FFFFL) && \ + (((v)|0x1F0001) != 0x1FFFFFL)) + +struct utf8Iterator { + unsigned char *data; + int slen; + int start; + int next; + int error; +}; + +#define utf8IteratorNoMore(it) (!(it) || (it)->next >= (it)->slen) + +BSTR_PUBLIC void utf8IteratorInit(struct utf8Iterator *iter, + unsigned char *data, int slen); +BSTR_PUBLIC void utf8IteratorUninit(struct utf8Iterator *iter); +BSTR_PUBLIC cpUcs4 utf8IteratorGetNextCodePoint(struct utf8Iterator *iter, + cpUcs4 errCh); +BSTR_PUBLIC cpUcs4 utf8IteratorGetCurrCodePoint(struct utf8Iterator *iter, + cpUcs4 errCh); +BSTR_PUBLIC int utf8ScanBackwardsForCodePoint(const unsigned char *msg, + int len, int pos, cpUcs4 *out); + +#ifdef __cplusplus +} +#endif + +#endif /* UTF8_UNICODE_UTILITIES */ diff --git a/doc/introduction.md b/doc/introduction.md index c140d48..11f8ec7 100644 --- a/doc/introduction.md +++ b/doc/introduction.md @@ -327,7 +327,7 @@ object in a multithreaded environment.
Problems Not Solved ------------------- -Bstrlib is written for the C languages, which have inherent weaknesses that +Bstrlib is written for the C language, which has inherent weaknesses that cannot be easily solved: 1. Memory leaks: Forgetting to call `bdestroy` on a bstring that is about to @@ -349,6 +349,29 @@ Other problems not addressed: > Note: except for spotty support of wide characters, the default C standard library does not address any of these problems either. +Unicode functions +----------------- + +The two modules utf8util.c and buniutil.c implement basic functions for +parsing and collecting Unicode data in the UTF-8 format. Unicode text is +described by a sequence of "code points", which are values between 0 and +1114111 (0x10FFFF) inclusive, mapped to symbol content corresponding to nearly all +the standardized scripts of the world. + +The semantics of Unicode code points is varied and complicated. The +base support of the better string library does not attempt to perform +any interpretation of these code points. The better string library +solely provides support for iterating through Unicode code points, +appending and extracting code points to and from bstrings, and parsing +UTF-8 and UTF-16 from raw data. + +The types cpUcs4 and cpUcs2 are defined as 4-byte and 2-byte +encoding formats corresponding to UCS-4 and UCS-2 respectively. To test +if a raw code point is valid, the macro isLegalUnicodeCodePoint() has +been defined. The UTF-8 iterator is defined by struct utf8Iterator. To +test whether the iterator has no more code points to walk through, the macro +utf8IteratorNoMore() has been defined. + The `bstest` Module ------------------- @@ -871,3 +894,4 @@ and testing of the Better String Library: * Richard A. Smith * Simon Ekstrom * Wayne Scott +* Zed A.
Shaw diff --git a/meson_options.txt b/meson_options.txt index 6b0b4fd..5bf78df 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -16,3 +16,9 @@ option( value: false, description: 'Build unit tests', ) +option( + 'enable-utf8', + type: 'boolean', + value: true, + description: 'Build bstring library with UTF-8 support', +) diff --git a/tests/meson.build b/tests/meson.build index 66b94bf..28ac1c0 100644 --- a/tests/meson.build +++ b/tests/meson.build @@ -15,3 +15,15 @@ test_executable_aux = executable( test('bstring unit tests', test_executable) test('bstring auxiliary unit tests', test_executable_aux) + +if get_option('enable-utf8') + test_executable_utf8 = executable( + 'testutf8', + 'testutf8.c', + link_with: libbstring, + include_directories: bstring_inc, + dependencies: check, + ) + + test('bstring UTF-8 unit tests', test_executable_utf8) +endif diff --git a/tests/testutf8.c b/tests/testutf8.c new file mode 100644 index 0000000..792f737 --- /dev/null +++ b/tests/testutf8.c @@ -0,0 +1,904 @@ +/* Copyright (C) 2026 Daniel Markstedt + * UTF-8 unit tests for the Better String Library + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of bstrlib nor the names of its contributors may be + * used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Alternatively, the contents of this file may be used under the terms of + * GNU General Public License Version 2 (the "GPL"). + */ + +/* + * This file is the C unit test for the UTF-8 modules (utf8util, buniutil). 
+ * + * Test data quick reference: + * U+0041 'A' = 0x41 (1-byte ASCII) + * U+00A9 '©' = 0xC2 0xA9 (2-byte) + * U+20AC '€' = 0xE2 0x82 0xAC (3-byte) + * U+1F600 '😀' = 0xF0 0x9F 0x98 0x80 (4-byte) + * UTF-16 U+1F600 = { 0xD83D, 0xDE00 } (surrogate pair) + */ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include "buniutil.h" +#include "bstrlib.h" +#include "utf8util.h" +#include <check.h> +#include <stdlib.h> +#include <string.h> + +/* ----------------------------------------------------------------------- + * core_000: utf8IteratorInit — valid inputs and error inputs + * ----------------------------------------------------------------------- */ +START_TEST(core_000) +{ + struct utf8Iterator iter; + unsigned char data[] = "Hello"; + + /* NULL iter pointer must not crash */ + utf8IteratorInit(NULL, data, 5); + + /* Valid initialisation */ + utf8IteratorInit(&iter, data, 5); + ck_assert_int_eq(iter.slen, 5); + ck_assert_int_eq(iter.next, 0); + ck_assert_int_eq(iter.start, -1); + ck_assert_int_eq(iter.error, 0); + ck_assert(iter.data == data); + + /* NULL data → sentinel values */ + utf8IteratorInit(&iter, NULL, 5); + ck_assert_int_eq(iter.slen, -1); + ck_assert_int_eq(iter.next, -1); + ck_assert_int_eq(iter.error, 1); + + /* Negative slen → sentinel values */ + utf8IteratorInit(&iter, data, -1); + ck_assert_int_eq(iter.slen, -1); + ck_assert_int_eq(iter.next, -1); + ck_assert_int_eq(iter.error, 1); + + /* Zero-length string is valid */ + utf8IteratorInit(&iter, data, 0); + ck_assert_int_eq(iter.slen, 0); + ck_assert_int_eq(iter.next, 0); + ck_assert_int_eq(iter.error, 0); +} +END_TEST + +/* ----------------------------------------------------------------------- + * core_001: utf8IteratorUninit — clears all fields; handles NULL gracefully + * ----------------------------------------------------------------------- */ +START_TEST(core_001) +{ + struct utf8Iterator iter; + unsigned char data[] = "Hello"; + + utf8IteratorInit(&iter, data, 5); + utf8IteratorUninit(&iter); + ck_assert(iter.data == NULL); + 
ck_assert_int_eq(iter.slen, -1); + ck_assert_int_eq(iter.start, -1); + ck_assert_int_eq(iter.next, -1); + + /* NULL pointer must not crash */ + utf8IteratorUninit(NULL); +} +END_TEST + +/* ----------------------------------------------------------------------- + * core_002: utf8IteratorGetNextCodePoint — ASCII string iteration + * ----------------------------------------------------------------------- */ +START_TEST(core_002) +{ + struct utf8Iterator iter; + unsigned char data[] = "ABC"; + cpUcs4 cp; + + utf8IteratorInit(&iter, data, 3); + + cp = utf8IteratorGetNextCodePoint(&iter, '?'); + ck_assert_int_eq(cp, 'A'); + ck_assert_int_eq(iter.error, 0); + ck_assert_int_eq(iter.start, 0); + ck_assert_int_eq(iter.next, 1); + + cp = utf8IteratorGetNextCodePoint(&iter, '?'); + ck_assert_int_eq(cp, 'B'); + ck_assert_int_eq(iter.start, 1); + ck_assert_int_eq(iter.next, 2); + + cp = utf8IteratorGetNextCodePoint(&iter, '?'); + ck_assert_int_eq(cp, 'C'); + ck_assert_int_eq(iter.start, 2); + ck_assert_int_eq(iter.next, 3); + + /* Past end → errCh */ + cp = utf8IteratorGetNextCodePoint(&iter, '?'); + ck_assert_int_eq(cp, '?'); + + /* NULL iterator → errCh */ + cp = utf8IteratorGetNextCodePoint(NULL, '?'); + ck_assert_int_eq(cp, '?'); +} +END_TEST + +/* ----------------------------------------------------------------------- + * core_003: utf8IteratorGetNextCodePoint — multi-byte sequences + * + * Sequence: © (U+00A9, 2-byte) € (U+20AC, 3-byte) 😀 (U+1F600, 4-byte) + * Bytes: C2 A9 E2 82 AC F0 9F 98 80 + * ----------------------------------------------------------------------- */ +START_TEST(core_003) +{ + struct utf8Iterator iter; + /* © € 😀 */ + unsigned char data[] = { + 0xC2, 0xA9, /* U+00A9 © */ + 0xE2, 0x82, 0xAC, /* U+20AC € */ + 0xF0, 0x9F, 0x98, 0x80 /* U+1F600 😀 */ + }; + cpUcs4 cp; + + utf8IteratorInit(&iter, data, sizeof(data)); + + cp = utf8IteratorGetNextCodePoint(&iter, '?'); + ck_assert_int_eq(cp, 0x00A9); + ck_assert_int_eq(iter.error, 0); + 
ck_assert_int_eq(iter.start, 0); + ck_assert_int_eq(iter.next, 2); + + cp = utf8IteratorGetNextCodePoint(&iter, '?'); + ck_assert_int_eq(cp, 0x20AC); + ck_assert_int_eq(iter.error, 0); + ck_assert_int_eq(iter.start, 2); + ck_assert_int_eq(iter.next, 5); + + cp = utf8IteratorGetNextCodePoint(&iter, '?'); + ck_assert_int_eq(cp, 0x1F600); + ck_assert_int_eq(iter.error, 0); + ck_assert_int_eq(iter.start, 5); + ck_assert_int_eq(iter.next, 9); + + /* Exhausted */ + cp = utf8IteratorGetNextCodePoint(&iter, '?'); + ck_assert_int_eq(cp, '?'); +} +END_TEST + +/* ----------------------------------------------------------------------- + * core_004: utf8IteratorGetNextCodePoint — invalid byte sequences + * + * 0x80 alone is a stray continuation byte (invalid lead). + * 0xFF is never valid in UTF-8. + * ----------------------------------------------------------------------- */ +START_TEST(core_004) +{ + struct utf8Iterator iter; + /* stray continuation, then a valid ASCII char */ + unsigned char data_cont[] = { 0x80, 0x41 }; + /* 0xFF is always invalid */ + unsigned char data_ff[] = { 0xFF, 0x41 }; + cpUcs4 cp; + + /* Stray continuation byte → error, iterator skips to next valid lead */ + utf8IteratorInit(&iter, data_cont, sizeof(data_cont)); + cp = utf8IteratorGetNextCodePoint(&iter, '?'); + ck_assert_int_eq(cp, '?'); + ck_assert_int_eq(iter.error, 1); + /* After error the iterator should have advanced past the bad byte(s) */ + ck_assert(iter.next > 0); + + /* 0xFF lead byte → error */ + utf8IteratorInit(&iter, data_ff, sizeof(data_ff)); + cp = utf8IteratorGetNextCodePoint(&iter, '?'); + ck_assert_int_eq(cp, '?'); + ck_assert_int_eq(iter.error, 1); +} +END_TEST + +/* ----------------------------------------------------------------------- + * core_005: utf8IteratorGetNextCodePoint — truncated sequence bounds checks + * + * The backing arrays contain full valid code points, but slen is set so the + * sequence is truncated at the end. 
Iterator must treat each as invalid and + * return errCh instead of decoding bytes past slen. + * ----------------------------------------------------------------------- */ +START_TEST(core_005) +{ + struct utf8Iterator iter; + cpUcs4 cp; + + { + unsigned char data[] = { 0xC2, 0xA9 }; + utf8IteratorInit(&iter, data, 1); + cp = utf8IteratorGetNextCodePoint(&iter, '?'); + ck_assert_int_eq(cp, '?'); + ck_assert_int_eq(iter.error, 1); + ck_assert_int_eq(iter.start, 0); + ck_assert_int_eq(iter.next, 1); + } + + { + unsigned char data[] = { 0xE2, 0x82, 0xAC }; + utf8IteratorInit(&iter, data, 2); + cp = utf8IteratorGetNextCodePoint(&iter, '?'); + ck_assert_int_eq(cp, '?'); + ck_assert_int_eq(iter.error, 1); + ck_assert_int_eq(iter.start, 0); + ck_assert_int_eq(iter.next, 2); + } + + { + unsigned char data[] = { 0xF0, 0x9F, 0x98, 0x80 }; + utf8IteratorInit(&iter, data, 3); + cp = utf8IteratorGetNextCodePoint(&iter, '?'); + ck_assert_int_eq(cp, '?'); + ck_assert_int_eq(iter.error, 1); + ck_assert_int_eq(iter.start, 0); + ck_assert_int_eq(iter.next, 3); + } +} +END_TEST + +/* ----------------------------------------------------------------------- + * core_006: utf8IteratorGetCurrCodePoint — truncated sequence bounds checks + * + * Peek must never decode bytes beyond slen. For each truncated sequence, it + * should return errCh, set iter.error, and leave iter.next unchanged. 
+ * ----------------------------------------------------------------------- */ +START_TEST(core_006) +{ + struct utf8Iterator iter; + cpUcs4 cp; + + { + unsigned char data[] = { 0xC2, 0xA9 }; + utf8IteratorInit(&iter, data, 1); + cp = utf8IteratorGetCurrCodePoint(&iter, '?'); + ck_assert_int_eq(cp, '?'); + ck_assert_int_eq(iter.error, 1); + ck_assert_int_eq(iter.next, 0); + } + + { + unsigned char data[] = { 0xE2, 0x82, 0xAC }; + utf8IteratorInit(&iter, data, 2); + cp = utf8IteratorGetCurrCodePoint(&iter, '?'); + ck_assert_int_eq(cp, '?'); + ck_assert_int_eq(iter.error, 1); + ck_assert_int_eq(iter.next, 0); + } + + { + unsigned char data[] = { 0xF0, 0x9F, 0x98, 0x80 }; + utf8IteratorInit(&iter, data, 3); + cp = utf8IteratorGetCurrCodePoint(&iter, '?'); + ck_assert_int_eq(cp, '?'); + ck_assert_int_eq(iter.error, 1); + ck_assert_int_eq(iter.next, 0); + } +} +END_TEST + +/* ----------------------------------------------------------------------- + * core_007: utf8IteratorGetCurrCodePoint — peek without advancing + * ----------------------------------------------------------------------- */ +START_TEST(core_007) +{ + struct utf8Iterator iter; + unsigned char data[] = { 0xC2, 0xA9, 0x41 }; /* © A */ + cpUcs4 cp; + + utf8IteratorInit(&iter, data, sizeof(data)); + + /* Peek twice at the same position — must not advance */ + cp = utf8IteratorGetCurrCodePoint(&iter, '?'); + ck_assert_int_eq(cp, 0x00A9); + ck_assert_int_eq(iter.next, 0); /* still at start */ + + cp = utf8IteratorGetCurrCodePoint(&iter, '?'); + ck_assert_int_eq(cp, 0x00A9); + ck_assert_int_eq(iter.next, 0); + + /* Now advance with GetNext, then peek the next char */ + utf8IteratorGetNextCodePoint(&iter, '?'); + ck_assert_int_eq(iter.next, 2); + + cp = utf8IteratorGetCurrCodePoint(&iter, '?'); + ck_assert_int_eq(cp, 0x41); /* 'A' */ + ck_assert_int_eq(iter.next, 2); /* still not advanced */ + + /* NULL iterator → errCh */ + cp = utf8IteratorGetCurrCodePoint(NULL, '?'); + ck_assert_int_eq(cp, '?'); +} +END_TEST 
+ +/* ----------------------------------------------------------------------- + * core_008: utf8ScanBackwardsForCodePoint — various positions + * + * Data: © (0xC2 0xA9) at bytes 0-1, then 'A' (0x41) at byte 2 + * ----------------------------------------------------------------------- */ +START_TEST(core_008) +{ + unsigned char data[] = { 0xC2, 0xA9, 0x41 }; /* © A */ + cpUcs4 out; + int ret; + + /* pos=0 is the lead byte of © — ret=0, out=0xA9 */ + ret = utf8ScanBackwardsForCodePoint(data, 3, 0, &out); + ck_assert_int_eq(ret, 0); + ck_assert_int_eq(out, 0x00A9); + + /* pos=1 is the continuation byte — ret=1 (1 byte back to lead), out=0xA9 */ + ret = utf8ScanBackwardsForCodePoint(data, 3, 1, &out); + ck_assert_int_eq(ret, 1); + ck_assert_int_eq(out, 0x00A9); + + /* pos=2 is ASCII 'A' — ret=0, out='A' */ + ret = utf8ScanBackwardsForCodePoint(data, 3, 2, &out); + ck_assert_int_eq(ret, 0); + ck_assert_int_eq(out, 0x41); + + /* NULL msg → error (negative) */ + ret = utf8ScanBackwardsForCodePoint(NULL, 3, 0, &out); + ck_assert(ret < 0); + + /* pos out of range → error */ + ret = utf8ScanBackwardsForCodePoint(data, 3, 3, &out); + ck_assert(ret < 0); + + /* out=NULL is accepted; return value indicates success/failure */ + ret = utf8ScanBackwardsForCodePoint(data, 3, 2, NULL); + ck_assert_int_eq(ret, 0); +} +END_TEST + +/* ----------------------------------------------------------------------- + * core_009: buIsUTF8Content + * ----------------------------------------------------------------------- */ +START_TEST(core_009) +{ + bstring b; + int ret; + + /* NULL bstring → 0 */ + ret = buIsUTF8Content(NULL); + ck_assert_int_eq(ret, 0); + + /* Empty string → 1 (vacuously true) */ + b = bfromcstr(""); + ck_assert(b != NULL); + ret = buIsUTF8Content(b); + ck_assert_int_eq(ret, 1); + bdestroy(b); + + /* Pure ASCII → 1 */ + b = bfromcstr("Hello, world!"); + ck_assert(b != NULL); + ret = buIsUTF8Content(b); + ck_assert_int_eq(ret, 1); + bdestroy(b); + + /* Valid multi-byte UTF-8: © 
€ 😀 */ + { + unsigned char utf8[] = { + 0xC2, 0xA9, + 0xE2, 0x82, 0xAC, + 0xF0, 0x9F, 0x98, 0x80 + }; + b = blk2bstr(utf8, sizeof(utf8)); + ck_assert(b != NULL); + ret = buIsUTF8Content(b); + ck_assert_int_eq(ret, 1); + bdestroy(b); + } + + /* Invalid: stray 0x80 continuation byte → 0 */ + { + unsigned char bad[] = { 0x41, 0x80, 0x41 }; + b = blk2bstr(bad, sizeof(bad)); + ck_assert(b != NULL); + ret = buIsUTF8Content(b); + ck_assert_int_eq(ret, 0); + bdestroy(b); + } + + /* Invalid: truncated 2-byte sequence → 0 */ + { + unsigned char bad[] = { 0xC2 }; /* lead without continuation */ + b = blk2bstr(bad, sizeof(bad)); + ck_assert(b != NULL); + ret = buIsUTF8Content(b); + ck_assert_int_eq(ret, 0); + bdestroy(b); + } +} +END_TEST + +/* ----------------------------------------------------------------------- + * core_010: buAppendBlkUcs4 — UCS-4 array → UTF-8 bstring + * ----------------------------------------------------------------------- */ +START_TEST(core_010) +{ + bstring b; + int ret; + + /* NULL arguments → BSTR_ERR */ + b = bfromcstr(""); + ck_assert(b != NULL); + ret = buAppendBlkUcs4(NULL, NULL, 0, '?'); + ck_assert_int_eq(ret, BSTR_ERR); + ret = buAppendBlkUcs4(b, NULL, 1, '?'); + ck_assert_int_eq(ret, BSTR_ERR); + bdestroy(b); + + /* ASCII code points */ + { + cpUcs4 pts[] = { 'H', 'i' }; + b = bfromcstr(""); + ck_assert(b != NULL); + ret = buAppendBlkUcs4(b, pts, 2, '?'); + ck_assert_int_eq(ret, BSTR_OK); + ck_assert_int_eq(b->slen, 2); + ck_assert_int_eq(b->data[0], 'H'); + ck_assert_int_eq(b->data[1], 'i'); + bdestroy(b); + } + + /* Mixed: © (U+00A9) and € (U+20AC) */ + { + cpUcs4 pts[] = { 0x00A9, 0x20AC }; + unsigned char expected[] = { + 0xC2, 0xA9, /* © */ + 0xE2, 0x82, 0xAC /* € */ + }; + b = bfromcstr(""); + ck_assert(b != NULL); + ret = buAppendBlkUcs4(b, pts, 2, '?'); + ck_assert_int_eq(ret, BSTR_OK); + ck_assert_int_eq(b->slen, 5); + ret = memcmp(b->data, expected, 5); + ck_assert_int_eq(ret, 0); + bdestroy(b); + } + + /* 4-byte: 😀 (U+1F600) */ 
+ { + cpUcs4 pts[] = { 0x1F600 }; + unsigned char expected[] = { 0xF0, 0x9F, 0x98, 0x80 }; + b = bfromcstr(""); + ck_assert(b != NULL); + ret = buAppendBlkUcs4(b, pts, 1, '?'); + ck_assert_int_eq(ret, BSTR_OK); + ck_assert_int_eq(b->slen, 4); + ret = memcmp(b->data, expected, 4); + ck_assert_int_eq(ret, 0); + bdestroy(b); + } + + /* Invalid code point with valid errCh → substituted */ + { + cpUcs4 pts[] = { 0xD800 }; /* surrogates are illegal */ + b = bfromcstr(""); + ck_assert(b != NULL); + ret = buAppendBlkUcs4(b, pts, 1, '?'); + ck_assert_int_eq(ret, BSTR_OK); + ck_assert_int_eq(b->slen, 1); + ck_assert_int_eq(b->data[0], '?'); + bdestroy(b); + } + + /* Invalid code point with invalid errCh → BSTR_ERR, bstring unchanged */ + { + cpUcs4 pts[] = { 0xD800 }; + b = bfromcstr("pre"); + ck_assert(b != NULL); + ret = buAppendBlkUcs4(b, pts, 1, 0xD800); /* errCh also invalid */ + ck_assert_int_eq(ret, BSTR_ERR); + /* slen must be rolled back */ + ck_assert_int_eq(b->slen, 3); + bdestroy(b); + } + + /* Zero-length array → BSTR_OK, nothing appended */ + { + cpUcs4 pts[] = { 'X' }; + b = bfromcstr("pre"); + ck_assert(b != NULL); + ret = buAppendBlkUcs4(b, pts, 0, '?'); + ck_assert_int_eq(ret, BSTR_OK); + ck_assert_int_eq(b->slen, 3); + bdestroy(b); + } +} +END_TEST + +/* ----------------------------------------------------------------------- + * core_011: buGetBlkUTF16 — UTF-8 bstring → UTF-16 array + * ----------------------------------------------------------------------- */ +START_TEST(core_011) +{ + cpUcs2 buf[16]; + int ret; + + /* NULL arguments → BSTR_ERR */ + { + unsigned char raw[] = { 0x41 }; + bstring b = blk2bstr(raw, 1); + ck_assert(b != NULL); + ret = buGetBlkUTF16(NULL, 4, '?', b, 0); + ck_assert_int_eq(ret, BSTR_ERR); + ret = buGetBlkUTF16(buf, 0, '?', b, 0); + ck_assert_int_eq(ret, BSTR_ERR); + ret = buGetBlkUTF16(buf, 4, '?', NULL, 0); + ck_assert_int_eq(ret, BSTR_ERR); + ret = buGetBlkUTF16(buf, 4, '?', b, -1); + ck_assert_int_eq(ret, BSTR_ERR); + 
bdestroy(b); + } + + /* ASCII "AB" → UTF-16 { 0x0041, 0x0042, 0, ... } */ + { + bstring b = bfromcstr("AB"); + ck_assert(b != NULL); + memset(buf, 0xFF, sizeof(buf)); + ret = buGetBlkUTF16(buf, 4, '?', b, 0); + ck_assert_int_eq(ret, 2); + ck_assert_int_eq(buf[0], 0x0041); + ck_assert_int_eq(buf[1], 0x0042); + ck_assert_int_eq(buf[2], 0); /* null-padded */ + bdestroy(b); + } + + /* © € → UTF-16 BMP values (U+00A9, U+20AC) */ + { + unsigned char raw[] = { + 0xC2, 0xA9, + 0xE2, 0x82, 0xAC + }; + bstring b = blk2bstr(raw, sizeof(raw)); + ck_assert(b != NULL); + memset(buf, 0xFF, sizeof(buf)); + ret = buGetBlkUTF16(buf, 4, '?', b, 0); + ck_assert_int_eq(ret, 2); + ck_assert_int_eq(buf[0], 0x00A9); + ck_assert_int_eq(buf[1], 0x20AC); + bdestroy(b); + } + + /* pos=1 skips first code point */ + { + bstring b = bfromcstr("AB"); + ck_assert(b != NULL); + memset(buf, 0, sizeof(buf)); + ret = buGetBlkUTF16(buf, 4, '?', b, 1); + ck_assert_int_eq(ret, 1); + ck_assert_int_eq(buf[0], 0x0042); + bdestroy(b); + } + + /* Supplementary character 😀 (U+1F600) → surrogate pair */ + { + unsigned char raw[] = { 0xF0, 0x9F, 0x98, 0x80 }; + bstring b = blk2bstr(raw, sizeof(raw)); + ck_assert(b != NULL); + memset(buf, 0, sizeof(buf)); + ret = buGetBlkUTF16(buf, 4, '?', b, 0); + ck_assert_int_eq(ret, 2); /* one code point → two UTF-16 units */ + ck_assert_int_eq(buf[0], 0xD83D); /* high surrogate */ + ck_assert_int_eq(buf[1], 0xDE00); /* low surrogate */ + bdestroy(b); + } +} +END_TEST + +/* ----------------------------------------------------------------------- + * core_012: buAppendBlkUTF16 — UTF-16 array → UTF-8 bstring + * ----------------------------------------------------------------------- */ +START_TEST(core_012) +{ + bstring b; + int ret; + + /* NULL / bad arguments → BSTR_ERR */ + b = bfromcstr(""); + ck_assert(b != NULL); + { + cpUcs2 u[] = { 0x0041 }; + ret = buAppendBlkUTF16(NULL, u, 1, NULL, '?'); + ck_assert_int_eq(ret, BSTR_ERR); + ret = buAppendBlkUTF16(b, NULL, 1, NULL, 
'?'); + ck_assert_int_eq(ret, BSTR_ERR); + ret = buAppendBlkUTF16(b, u, -1, NULL, '?'); + ck_assert_int_eq(ret, BSTR_ERR); + } + bdestroy(b); + + /* Zero-length input → BSTR_OK, nothing appended */ + { + cpUcs2 u[] = { 0x0041 }; + b = bfromcstr("pre"); + ck_assert(b != NULL); + ret = buAppendBlkUTF16(b, u, 0, NULL, '?'); + ck_assert_int_eq(ret, BSTR_OK); + ck_assert_int_eq(b->slen, 3); + bdestroy(b); + } + + /* ASCII "AB" in UTF-16 → "AB" in UTF-8 */ + { + cpUcs2 u[] = { 0x0041, 0x0042 }; + b = bfromcstr(""); + ck_assert(b != NULL); + ret = buAppendBlkUTF16(b, u, 2, NULL, '?'); + ck_assert_int_eq(ret, BSTR_OK); + ck_assert_int_eq(b->slen, 2); + ck_assert_int_eq(b->data[0], 'A'); + ck_assert_int_eq(b->data[1], 'B'); + bdestroy(b); + } + + /* BMP characters: U+00A9 © and U+20AC € */ + { + cpUcs2 u[] = { 0x00A9, 0x20AC }; + unsigned char expected[] = { + 0xC2, 0xA9, + 0xE2, 0x82, 0xAC + }; + b = bfromcstr(""); + ck_assert(b != NULL); + ret = buAppendBlkUTF16(b, u, 2, NULL, '?'); + ck_assert_int_eq(ret, BSTR_OK); + ck_assert_int_eq(b->slen, 5); + ret = memcmp(b->data, expected, 5); + ck_assert_int_eq(ret, 0); + bdestroy(b); + } + + /* Surrogate pair: 😀 (U+1F600) = { 0xD83D, 0xDE00 } */ + { + cpUcs2 u[] = { 0xD83D, 0xDE00 }; + unsigned char expected[] = { 0xF0, 0x9F, 0x98, 0x80 }; + b = bfromcstr(""); + ck_assert(b != NULL); + ret = buAppendBlkUTF16(b, u, 2, NULL, '?'); + ck_assert_int_eq(ret, BSTR_OK); + ck_assert_int_eq(b->slen, 4); + ret = memcmp(b->data, expected, 4); + ck_assert_int_eq(ret, 0); + bdestroy(b); + } + + /* Byte-swapped BOM (0xFFFE, i.e. 0xFEFF in the opposite byte order) → the code units that follow are byte-swapped before decoding */ + { + /* 'A' (0x0041) with bytes swapped = 0x4100, preceded by the byte-swapped BOM */ + cpUcs2 u[] = { 0xFFFE, 0x4100 }; + b = bfromcstr(""); + ck_assert(b != NULL); + ret = buAppendBlkUTF16(b, u, 2, NULL, '?'); + ck_assert_int_eq(ret, BSTR_OK); + ck_assert_int_eq(b->slen, 1); + ck_assert_int_eq(b->data[0], 'A'); + bdestroy(b); + } + + /* BOM in host byte order (0xFEFF) is consumed and removed from output */ + { + cpUcs2 u[] = 
{ 0xFEFF, 0x0041 }; + b = bfromcstr(""); + ck_assert(b != NULL); + ret = buAppendBlkUTF16(b, u, 2, NULL, '?'); + ck_assert_int_eq(ret, BSTR_OK); + ck_assert_int_eq(b->slen, 1); + ck_assert_int_eq(b->data[0], 'A'); + bdestroy(b); + } + + /* Invalid low surrogate alone with valid errCh → substituted */ + { + cpUcs2 u[] = { 0xDC00 }; + b = bfromcstr(""); + ck_assert(b != NULL); + ret = buAppendBlkUTF16(b, u, 1, NULL, '?'); + ck_assert_int_eq(ret, BSTR_OK); + ck_assert_int_eq(b->slen, 1); + ck_assert_int_eq(b->data[0], '?'); + bdestroy(b); + } + + /* Invalid low surrogate then ASCII with valid errCh */ + { + cpUcs2 u[] = { 0xDC00, 0x0041 }; + unsigned char expected[] = { '?', 'A' }; + b = bfromcstr(""); + ck_assert(b != NULL); + ret = buAppendBlkUTF16(b, u, 2, NULL, '?'); + ck_assert_int_eq(ret, BSTR_OK); + ck_assert_int_eq(b->slen, 2); + ret = memcmp(b->data, expected, 2); + ck_assert_int_eq(ret, 0); + bdestroy(b); + } + + /* Invalid surrogate with invalid errCh → BSTR_ERR and rollback */ + { + cpUcs2 u[] = { 0xDC00 }; + b = bfromcstr("pre"); + ck_assert(b != NULL); + ret = buAppendBlkUTF16(b, u, 1, NULL, 0xD800); /* invalid errCh */ + ck_assert_int_eq(ret, BSTR_ERR); + ck_assert_int_eq(b->slen, 3); /* unchanged */ + ck_assert_int_eq(b->data[0], 'p'); + ck_assert_int_eq(b->data[1], 'r'); + ck_assert_int_eq(b->data[2], 'e'); + bdestroy(b); + } + + /* bom out-parameter gets set when BOM appears in stream */ + { + cpUcs2 in_bom = 0; + cpUcs2 u[] = { 0xFEFF, 0x0041 }; + b = bfromcstr(""); + ck_assert(b != NULL); + ret = buAppendBlkUTF16(b, u, 2, &in_bom, '?'); + ck_assert_int_eq(ret, BSTR_OK); + ck_assert_int_eq(in_bom, 0xFEFF); + ck_assert_int_eq(b->slen, 1); + ck_assert_int_eq(b->data[0], 'A'); + bdestroy(b); + } + + /* Pre-seeded bom controls endianness even without BOM in input */ + { + cpUcs2 in_bom = 0xFFFE; + cpUcs2 u[] = { 0x4100 }; /* bytes for 0x0041 in opposite endian */ + b = bfromcstr(""); + ck_assert(b != NULL); + ret = buAppendBlkUTF16(b, u, 1, &in_bom, 
'?'); + ck_assert_int_eq(ret, BSTR_OK); + ck_assert_int_eq(in_bom, 0xFFFE); /* preserved */ + ck_assert_int_eq(b->slen, 1); + ck_assert_int_eq(b->data[0], 'A'); + bdestroy(b); + } + + /* Larger than TEMP_UCS4_BUFFER_SIZE exercises internal flush path */ + { + cpUcs2 u[80]; + for (int j = 0; j < 80; j++) { + u[j] = (cpUcs2)('A' + (j % 26)); + } + b = bfromcstr(""); + ck_assert(b != NULL); + ret = buAppendBlkUTF16(b, u, 80, NULL, '?'); + ck_assert_int_eq(ret, BSTR_OK); + ck_assert_int_eq(b->slen, 80); + for (int j = 0; j < 80; j++) { + ck_assert_int_eq(b->data[j], 'A' + (j % 26)); + } + bdestroy(b); + } +} +END_TEST + +/* ----------------------------------------------------------------------- + * core_013: buAppendBlkUTF16 — regression guard for an unpaired high surrogate + * + * Guard against regressions for: + * high surrogate followed by non-low surrogate. + * + * Expected behavior is: + * first code unit substituted with errCh, second processed normally. + * ----------------------------------------------------------------------- */ +START_TEST(core_013) +{ + bstring b; + int ret; + + { + cpUcs2 u[] = { 0xD83D, 0x0041 }; + unsigned char expected[] = { '?', 'A' }; + b = bfromcstr(""); + ck_assert(b != NULL); + ret = buAppendBlkUTF16(b, u, 2, NULL, '?'); + ck_assert_int_eq(ret, BSTR_OK); + ck_assert_int_eq(b->slen, 2); + ret = memcmp(b->data, expected, 2); + ck_assert_int_eq(ret, 0); + bdestroy(b); + } +} +END_TEST + +/* ----------------------------------------------------------------------- + * core_014: utf8ScanBackwardsForCodePoint — invalid continuation bytes + * + * Each case starts at a lead byte but includes one or more non-continuation + * trailing bytes. Scanner must reject these and return an error. 
+ * ----------------------------------------------------------------------- */ +START_TEST(core_014) +{ + cpUcs4 out = 0; + int ret; + + /* Invalid 2-byte sequence: second byte (0x41) must be 10xxxxxx */ + { + unsigned char data[] = { 0xC2, 0x41 }; + ret = utf8ScanBackwardsForCodePoint(data, 2, 0, &out); + ck_assert(ret < 0); + } + + /* Invalid 3-byte sequence: middle byte (0x28) must be 10xxxxxx */ + { + unsigned char data[] = { 0xE2, 0x28, 0xAC }; + ret = utf8ScanBackwardsForCodePoint(data, 3, 0, &out); + ck_assert(ret < 0); + } + + /* Invalid 4-byte sequence: third byte (0x41) must be 10xxxxxx */ + { + unsigned char data[] = { 0xF0, 0x9F, 0x41, 0x80 }; + ret = utf8ScanBackwardsForCodePoint(data, 4, 0, &out); + ck_assert(ret < 0); + } +} +END_TEST + +int +main(void) +{ + /* Build test suite */ + Suite *suite = suite_create("bstr-utf8"); + /* Core tests: register every core_NNN case defined above in one test case */ + TCase *core = tcase_create("Core"); + tcase_add_test(core, core_000); + tcase_add_test(core, core_001); + tcase_add_test(core, core_002); + tcase_add_test(core, core_003); + tcase_add_test(core, core_004); + tcase_add_test(core, core_005); + tcase_add_test(core, core_006); + tcase_add_test(core, core_007); + tcase_add_test(core, core_008); + tcase_add_test(core, core_009); + tcase_add_test(core, core_010); + tcase_add_test(core, core_011); + tcase_add_test(core, core_012); + tcase_add_test(core, core_013); + tcase_add_test(core, core_014); + suite_add_tcase(suite, core); + /* Run tests; CK_ENV takes the output verbosity from the CK_VERBOSITY environment variable. The process exit status reflects whether any test failed. */ + SRunner *runner = srunner_create(suite); + srunner_run_all(runner, CK_ENV); + int number_failed = srunner_ntests_failed(runner); + srunner_free(runner); + return (0 == number_failed) ? EXIT_SUCCESS : EXIT_FAILURE; +}