A text library.
Usage
Character Encoding Conversion
C++
#include <cassert>
#include <string>
namespace usage_tetengo::text
{
// User-defined literal suffix that turns an integer literal into a single
// char8_t code unit, e.g. 0xF0_u8c.
constexpr char8_t operator""_u8c(const unsigned long long code_unit)
{
    const auto narrowed = static_cast<char8_t>(code_unit);
    return narrowed;
}
// Usage example: decodes a UTF-8 string into the library's internal encoding,
// re-encodes it as UTF-16, and asserts that the result matches the expected
// UTF-16 code units.
void encoding()
{
    // UTF-8 input; the trailing comments give the code point of each sequence.
    static const std::u8string utf8{
        0xF0_u8c, 0x9F_u8c, 0x9A_u8c, 0x85_u8c, // U+1F685
        0xF0_u8c, 0x9F_u8c, 0x90_u8c, 0xBB_u8c, // U+1F43B
        0xE2_u8c, 0x80_u8c, 0x8D_u8c, // U+200D
        0xE2_u8c, 0x9D_u8c, 0x84_u8c, // U+2744
        0xEF_u8c, 0xB8_u8c, 0x8F_u8c, // U+FE0F
        0xE2_u8c, 0x9D_u8c, 0x84_u8c, // U+2744
    };

    // The same code points as UTF-16 code units (the first two are surrogate pairs).
    static const std::u16string utf16{
        0xD83D, 0xDE85, // U+1F685
        0xD83D, 0xDC3B, // U+1F43B
        0x200D, 0x2744, 0xFE0F, 0x2744,
    };

    // Step 1: UTF-8 -> internal encoding.
    const auto decoded = utf8_encoder_type::instance().decode(utf8);

    // Step 2: internal encoding -> UTF-16.
    const auto converted = utf16_encoder_type::instance().encode(decoded);

    assert(converted == utf16);
}
}
C
#include <assert.h>
#include <stddef.h>
#include <stdlib.h>
/* Compares two zero-terminated unsigned short strings; the definition follows usage_tetengo_text_encoding(). */
static int equal(const unsigned short* string1, const unsigned short* string2);
/*
    Usage example (C API): converts a UTF-8 string to UTF-16 by way of an
    intermediate "internal encoding" buffer and checks the result with equal().

    NOTE(review): this listing looks abridged. The initializers of
    p_utf8_encoder and p_utf16_encoder, the declarations of
    internal_encoding_length and encoded_utf16_length, and the bodies of both
    if blocks are not visible here, so the snippet does not compile as shown.
    Restore the missing lines from the library's original sample before use.
*/
void usage_tetengo_text_encoding()
{
/* UTF-8 input bytes, terminated by 0x00. */
static const char utf8[] = {
(char)(unsigned char)0xF0, (char)(unsigned char)0x9F, (char)(unsigned char)0x9A, (char)(unsigned char)0x85,
(char)(unsigned char)0xF0, (char)(unsigned char)0x9F, (char)(unsigned char)0x90, (char)(unsigned char)0xBB,
(char)(unsigned char)0xE2, (char)(unsigned char)0x80, (char)(unsigned char)0x8D,
(char)(unsigned char)0xE2, (char)(unsigned char)0x9D, (char)(unsigned char)0x84,
(char)(unsigned char)0xEF, (char)(unsigned char)0xB8, (char)(unsigned char)0x8F,
(char)(unsigned char)0xE2, (char)(unsigned char)0x9D, (char)(unsigned char)0x84,
0x00,
};
/* Expected UTF-16 code units, terminated by 0x0000. */
static const unsigned short utf16[] = {
0xD83D, 0xDE85,
0xD83D, 0xDC3B,
0x200D,
0x2744,
0xFE0F,
0x2744,
0x0000,
};
/* NOTE(review): the right-hand side of this initializer is missing from the listing. */
const tetengo_text_encoder_t* const p_utf8_encoder =
/* Buffer for the internal encoding: one extra element beyond internal_encoding_length. */
char* const internal_encoding = (char*)malloc((internal_encoding_length + 1) * sizeof(char));
if (internal_encoding)
{
/* NOTE(review): empty as shown; presumably the decode call belongs here - confirm. */
}
/* NOTE(review): the right-hand side of this initializer is missing from the listing. */
const tetengo_text_encoder_t* const p_utf16_encoder =
/* Buffer for the UTF-16 output: one extra element beyond encoded_utf16_length. */
unsigned short* const encoded_utf16 = (unsigned short*)malloc((encoded_utf16_length + 1) * sizeof(unsigned short));
if (encoded_utf16)
{
/* NOTE(review): empty as shown; presumably the encode call belongs here - confirm. */
}
/* This comparison runs even when malloc() returned NULL for encoded_utf16. */
assert(equal(encoded_utf16, utf16));
/* Release both heap buffers. */
free(encoded_utf16);
free((void*)internal_encoding);
}
/*
    Tests whether two zero-terminated unsigned short strings are identical.

    The strings are compared element by element up to and including the
    terminating zero.

    A NULL argument is tolerated, because the caller passes a malloc() result
    without checking it first: a NULL pointer is equal only to another NULL
    pointer.

    string1: The first string. May be NULL.
    string2: The second string. May be NULL.

    Returns 1 when the strings are equal, 0 otherwise.
*/
static int equal(const unsigned short* const string1, const unsigned short* const string2)
{
    if (!string1 || !string2)
    {
        /* Never dereference NULL; report a mismatch unless both are NULL. */
        return string1 == string2;
    }

    for (size_t i = 0;; ++i)
    {
        const unsigned short c1 = string1[i];
        const unsigned short c2 = string2[i];
        if (c1 != c2)
        {
            return 0;
        }
        if (c1 == 0U)
        {
            /* Both strings ended at the same position without a mismatch. */
            return 1;
        }
    }
}
Grapheme Splitting
C++
#include <cassert>
#include <iterator>
#include <locale>
#include <memory>
#include <optional>
#include <stdexcept>
#include <string>
#include <vector>
namespace usage_tetengo::text
{
// User-defined literal suffix that turns an integer literal into a char, so
// that raw byte values can be written as 0xF0_c inside a std::string
// initializer. The value is narrowed with static_cast.
//
// Declared as operator""_c (no space before the suffix): the form with
// whitespace, operator"" _c, is deprecated since C++23.
constexpr char operator""_c(unsigned long long value)
{
    return static_cast<char>(value);
}
// Forward declarations of the locale helpers defined after grapheme_split().
// Each returns the requested locale, or std::nullopt when it cannot be created.
std::optional<std::locale> japanese_locale();
std::optional<std::locale> english_locale();
// Usage example: splits a UTF-8 string into graphemes, once for a Japanese
// locale and once for an English locale, and asserts the offset and width of
// each resulting grapheme.
//
// NOTE(review): this listing looks abridged. `gs` is never declared in the
// visible code and o_locale is only tested, never passed on; presumably a
// line constructing the splitter from *o_locale was dropped - confirm
// against the library's original sample.
void grapheme_split()
{
// 20 bytes of UTF-8: U+1F685 (bytes 0-3), U+1F43B (4-7), U+200D (8-10),
// U+2744 (11-13), U+FE0F (14-16), U+2744 (17-19).
static const std::string string_{
0xF0_c, 0x9F_c, 0x9A_c, 0x85_c,
0xF0_c, 0x9F_c, 0x90_c, 0xBB_c,
0xE2_c, 0x80_c, 0x8D_c,
0xE2_c, 0x9D_c, 0x84_c,
0xEF_c, 0xB8_c, 0x8F_c,
0xE2_c, 0x9D_c, 0x84_c,
};
{
// Japanese locale; the whole case is skipped when the locale is unavailable.
auto o_locale = japanese_locale();
if (o_locale)
{
const auto graphemes = gs.
split(string_);
// Four entries are expected; the offsets are byte positions in string_,
// and the last entry sits at offset 20 (the byte length) with width 0.
assert(std::size(graphemes) == 4U);
assert(graphemes[0].offset() == 0U && graphemes[0].width() == 2U);
assert(graphemes[1].offset() == 4U && graphemes[1].width() == 2U);
// The final U+2744 is expected to have width 2 here (1 in the English case below).
assert(graphemes[2].offset() == 17U && graphemes[2].width() == 2U);
assert(graphemes[3].offset() == 20U && graphemes[3].width() == 0U);
}
}
{
// English locale; the whole case is skipped when the locale is unavailable.
auto o_locale = english_locale();
if (o_locale)
{
const auto graphemes = gs.
split(string_);
// Same offsets as in the Japanese case; only the width of graphemes[2] differs.
assert(std::size(graphemes) == 4U);
assert(graphemes[0].offset() == 0U && graphemes[0].width() == 2U);
assert(graphemes[1].offset() == 4U && graphemes[1].width() == 2U);
assert(graphemes[2].offset() == 17U && graphemes[2].width() == 1U);
assert(graphemes[3].offset() == 20U && graphemes[3].width() == 0U);
}
}
}
// Creates the Japanese locale under its platform-specific name.
// Returns std::nullopt when constructing the locale throws std::runtime_error.
std::optional<std::locale> japanese_locale()
{
#if defined(_WIN32)
    const char* const locale_name = "Japanese_Japan.932";
#else
    const char* const locale_name = "ja_JP.UTF-8";
#endif

    try
    {
        // Construct the locale directly inside the optional.
        return std::optional<std::locale>(std::in_place, locale_name);
    }
    catch (const std::runtime_error&)
    {
        return std::nullopt;
    }
}
// Creates the US English locale under its platform-specific name.
// Returns std::nullopt when constructing the locale throws std::runtime_error.
std::optional<std::locale> english_locale()
{
#if defined(_WIN32)
    const char* const locale_name = "English_United States.1252";
#else
    const char* const locale_name = "en_US.UTF-8";
#endif

    try
    {
        // Construct the locale directly inside the optional.
        return std::optional<std::locale>(std::in_place, locale_name);
    }
    catch (const std::runtime_error&)
    {
        return std::nullopt;
    }
}
}
C
#include <assert.h>
#include <locale.h>
#include <stdlib.h>
#include <string.h>
/* Forward declarations of the helpers defined after usage_tetengo_text_graphemeSplit(). */
/* Copies the current locale name for a category into storage. */
static void save_current_locale(int category, char* storage, size_t storage_capacity);
/* Return the platform-specific locale names passed to setlocale(). */
static const char* japanese_locale_name();
static const char* english_locale_name();
/*
    Usage example (C API): checks the graphemes of a UTF-8 string under a
    Japanese and an English LC_CTYPE locale, restoring the previous locale
    after each case.

    NOTE(review): this listing looks abridged. grapheme_count and p_graphemes
    are never declared in the visible code, and `string` is never used;
    presumably the calls that split `string` were dropped - confirm against
    the library's original sample.
*/
void usage_tetengo_text_graphemeSplit()
{
/* 20 bytes of UTF-8 followed by a terminating 0x00. */
static const char string[] = {
(char)(unsigned char)0xF0, (char)(unsigned char)0x9F, (char)(unsigned char)0x9A, (char)(unsigned char)0x85,
(char)(unsigned char)0xF0, (char)(unsigned char)0x9F, (char)(unsigned char)0x90, (char)(unsigned char)0xBB,
(char)(unsigned char)0xE2, (char)(unsigned char)0x80, (char)(unsigned char)0x8D,
(char)(unsigned char)0xE2, (char)(unsigned char)0x9D, (char)(unsigned char)0x84,
(char)(unsigned char)0xEF, (char)(unsigned char)0xB8, (char)(unsigned char)0x8F,
(char)(unsigned char)0xE2, (char)(unsigned char)0x9D, (char)(unsigned char)0x84,
0x00,
};
{
/* Remember the current LC_CTYPE locale so it can be restored below. */
char initial_locale[256] = { '\0' };
save_current_locale(LC_CTYPE, initial_locale, 256);
/* The case is skipped when the Japanese locale cannot be selected. */
if (setlocale(LC_CTYPE, japanese_locale_name()))
{
assert(grapheme_count == 4U);
if (p_graphemes)
{
/* The last entry is expected at offset 20 with width 0. */
assert(p_graphemes[0].offset == 0U && p_graphemes[0].width == 2U);
assert(p_graphemes[1].offset == 4U && p_graphemes[1].width == 2U);
/* Width 2 is expected here (1 in the English case below). */
assert(p_graphemes[2].offset == 17U && p_graphemes[2].width == 2U);
assert(p_graphemes[3].offset == 20U && p_graphemes[3].width == 0U);
free((void*)p_graphemes);
}
/* Restore the locale that was active before this case. */
setlocale(LC_CTYPE, initial_locale);
}
}
{
/* Remember the current LC_CTYPE locale so it can be restored below. */
char initial_locale[256] = { '\0' };
save_current_locale(LC_CTYPE, initial_locale, 256);
/* The case is skipped when the English locale cannot be selected. */
if (setlocale(LC_CTYPE, english_locale_name()))
{
assert(grapheme_count == 4U);
if (p_graphemes)
{
/* Same offsets as in the Japanese case; only the width of entry 2 differs. */
assert(p_graphemes[0].offset == 0U && p_graphemes[0].width == 2U);
assert(p_graphemes[1].offset == 4U && p_graphemes[1].width == 2U);
assert(p_graphemes[2].offset == 17U && p_graphemes[2].width == 1U);
assert(p_graphemes[3].offset == 20U && p_graphemes[3].width == 0U);
free((void*)p_graphemes);
}
/* Restore the locale that was active before this case. */
setlocale(LC_CTYPE, initial_locale);
}
}
}
/*
    Copies the name of the current locale for a category into a caller buffer.

    The stored name is always NUL-terminated and is truncated when it does not
    fit. When the current locale cannot be queried, "C" is stored instead.
    Nothing is written when storage is NULL or storage_capacity is 0.

    category:         An LC_* category constant, passed on to setlocale().
    storage:          The destination buffer.
    storage_capacity: The size of storage in bytes, terminator included.
*/
static void save_current_locale(const int category, char* const storage, const size_t storage_capacity)
{
    if (!storage || storage_capacity == 0)
    {
        /* A zero capacity would make storage_capacity - 1 wrap around to SIZE_MAX. */
        return;
    }

    /* A NULL locale argument makes setlocale() return the current setting. */
    const char* const current_locale = setlocale(category, NULL);
    const char* const name = current_locale ? current_locale : "C";

    strncpy(storage, name, storage_capacity - 1);
    /* strncpy() adds no terminator when the name fills the copied range. */
    storage[storage_capacity - 1] = '\0';
}
/* Returns the platform-specific name of the Japanese locale for setlocale(). */
static const char* japanese_locale_name()
{
    const char* name;
#if defined(_WIN32)
    name = "Japanese_Japan.932";
#else
    name = "ja_JP.UTF-8";
#endif
    return name;
}
/* Returns the platform-specific name of the US English locale for setlocale(). */
static const char* english_locale_name()
{
    const char* name;
#if defined(_WIN32)
    name = "English_United States.1252";
#else
    name = "en_US.UTF-8";
#endif
    return name;
}