A text library.
Usage
Character Encoding Conversion
C++
#include <cassert>
#include <string>
namespace usage_tetengo::text
{
// User-defined literal suffix that converts an integer literal to a char8_t
// value with static_cast, so that a UTF-8 code unit can be written as
// e.g. 0xF0_u8c.
constexpr char8_t operator""_u8c(unsigned long long value)
{
return static_cast<char8_t>(value);
}
void encoding()
{
static const std::u8string utf8{
0xF0_u8c, 0x9F_u8c, 0x9A_u8c, 0x85_u8c,
0xF0_u8c, 0x9F_u8c, 0x90_u8c, 0xBB_u8c,
0xE2_u8c, 0x80_u8c, 0x8D_u8c,
0xE2_u8c, 0x9D_u8c, 0x84_u8c,
0xEF_u8c, 0xB8_u8c, 0x8F_u8c,
0xE2_u8c, 0x9D_u8c, 0x84_u8c,
};
static const std::u16string utf16{
0xD83D, 0xDE85,
0xD83D, 0xDC3B,
0x200D,
0x2744,
0xFE0F,
0x2744,
};
const auto internal_encoding = utf8_encoder_type::instance().
decode(utf8);
const auto encoded_utf16 = utf16_encoder_type::instance().
encode(internal_encoding);
assert(encoded_utf16 == utf16);
}
}
An encoder.
Definition encoder.hpp:37
string_type decode(const encoded_string_view_type &encoded_string) const
Decodes a string.
Definition encoder.hpp:100
encoded_string_type encode(const string_view_type &string_) const
Encodes a string.
Definition encoder.hpp:88
C
#include <assert.h>
#include <stddef.h>
#include <stdlib.h>
static int equal(const unsigned short* string1, const unsigned short* string2);
/*
 * Usage sample for the C encoder API: allocates a buffer for the decoded form
 * of a UTF-8 string and a buffer for its UTF-16 form, then asserts that the
 * UTF-16 buffer equals the expected code units.
 *
 * NOTE(review): several statements of this function are not present in this
 * excerpt (see the notes below), so it is incomplete as shown.
 */
void usage_tetengo_text_encoding()
{
/* Zero-terminated UTF-8 input bytes (U+1F685, U+1F43B, U+200D, U+2744,
   U+FE0F, U+2744). Each constant is cast to unsigned char and then to char;
   presumably to avoid out-of-range constant warnings where char is signed. */
static const char utf8[] = {
(char)(unsigned char)0xF0, (char)(unsigned char)0x9F, (char)(unsigned char)0x9A, (char)(unsigned char)0x85,
(char)(unsigned char)0xF0, (char)(unsigned char)0x9F, (char)(unsigned char)0x90, (char)(unsigned char)0xBB,
(char)(unsigned char)0xE2, (char)(unsigned char)0x80, (char)(unsigned char)0x8D,
(char)(unsigned char)0xE2, (char)(unsigned char)0x9D, (char)(unsigned char)0x84,
(char)(unsigned char)0xEF, (char)(unsigned char)0xB8, (char)(unsigned char)0x8F,
(char)(unsigned char)0xE2, (char)(unsigned char)0x9D, (char)(unsigned char)0x84,
0x00,
};
/* The same code points as zero-terminated UTF-16 code units; this is the
   expected result of the conversion. */
static const unsigned short utf16[] = {
0xD83D, 0xDE85,
0xD83D, 0xDC3B,
0x200D,
0x2744,
0xFE0F,
0x2744,
0x0000,
};
/* NOTE(review): the initializer of p_utf8_encoder and the declaration of
   internal_encoding_length are missing from this excerpt. */
const tetengo_text_encoder_t* const p_utf8_encoder =
/* One extra element is allocated beyond internal_encoding_length; presumably
   for the terminating zero -- TODO confirm. */
char* const internal_encoding = (char*)malloc((internal_encoding_length + 1) * sizeof(char));
if (internal_encoding)
{
/* NOTE(review): the body is empty in this excerpt; presumably the call that
   fills internal_encoding belongs here. */
}
/* NOTE(review): the initializer of p_utf16_encoder and the declaration of
   encoded_utf16_length are missing from this excerpt. */
const tetengo_text_encoder_t* const p_utf16_encoder =
unsigned short* const encoded_utf16 = (unsigned short*)malloc((encoded_utf16_length + 1) * sizeof(unsigned short));
if (encoded_utf16)
{
/* NOTE(review): the body is empty in this excerpt; presumably the call that
   fills encoded_utf16 belongs here. */
}
/* NOTE(review): encoded_utf16 is NULL here if malloc() failed; confirm that
   equal() tolerates a NULL argument. */
assert(equal(encoded_utf16, utf16));
/* Both buffers are released unconditionally; free() accepts NULL. */
free(encoded_utf16);
free((void*)internal_encoding);
}
/*
 * Compares two zero-terminated sequences of UTF-16 code units.
 *
 * string1, string2: zero-terminated arrays of unsigned short. Either may be
 *                   NULL, e.g. when the caller's allocation failed.
 *
 * Returns 1 when both sequences hold the same code units up to and including
 * the terminating zero, and 0 otherwise. A NULL argument never compares
 * equal, so that a failed allocation makes the caller's assertion fail
 * instead of dereferencing a null pointer.
 */
static int equal(const unsigned short* const string1, const unsigned short* const string2)
{
    if (!string1 || !string2)
    {
        return 0;
    }

    for (size_t i = 0;; ++i)
    {
        if (string1[i] != string2[i])
        {
            return 0;
        }
        /* The code units are equal here, so one check covers both terminators. */
        if (string1[i] == 0U)
        {
            return 1;
        }
    }
}
size_t tetengo_text_encoder_encode(const tetengo_text_encoder_t *p_encoder, const char *string, char *encoded_string, size_t encoded_string_capacity)
Encodes a string.
@ tetengo_text_encoder_encoding_utf16
Definition encoder.h:26
@ tetengo_text_encoder_encoding_utf8
Definition encoder.h:25
const tetengo_text_encoder_t * tetengo_text_encoder_instance(tetengo_text_encoder_encoding_t encoding)
Returns the encoder instance.
size_t tetengo_text_encoder_decode(const tetengo_text_encoder_t *p_encoder, const char *encoded_string, char *string, size_t string_capacity)
Decodes a string.
Grapheme Splitting
C++
#include <cassert>
#include <iterator>
#include <locale>
#include <memory>
#include <optional>
#include <stdexcept>
#include <string>
#include <vector>
namespace usage_tetengo::text
{
// User-defined literal suffix that converts an integer literal to a char with
// static_cast, so that a raw byte can be written as e.g. 0xF0_c.
//
// The suffix follows the "" token directly: the form with whitespace in
// between (operator"" _c) is deprecated in recent C++ standards, and the
// unspaced form is accepted by every standard that has user-defined literals.
constexpr char operator""_c(unsigned long long value)
{
    return static_cast<char>(value);
}
std::optional<std::locale> japanese_locale();
std::optional<std::locale> english_locale();
// Usage sample: splits a UTF-8 string into graphemes, once when the Japanese
// locale is available and once when the English locale is available, and
// asserts the offset and width of every grapheme.
//
// NOTE(review): the declaration of `gs` is not present in this excerpt;
// presumably a grapheme splitter constructed from *o_locale -- TODO confirm.
void grapheme_split()
{
// 20 bytes of UTF-8: U+1F685, U+1F43B, U+200D, U+2744, U+FE0F, U+2744.
static const std::string string_{
0xF0_c, 0x9F_c, 0x9A_c, 0x85_c,
0xF0_c, 0x9F_c, 0x90_c, 0xBB_c,
0xE2_c, 0x80_c, 0x8D_c,
0xE2_c, 0x9D_c, 0x84_c,
0xEF_c, 0xB8_c, 0x8F_c,
0xE2_c, 0x9D_c, 0x84_c,
};
{
// The block is skipped when the Japanese locale cannot be created.
auto o_locale = japanese_locale();
if (o_locale)
{
const auto graphemes = gs.
split(string_);
// The offsets 0, 4 and 17 are byte positions in string_. The last entry
// sits at offset 20, the byte length of string_, with width 0; presumably
// an end marker -- TODO confirm.
assert(std::size(graphemes) == 4U);
assert(graphemes[0].offset() == 0U && graphemes[0].width() == 2U);
assert(graphemes[1].offset() == 4U && graphemes[1].width() == 2U);
// Width 2 is expected here, whereas the English block below expects 1.
assert(graphemes[2].offset() == 17U && graphemes[2].width() == 2U);
assert(graphemes[3].offset() == 20U && graphemes[3].width() == 0U);
}
}
{
// The block is skipped when the English locale cannot be created.
auto o_locale = english_locale();
if (o_locale)
{
const auto graphemes = gs.
split(string_);
assert(std::size(graphemes) == 4U);
assert(graphemes[0].offset() == 0U && graphemes[0].width() == 2U);
assert(graphemes[1].offset() == 4U && graphemes[1].width() == 2U);
// Width 1 is expected here, whereas the Japanese block above expects 2.
assert(graphemes[2].offset() == 17U && graphemes[2].width() == 1U);
assert(graphemes[3].offset() == 20U && graphemes[3].width() == 0U);
}
}
}
// Creates the Japanese locale under its platform-specific name.
//
// Returns the locale, or std::nullopt when the construction of the
// std::locale throws std::runtime_error.
std::optional<std::locale> japanese_locale()
{
#if defined(_WIN32)
    constexpr const char* locale_name = "Japanese_Japan.932";
#else
    constexpr const char* locale_name = "ja_JP.UTF-8";
#endif

    try
    {
        // The locale is constructed in place inside the optional.
        return std::optional<std::locale>(std::in_place, locale_name);
    }
    catch (const std::runtime_error&)
    {
        return std::nullopt;
    }
}
// Creates the English (United States) locale under its platform-specific
// name.
//
// Returns the locale, or std::nullopt when the construction of the
// std::locale throws std::runtime_error.
std::optional<std::locale> english_locale()
{
#if defined(_WIN32)
    constexpr const char* locale_name = "English_United States.1252";
#else
    constexpr const char* locale_name = "en_US.UTF-8";
#endif

    try
    {
        // The locale is constructed in place inside the optional.
        return std::optional<std::locale>(std::in_place, locale_name);
    }
    catch (const std::runtime_error&)
    {
        return std::nullopt;
    }
}
}
A grapheme splitter.
Definition grapheme_splitter.hpp:89
std::vector< grapheme > split(const std::string_view &string_) const
Splits a string into graphemes.
C
#include <assert.h>
#include <locale.h>
#include <stdlib.h>
#include <string.h>
static void save_current_locale(int category, char* storage, size_t storage_capacity);
static const char* japanese_locale_name();
static const char* english_locale_name();
/*
 * Usage sample for the C grapheme splitter API: switches LC_CTYPE to the
 * Japanese locale and then to the English locale, and in each case asserts
 * the offset and width of every grapheme of a UTF-8 string.
 *
 * NOTE(review): the statements that declare and fill grapheme_count and
 * p_graphemes are not present in this excerpt, so the function is incomplete
 * as shown.
 */
void usage_tetengo_text_graphemeSplit()
{
/* Zero-terminated UTF-8 input: 20 bytes (U+1F685, U+1F43B, U+200D, U+2744,
   U+FE0F, U+2744) followed by 0x00. */
static const char string[] = {
(char)(unsigned char)0xF0, (char)(unsigned char)0x9F, (char)(unsigned char)0x9A, (char)(unsigned char)0x85,
(char)(unsigned char)0xF0, (char)(unsigned char)0x9F, (char)(unsigned char)0x90, (char)(unsigned char)0xBB,
(char)(unsigned char)0xE2, (char)(unsigned char)0x80, (char)(unsigned char)0x8D,
(char)(unsigned char)0xE2, (char)(unsigned char)0x9D, (char)(unsigned char)0x84,
(char)(unsigned char)0xEF, (char)(unsigned char)0xB8, (char)(unsigned char)0x8F,
(char)(unsigned char)0xE2, (char)(unsigned char)0x9D, (char)(unsigned char)0x84,
0x00,
};
{
/* The current LC_CTYPE locale name is saved so that it can be restored. The
   buffer is zero-initialized. */
char initial_locale[256] = { '\0' };
save_current_locale(LC_CTYPE, initial_locale, 256);
/* The block is skipped when the Japanese locale cannot be set. */
if (setlocale(LC_CTYPE, japanese_locale_name()))
{
/* NOTE(review): grapheme_count and p_graphemes are not declared in this
   excerpt; presumably they come from tetengo_text_graphemeSplitter_split(). */
assert(grapheme_count == 4U);
if (p_graphemes)
{
/* The offsets 0, 4 and 17 are byte positions in string. The last entry sits
   at offset 20, the byte length of the text, with width 0. */
assert(p_graphemes[0].offset == 0U && p_graphemes[0].width == 2U);
assert(p_graphemes[1].offset == 4U && p_graphemes[1].width == 2U);
/* Width 2 is expected here, whereas the English block below expects 1. */
assert(p_graphemes[2].offset == 17U && p_graphemes[2].width == 2U);
assert(p_graphemes[3].offset == 20U && p_graphemes[3].width == 0U);
free((void*)p_graphemes);
}
/* The saved locale is restored only on this path, where it was changed. */
setlocale(LC_CTYPE, initial_locale);
}
}
{
char initial_locale[256] = { '\0' };
save_current_locale(LC_CTYPE, initial_locale, 256);
/* The block is skipped when the English locale cannot be set. */
if (setlocale(LC_CTYPE, english_locale_name()))
{
/* NOTE(review): grapheme_count and p_graphemes are not declared in this
   excerpt either. */
assert(grapheme_count == 4U);
if (p_graphemes)
{
assert(p_graphemes[0].offset == 0U && p_graphemes[0].width == 2U);
assert(p_graphemes[1].offset == 4U && p_graphemes[1].width == 2U);
/* Width 1 is expected here, whereas the Japanese block above expects 2. */
assert(p_graphemes[2].offset == 17U && p_graphemes[2].width == 1U);
assert(p_graphemes[3].offset == 20U && p_graphemes[3].width == 0U);
free((void*)p_graphemes);
}
setlocale(LC_CTYPE, initial_locale);
}
}
}
/*
 * Copies the name of the current locale of a category into a caller buffer.
 *
 * category:         locale category to query, e.g. LC_CTYPE.
 * storage:          destination buffer. Nothing is written when it is NULL.
 * storage_capacity: size of storage in bytes, including the room for the
 *                   terminating '\0'. Nothing is written when it is 0.
 *
 * The name "C" is stored when setlocale() cannot report the current locale.
 * The result is always '\0'-terminated; a name that does not fit is
 * truncated to storage_capacity - 1 characters.
 */
static void save_current_locale(const int category, char* const storage, const size_t storage_capacity)
{
    /* With a zero capacity, storage_capacity - 1 would wrap around to
       SIZE_MAX and let strncpy() write past the buffer. */
    if (!storage || storage_capacity == 0)
    {
        return;
    }

    {
        /* A NULL locale argument queries the current locale without changing it. */
        const char* const queried_locale = setlocale(category, NULL);
        const char* const current_locale = queried_locale ? queried_locale : "C";

        strncpy(storage, current_locale, storage_capacity - 1);
        /* strncpy() writes no terminator when the source fills the whole
           count, so it is written explicitly. */
        storage[storage_capacity - 1] = '\0';
    }
}
/*
 * Returns the name of the Japanese locale to pass to setlocale():
 * "Japanese_Japan.932" when _WIN32 is defined, "ja_JP.UTF-8" otherwise.
 */
static const char* japanese_locale_name()
{
#if defined(_WIN32)
    const char* const locale_name = "Japanese_Japan.932";
#else
    const char* const locale_name = "ja_JP.UTF-8";
#endif

    return locale_name;
}
/*
 * Returns the name of the English (United States) locale to pass to
 * setlocale(): "English_United States.1252" when _WIN32 is defined,
 * "en_US.UTF-8" otherwise.
 */
static const char* english_locale_name()
{
#if defined(_WIN32)
    const char* const locale_name = "English_United States.1252";
#else
    const char* const locale_name = "en_US.UTF-8";
#endif

    return locale_name;
}
size_t tetengo_text_graphemeSplitter_split(const tetengo_text_graphemeSplitter_t *p_grapheme_splitter, const char *string, tetengo_text_grapheme_t *p_graphemes)
Splits a string into graphemes.
tetengo_text_graphemeSplitter_t * tetengo_text_graphemeSplitter_create()
Creates a grapheme splitter.
void tetengo_text_graphemeSplitter_destroy(const tetengo_text_graphemeSplitter_t *p_grapheme_splitter)
Destroys a grapheme splitter.
A grapheme.
Definition graphemeSplitter.h:26