tetengo 1.8.1
A multipurpose library set
Loading...
Searching...
No Matches
tetengo::text Namespace Reference

A text library. More...

Namespaces

namespace  encoding
 String encodings.
 
namespace  grapheme_splitting
 A grapheme splitting library.
 

Classes

class  encoder
 An encoder. More...
 
class  encoder_base
 An encoder base. More...
 
class  grapheme
 A grapheme. More...
 
class  grapheme_splitter
 A grapheme splitter. More...
 

Detailed Description

A text library.

Usage

Character Encoding Conversion

C++

#include <cassert>
#include <string>
#include <tetengo/text/encoder.hpp>
#include <tetengo/text/encoding/utf16.hpp> // IWYU pragma: keep
#include <tetengo/text/encoding/utf8.hpp> // IWYU pragma: keep
namespace usage_tetengo::text
{
// Literal suffix that turns an integer literal into a single UTF-8 code unit,
// so that byte values such as 0xF0_u8c can be listed in a std::u8string initializer.
constexpr char8_t operator""_u8c(unsigned long long value)
{
    const auto code_unit = static_cast<char8_t>(value);
    return code_unit;
}
void encoding()
{
static const std::u8string utf8{
// clang-format off
// HIGH-SPEED TRAIN WITH BULLET NOSE
0xF0_u8c, 0x9F_u8c, 0x9A_u8c, 0x85_u8c, // U+1F685
// POLAR BEAR
0xF0_u8c, 0x9F_u8c, 0x90_u8c, 0xBB_u8c, // U+1F43B
0xE2_u8c, 0x80_u8c, 0x8D_u8c, // U+200D
0xE2_u8c, 0x9D_u8c, 0x84_u8c, // U+2744
0xEF_u8c, 0xB8_u8c, 0x8F_u8c, // U+FE0F
// SNOWFLAKE
0xE2_u8c, 0x9D_u8c, 0x84_u8c, // U+2744
// clang-format on
};
static const std::u16string utf16{
// clang-format off
// HIGH-SPEED TRAIN WITH BULLET NOSE
0xD83D, 0xDE85, // U+1F685
// POLAR BEAR
0xD83D, 0xDC3B, // U+1F43B
0x200D, // U+200D
0x2744, // U+2744
0xFE0F, // U+FE0F
// SNOWFLAKE
0x2744, // U+2744
// clang-format on
};
// Decodes UTF-8 to the internal encoding.
const auto internal_encoding = utf8_encoder_type::instance().decode(utf8);
// Encodes the internal encoding to UTF-16.
const auto encoded_utf16 = utf16_encoder_type::instance().encode(internal_encoding);
assert(encoded_utf16 == utf16);
}
}
An encoder.
Definition encoder.hpp:37
string_type decode(const encoded_string_view_type &encoded_string) const
Decodes a string.
Definition encoder.hpp:100
encoded_string_type encode(const string_view_type &string_) const
Encodes a string.
Definition encoder.hpp:88
An encoder.
A UTF-16 encoding.
A UTF-8 encoding.

C

#include <assert.h>
#include <stddef.h>
#include <stdlib.h>
// Compares two zero-terminated arrays of 16-bit code units.
// Returns 1 when both hold the same sequence up to and including the terminator, 0 otherwise.
static int equal(const unsigned short* string1, const unsigned short* string2);
void usage_tetengo_text_encoding()
{
static const char utf8[] = {
// clang-format off
// HIGH-SPEED TRAIN WITH BULLET NOSE
(char)(unsigned char)0xF0, (char)(unsigned char)0x9F, (char)(unsigned char)0x9A, (char)(unsigned char)0x85, // U+1F685
// POLAR BEAR
(char)(unsigned char)0xF0, (char)(unsigned char)0x9F, (char)(unsigned char)0x90, (char)(unsigned char)0xBB, // U+1F43B
(char)(unsigned char)0xE2, (char)(unsigned char)0x80, (char)(unsigned char)0x8D, // U+200D
(char)(unsigned char)0xE2, (char)(unsigned char)0x9D, (char)(unsigned char)0x84, // U+2744
(char)(unsigned char)0xEF, (char)(unsigned char)0xB8, (char)(unsigned char)0x8F, // U+FE0F
// SNOWFLAKE
(char)(unsigned char)0xE2, (char)(unsigned char)0x9D, (char)(unsigned char)0x84, // U+2744
0x00,
// clang-format on
};
static const unsigned short utf16[] = {
// clang-format off
// HIGH-SPEED TRAIN WITH BULLET NOSE
0xD83D, 0xDE85, // U+1F685
// POLAR BEAR
0xD83D, 0xDC3B, // U+1F43B
0x200D, // U+200D
0x2744, // U+2744
0xFE0F, // U+FE0F
// SNOWFLAKE
0x2744, // U+2744
0x0000,
// clang-format on
};
// Decodes UTF-8 to the internal encoding.
const tetengo_text_encoder_t* const p_utf8_encoder =
const size_t internal_encoding_length = tetengo_text_encoder_decode(p_utf8_encoder, utf8, NULL, 0);
char* const internal_encoding = (char*)malloc((internal_encoding_length + 1) * sizeof(char));
if (internal_encoding)
{
tetengo_text_encoder_decode(p_utf8_encoder, utf8, internal_encoding, internal_encoding_length + 1);
}
// Encodes the internal encoding to UTF-16.
const tetengo_text_encoder_t* const p_utf16_encoder =
const size_t encoded_utf16_length = tetengo_text_encoder_encode(p_utf16_encoder, internal_encoding, NULL, 0);
unsigned short* const encoded_utf16 = (unsigned short*)malloc((encoded_utf16_length + 1) * sizeof(unsigned short));
if (encoded_utf16)
{
tetengo_text_encoder_encode(p_utf16_encoder, internal_encoding, (char*)encoded_utf16, encoded_utf16_length + 1);
}
assert(equal(encoded_utf16, utf16));
free(encoded_utf16);
free((void*)internal_encoding);
}
// Compares two zero-terminated arrays of 16-bit code units.
// Returns 1 when both hold the same sequence up to and including the terminator,
// and 0 at the first position where they differ.
static int equal(const unsigned short* const string1, const unsigned short* const string2)
{
    const unsigned short* p_left = string1;
    const unsigned short* p_right = string2;

    // Walks both arrays in step for as long as the current units agree.
    while (*p_left == *p_right)
    {
        if (*p_left == 0U)
        {
            // Both reached the terminator together.
            return 1;
        }
        ++p_left;
        ++p_right;
    }

    return 0;
}
An encoder.
size_t tetengo_text_encoder_encode(const tetengo_text_encoder_t *p_encoder, const char *string, char *encoded_string, size_t encoded_string_capacity)
Encodes a string.
@ tetengo_text_encoder_encoding_utf16
Definition encoder.h:26
@ tetengo_text_encoder_encoding_utf8
Definition encoder.h:25
const tetengo_text_encoder_t * tetengo_text_encoder_instance(tetengo_text_encoder_encoding_t encoding)
Returns the encoder instance.
size_t tetengo_text_encoder_decode(const tetengo_text_encoder_t *p_encoder, const char *encoded_string, char *string, size_t string_capacity)
Decodes a string.

Grapheme Splitting

C++

#include <cassert>
#include <iterator>
#include <locale>
#include <memory>
#include <optional>
#include <stdexcept>
#include <string>
#include <vector>
namespace usage_tetengo::text
{
// Literal suffix that turns an integer literal into a char, so that byte values
// above 0x7F (e.g. 0xF0_c) can be listed in a std::string initializer.
// Written without whitespace between "" and the suffix: the spaced form is deprecated.
constexpr char operator""_c(unsigned long long value)
{
    return static_cast<char>(value);
}
// Returns the Japanese locale, or std::nullopt when constructing it throws std::runtime_error.
std::optional<std::locale> japanese_locale();
// Returns the English locale, or std::nullopt when constructing it throws std::runtime_error.
std::optional<std::locale> english_locale();
// Usage example: splits one UTF-8 string into graphemes under the Japanese and
// the English locale and checks the offset and width reported for each entry.
// A locale that is not available is skipped.
void grapheme_split()
{
    // The input text as UTF-8 bytes.
    static const std::string string_{
        // clang-format off
        // HIGH-SPEED TRAIN WITH BULLET NOSE (width: 2)
        0xF0_c, 0x9F_c, 0x9A_c, 0x85_c, // U+1F685
        // POLAR BEAR (width: 2)
        0xF0_c, 0x9F_c, 0x90_c, 0xBB_c, // U+1F43B
        0xE2_c, 0x80_c, 0x8D_c,         // U+200D
        0xE2_c, 0x9D_c, 0x84_c,         // U+2744
        0xEF_c, 0xB8_c, 0x8F_c,         // U+FE0F
        // SNOWFLAKE (width: neutral; 2 in the Japanese locale, 1 in the English locale)
        0xE2_c, 0x9D_c, 0x84_c,         // U+2744
        // clang-format on
    };

    if (auto o_japanese_locale = japanese_locale(); o_japanese_locale)
    {
        // Creates a grapheme splitter for the Japanese locale.
        const tetengo::text::grapheme_splitter splitter{ *o_japanese_locale };

        // Splits the string into graphemes.
        // Four entries are expected: one per character above, then a last entry
        // at offset 20 (the byte length of the string) with width 0.
        // The width of the grapheme of SNOWFLAKE will be 2.
        const auto pieces = splitter.split(string_);
        assert(pieces.size() == 4U);
        assert(pieces[0].offset() == 0U && pieces[0].width() == 2U);
        assert(pieces[1].offset() == 4U && pieces[1].width() == 2U);
        assert(pieces[2].offset() == 17U && pieces[2].width() == 2U);
        assert(pieces[3].offset() == 20U && pieces[3].width() == 0U);
    }

    if (auto o_english_locale = english_locale(); o_english_locale)
    {
        // Creates a grapheme splitter for the English locale.
        const tetengo::text::grapheme_splitter splitter{ *o_english_locale };

        // Splits the string into graphemes.
        // The offsets are the same as above; only the SNOWFLAKE width differs.
        // The width of the grapheme of SNOWFLAKE will be 1.
        const auto pieces = splitter.split(string_);
        assert(pieces.size() == 4U);
        assert(pieces[0].offset() == 0U && pieces[0].width() == 2U);
        assert(pieces[1].offset() == 4U && pieces[1].width() == 2U);
        assert(pieces[2].offset() == 17U && pieces[2].width() == 1U);
        assert(pieces[3].offset() == 20U && pieces[3].width() == 0U);
    }
}
// Builds the Japanese locale from its platform-specific name.
// Returns std::nullopt when the std::locale constructor throws std::runtime_error.
std::optional<std::locale> japanese_locale()
{
#if defined(_WIN32)
    const char* const locale_name = "Japanese_Japan.932";
#else
    const char* const locale_name = "ja_JP.UTF-8";
#endif

    try
    {
        return std::locale{ locale_name };
    }
    catch (const std::runtime_error&)
    {
        return std::nullopt;
    }
}
// Builds the English locale from its platform-specific name.
// Returns std::nullopt when the std::locale constructor throws std::runtime_error.
std::optional<std::locale> english_locale()
{
#if defined(_WIN32)
    const char* const locale_name = "English_United States.1252";
#else
    const char* const locale_name = "en_US.UTF-8";
#endif

    try
    {
        return std::locale{ locale_name };
    }
    catch (const std::runtime_error&)
    {
        return std::nullopt;
    }
}
}
A grapheme splitter.
Definition grapheme_splitter.hpp:89
std::vector< grapheme > split(const std::string_view &string_) const
Splits a string into graphemes.
A grapheme splitter.

C

#include <assert.h>
#include <locale.h>
#include <stdlib.h>
#include <string.h>
// Copies the name of the current locale for the category into storage ("C" when it cannot be queried).
static void save_current_locale(int category, char* storage, size_t storage_capacity);
// Returns the platform-specific name of the Japanese locale.
static const char* japanese_locale_name();
// Returns the platform-specific name of the English locale.
static const char* english_locale_name();
// Usage example: splits one UTF-8 string into graphemes under the Japanese and
// the English LC_CTYPE locale and checks the offset and width of each entry.
// A locale that cannot be set is skipped; the initial locale is restored afterwards.
void usage_tetengo_text_graphemeSplit()
{
    // The input text as zero-terminated UTF-8.
    static const char string[] = {
        // clang-format off
        // HIGH-SPEED TRAIN WITH BULLET NOSE (width: 2)
        (char)(unsigned char)0xF0, (char)(unsigned char)0x9F, (char)(unsigned char)0x9A, (char)(unsigned char)0x85, // U+1F685
        // POLAR BEAR (width: 2)
        (char)(unsigned char)0xF0, (char)(unsigned char)0x9F, (char)(unsigned char)0x90, (char)(unsigned char)0xBB, // U+1F43B
        (char)(unsigned char)0xE2, (char)(unsigned char)0x80, (char)(unsigned char)0x8D, // U+200D
        (char)(unsigned char)0xE2, (char)(unsigned char)0x9D, (char)(unsigned char)0x84, // U+2744
        (char)(unsigned char)0xEF, (char)(unsigned char)0xB8, (char)(unsigned char)0x8F, // U+FE0F
        // SNOWFLAKE (width: neutral; 2 in the Japanese locale, 1 in the English locale)
        (char)(unsigned char)0xE2, (char)(unsigned char)0x9D, (char)(unsigned char)0x84, // U+2744
        0x00,
        // clang-format on
    };

    {
        // Switches the locale to Japanese.
        char initial_locale[256] = { '\0' };
        save_current_locale(LC_CTYPE, initial_locale, 256);
        if (setlocale(LC_CTYPE, japanese_locale_name()))
        {
            // Creates a grapheme splitter for the Japanese locale.
            const tetengo_text_graphemeSplitter_t* const p_gs = tetengo_text_graphemeSplitter_create();

            /*
                Splits the string into graphemes.
                The first call passes no output array and only obtains the entry count.
                Four entries are expected: one per character above, then a last entry
                at offset 20 (the byte length of the string) with width 0.
                The width of the grapheme of SNOWFLAKE will be 2.
            */
            const size_t grapheme_count = tetengo_text_graphemeSplitter_split(p_gs, string, NULL);
            assert(grapheme_count == 4U);
            tetengo_text_grapheme_t* const p_graphemes =
                (tetengo_text_grapheme_t*)malloc(grapheme_count * sizeof(tetengo_text_grapheme_t));
            if (p_graphemes)
            {
                tetengo_text_graphemeSplitter_split(p_gs, string, p_graphemes);
                assert(p_graphemes[0].offset == 0U && p_graphemes[0].width == 2U);
                assert(p_graphemes[1].offset == 4U && p_graphemes[1].width == 2U);
                assert(p_graphemes[2].offset == 17U && p_graphemes[2].width == 2U);
                assert(p_graphemes[3].offset == 20U && p_graphemes[3].width == 0U);
                free((void*)p_graphemes);
            }

            // Destroys the grapheme splitter.
            tetengo_text_graphemeSplitter_destroy(p_gs);

            // Restores the locale.
            setlocale(LC_CTYPE, initial_locale);
        }
    }
    {
        // Switches the locale to English.
        char initial_locale[256] = { '\0' };
        save_current_locale(LC_CTYPE, initial_locale, 256);
        if (setlocale(LC_CTYPE, english_locale_name()))
        {
            // Creates a grapheme splitter for the English locale.
            const tetengo_text_graphemeSplitter_t* const p_gs = tetengo_text_graphemeSplitter_create();

            /*
                Splits the string into graphemes.
                The offsets are the same as under the Japanese locale;
                only the SNOWFLAKE width differs.
                The width of the grapheme of SNOWFLAKE will be 1.
            */
            const size_t grapheme_count = tetengo_text_graphemeSplitter_split(p_gs, string, NULL);
            assert(grapheme_count == 4U);
            tetengo_text_grapheme_t* const p_graphemes =
                (tetengo_text_grapheme_t*)malloc(grapheme_count * sizeof(tetengo_text_grapheme_t));
            if (p_graphemes)
            {
                tetengo_text_graphemeSplitter_split(p_gs, string, p_graphemes);
                assert(p_graphemes[0].offset == 0U && p_graphemes[0].width == 2U);
                assert(p_graphemes[1].offset == 4U && p_graphemes[1].width == 2U);
                assert(p_graphemes[2].offset == 17U && p_graphemes[2].width == 1U);
                assert(p_graphemes[3].offset == 20U && p_graphemes[3].width == 0U);
                free((void*)p_graphemes);
            }

            // Destroys the grapheme splitter.
            tetengo_text_graphemeSplitter_destroy(p_gs);

            // Restores the locale.
            setlocale(LC_CTYPE, initial_locale);
        }
    }
}
// Copies the name of the current locale for the category into storage.
//   category:         a locale category such as LC_CTYPE, passed to setlocale().
//   storage:          the destination buffer; left untouched when NULL.
//   storage_capacity: the size of storage in bytes; nothing is written when it is 0.
// "C" is stored when the current locale cannot be queried. The result is always
// zero-terminated and is truncated when it does not fit.
static void save_current_locale(const int category, char* const storage, const size_t storage_capacity)
{
    if (!storage || storage_capacity == 0)
    {
        // Guards against storage_capacity - 1 wrapping around below.
        return;
    }

    const char* current_locale = setlocale(category, NULL);
    if (!current_locale)
    {
        current_locale = "C";
    }

    strncpy(storage, current_locale, storage_capacity - 1);
    // strncpy() does not terminate the destination when the source fills it.
    storage[storage_capacity - 1] = '\0';
}
// Gives the name of the Japanese locale as spelled on the current platform.
static const char* japanese_locale_name()
{
    const char* const name =
#if defined(_WIN32)
        "Japanese_Japan.932";
#else
        "ja_JP.UTF-8";
#endif
    return name;
}
// Gives the name of the English locale as spelled on the current platform.
static const char* english_locale_name()
{
    const char* const name =
#if defined(_WIN32)
        "English_United States.1252";
#else
        "en_US.UTF-8";
#endif
    return name;
}
A grapheme splitter.
size_t tetengo_text_graphemeSplitter_split(const tetengo_text_graphemeSplitter_t *p_grapheme_splitter, const char *string, tetengo_text_grapheme_t *p_graphemes)
Splits a string into graphemes.
tetengo_text_graphemeSplitter_t * tetengo_text_graphemeSplitter_create()
Creates a grapheme splitter.
void tetengo_text_graphemeSplitter_destroy(const tetengo_text_graphemeSplitter_t *p_grapheme_splitter)
Destroys a grapheme splitter.
A grapheme.
Definition graphemeSplitter.h:26