Euphoria
utf8.h
Go to the documentation of this file.
1 #pragma once
2 
3 #include <cstddef>
4 
5 
6 #include "assert/assert.h"
7 
namespace eu::core
{
    /// Decodes a UTF-8 encoded string and reports each code point to a callback.
    ///
    /// The elements of `string` are visited in order and interpreted as UTF-8
    /// bytes. Every time a complete sequence (1 to 4 bytes) has been read,
    /// `on_codepoint` is called with the decoded value as an `int`.
    ///
    /// Decoding stops at the first malformed byte. Code points that were
    /// completed before that byte have already been passed to `on_codepoint`.
    ///
    /// Only the structure of the encoding is checked (lead byte prefix and
    /// continuation byte prefix). Overlong encodings, surrogate values and
    /// values above U+10FFFF are decoded and reported without being rejected.
    ///
    /// @param string       range of `char`-like elements holding UTF-8 data
    /// @param on_codepoint callable invoked as `on_codepoint(int)` per code point
    /// @return `true` if the whole input was consumed as well-formed sequences,
    ///         `false` on an invalid lead byte, a stray or missing continuation
    ///         byte, or a sequence that is cut short by the end of the input
    template<typename TString, typename TOnCodepointFunc>
    bool calc_utf8_to_codepoints(const TString& string, TOnCodepointFunc on_codepoint)
    {
        // reference: https://en.wikipedia.org/wiki/UTF-8
        using CodePointInt = int;
        const auto to_byte = [](char c) -> std::byte { return static_cast<std::byte>(c); };

        // maskN selects the prefix bits of a byte, bitN is the expected prefix:
        //   0 = single byte (ascii), 1 = continuation byte,
        //   2/3/4 = lead byte of a 2/3/4 byte sequence
        constexpr auto mask0 = std::byte{0b10000000}; constexpr auto bit0 = std::byte{0b00000000};
        constexpr auto mask1 = std::byte{0b11000000}; constexpr auto bit1 = std::byte{0b10000000};
        constexpr auto mask2 = std::byte{0b11100000}; constexpr auto bit2 = std::byte{0b11000000};
        constexpr auto mask3 = std::byte{0b11110000}; constexpr auto bit3 = std::byte{0b11100000};
        constexpr auto mask4 = std::byte{0b11111000}; constexpr auto bit4 = std::byte{0b11110000};

        // strips the prefix bits and returns the payload bits as an integer
        const auto payload = [](std::byte b, std::byte prefix_mask) -> CodePointInt
        {
            return std::to_integer<CodePointInt>(b & ~prefix_mask);
        };

        // number of continuation bytes still needed to finish the current sequence
        unsigned int remaining = 0;

        // payload bits collected so far for the current multi-byte sequence
        CodePointInt codepoint = 0;

        for(const auto c: string)
        {
            const auto b = to_byte(c);

            if(remaining == 0)
            {
                // start of a new sequence
                if((mask0 & b) == bit0)
                {
                    on_codepoint(std::to_integer<CodePointInt>(b));
                }
                else if((mask4 & b) == bit4)
                {
                    codepoint = payload(b, mask4);
                    remaining = 3;
                }
                else if((mask3 & b) == bit3)
                {
                    codepoint = payload(b, mask3);
                    remaining = 2;
                }
                else if((mask2 & b) == bit2)
                {
                    codepoint = payload(b, mask2);
                    remaining = 1;
                }
                else
                {
                    // either a stray continuation byte (10xxxxxx) or a byte that
                    // can never start a sequence (11111xxx): malformed input
                    return false;
                }
            }
            else
            {
                if((mask1 & b) != bit1)
                {
                    // invalid continuation bit
                    return false;
                }

                // every continuation byte contributes the next 6 lower bits
                codepoint = (codepoint << 6) | payload(b, mask1);
                remaining -= 1;

                if(remaining == 0)
                {
                    on_codepoint(codepoint);
                }
            }
        }

        // if remaining != 0 this means an unfinished codepoint
        return remaining == 0;
    }
}
#define DIE(message)
Definition: assert.h:67
bool calc_utf8_to_codepoints(const TString &string, TOnCodepointFunc on_codepoint)
Definition: utf8.h:11
std::string buffer
Definition: nlp_sentence.cc:87