Euphoria
nlp_sentence.cc
Go to the documentation of this file.
1 #include "core/nlp_sentence.h"
2 
3 
4 #include <iostream>
5 #include <string_view>
6 
7 #include "base/stringutils.h"
8 #include "base/stringbuilder.h"
9 
10 
11 namespace eu::core
12 {
13 
14 
15 namespace
16 {
17 
/// Tells whether `c` lies in the range 'a' through 'z'.
bool
is_char_lower(char c)
{
    const bool before_range = c < 'a';
    const bool after_range = c > 'z';
    return !(before_range || after_range);
}
23 
24 
/// Tells whether `c` lies in the range 'A' through 'Z'.
bool
is_char_upper(char c)
{
    const bool before_range = c < 'A';
    const bool after_range = c > 'Z';
    return !(before_range || after_range);
}
30 
31 
32 bool
33 is_word_char(char c)
34 {
35  // ' is using in words like can't
36  // - is used in words like right-handed
37  return is_char_upper(c) || is_char_lower(c) || is_number(c) || c == '\'' || c == '-';
38 }
39 
40 
/// Tells whether `c` is a space, newline, tab or carriage return.
bool
is_whitespace(char c)
{
    return c == ' ' || c == '\n' || c == '\t' || c == '\r';
}
53 
54 
/// Tells whether `c` terminates a sentence: a period, an exclamation mark
/// or a question mark.
bool
is_end_of_sentence(char c)
{
    return c == '.' || c == '!' || c == '?';
}
67 
68 
/// Tells whether `c` is one of the in-sentence punctuation characters
/// , ; : " [ ] ( ) =
/// These are stored as tokens of their own but do not end a sentence.
bool
is_comma_like(char c)
{
    constexpr std::string_view separators = ",;:\"[]()=";

    // compare against the string_view's own npos so the check neither mixes
    // types nor depends on <string> being included
    return separators.find(c) != std::string_view::npos;
}
75 
76 
/// Returns the value of `c` as a non-negative integer.
/// The char is first converted to unsigned char so that a char with a
/// negative value does not yield a negative code.
int
get_char_code(char c)
{
    const auto as_unsigned = static_cast<unsigned char>(c);
    return static_cast<int>(as_unsigned);
}
82 
83 
84 struct SentenceParser
85 {
86  bool ok = true;
87  std::string buffer;
90 
91  int line = 1;
92  int ch = 0;
93 
94  void
95  add_word()
96  {
97  if(!buffer.empty())
98  {
99  words.push_back(buffer);
100  buffer = "";
101  }
102  }
103 
104  void
105  on_unknown_character(char c)
106  {
107  std::cout << "Unknown character(" << line << ":" << ch << "): " << c
108  << " (" << get_char_code(c) << ")\n";
109  ok = false;
110  }
111 
112  void
113  feed(char c)
114  {
115  if(get_char_code(c) >= 187)
116  {
117  return;
118  }
119 
120  if(c == '\n')
121  {
122  line += 1;
123  ch = 0;
124  }
125  else
126  {
127  ch += 1;
128  }
129 
130  if(is_word_char(c))
131  {
132  buffer += c;
133  return;
134  }
135 
136  if(is_whitespace(c))
137  {
138  add_word();
139  return;
140  }
141 
142  if(is_comma_like(c))
143  {
144  add_word();
145  words.push_back(std::string(1, c));
146  return;
147  }
148 
149  if(is_end_of_sentence(c))
150  {
151  add_word();
152  words.push_back(std::string(1, c));
154  words = TextSentence{};
155  return;
156  }
157 
158  on_unknown_character(c);
159  }
160 
161  void
162  on_complete()
163  {
164  if(words.empty() == false)
165  {
166  add_word();
168  words = TextSentence{};
169  }
170  }
171 };
172 }
173 
174 bool
176 {
177  std::string line;
178 
179  SentenceParser parser;
180  parser.on_sentence = on_sentence;
181 
182  while(std::getline(data, line))
183  {
184  if(line.empty())
185  {
186  continue;
187  }
188 
189  for(char c: line)
190  {
191  parser.feed(c);
192  }
193  parser.feed('\n');
194 
195  if(!parser.ok)
196  {
197  return false;
198  }
199  }
200 
201  parser.on_complete();
202 
203  return parser.ok;
204 }
205 
206 
207 std::string
209 {
210  auto ss = StringBuilder{};
211  bool first = true;
212 
213  for(const auto& w: s)
214  {
215  if(first)
216  {
217  first = false;
218  }
219  else
220  {
221  if(is_comma_like(w[0]) || is_end_of_sentence(w[0]))
222  {
223  }
224  else
225  {
226  ss.add_char(' ');
227  }
228  }
229 
230  ss.add_string(w);
231  }
232 
233  return ss.to_string();
234 }
235 
236 }
bool is_number(char b)
Definition: stringutils.cc:471
std::function< void(const TextSentence &)> OnSentenceFunction
Definition: nlp_sentence.h:12
bool parse_sentences(std::istream &data, OnSentenceFunction on_sentence)
std::vector< std::string > TextSentence
Definition: nlp_sentence.h:11
std::string from_sentence_to_string(const TextSentence &s)
bool ok
Definition: nlp_sentence.cc:86
int ch
Definition: nlp_sentence.cc:92
int line
Definition: nlp_sentence.cc:91
core::OnSentenceFunction on_sentence
Definition: nlp_sentence.cc:89
std::string buffer
Definition: nlp_sentence.cc:87
TextSentence words
Definition: nlp_sentence.cc:88
String utility functions.
std::string to_string()
Complete the builder and return the resulting string.
StringBuilder & add_string(const std::string &str)
StringBuilder & add_char(char c)