/// D bindings for the native `ccoda` C++ interface: thin wrapper classes
/// around opaque native handles plus the `extern(C++)` declarations they call.
module coda;

import core.stdc.stddef: wchar_t;
import std.conv: to;
import std.string: toStringz;

/// Owns a native `cSentenceSplitter` handle and splits text into sentences.
class SentenceSplitter
{
    private cSentenceSplitter* _splitter;

    /// Creates the native splitter for `language`.
    this(in Tools.Language language)
    {
        _splitter = cSentenceSplitter_create(language);
    }

    /// Passes the native handle to `cSentenceSplitter_destroy`.
    ~this()
    {
        cSentenceSplitter_destroy(_splitter);
    }

    /**
     * Splits `line` into sentences.
     *
     * The input is converted to a wide string, the border overload of
     * `split` is called on it, and every slice between consecutive borders
     * is converted to `Rstring`.
     *
     * Params:
     *     line = text to split; any type convertible to `immutable(wchar_t)[]`
     *
     * @returns Sentences array
     */
    Rstring[] split(Tstring, Rstring = string)(in Tstring line)
    {
        const wideString = line.toWideStr;
        auto borders = split(wideString);

        Rstring[] ret;
        ret.reserve(borders.length);

        size_t prev = 0;

        foreach(b; borders)
        {
            // A border is treated as the index of the last wide character
            // of a sentence, hence the exclusive slice end is b + 1.
            const curr = b + 1;
            ret ~= wideString[prev .. curr].to!Rstring;
            prev = curr;
        }

        // NOTE(review): text located after the last border is not returned;
        // confirm that the native splitter always reports a border for the
        // final sentence.
        return ret;
    }

    /**
     * Asks the native splitter for the sentence borders of `line`.
     *
     * The native call hands back a buffer through an out-pointer together
     * with the number of elements; the elements are copied into a D array
     * and the native buffer is released with `free_mem`.
     *
     * @returns Borders
     */
    size_t[] split(in immutable(wchar_t)[] line)
    {
        size_t* borders;
        const size_t num = cSentenceSplitter_split(_splitter, line.ptr, line.length, &borders);

        version(assert)
        {
            if(num != 0)
                assert(borders !is null);
        }

        // Release the native buffer even if the copy below throws.
        scope(exit) free_mem(borders);

        // Slicing a null pointer with [0 .. 0] is valid and yields an empty array.
        return borders[0 .. num].dup;
    }
}

/// Owns a native `cTokenizer` handle and produces `TokensStorage` objects.
class Tokenizer
{
    private cTokenizer* _tokenizer;

    /// Creates the native tokenizer for `language`.
    this(in Tools.Language language)
    {
        _tokenizer = cTokenizer_create(language);
    }

    /// Passes the native handle to `cTokenizer_destroy`.
    ~this()
    {
        cTokenizer_destroy(_tokenizer);
    }

    /**
     * Converts `lineToSplit` to a wide string and tokenizes it natively.
     *
     * Returns: a `TokensStorage` wrapping the native result
     */
    TokensStorage tokenize(Tstring)(Tstring lineToSplit)
    {
        const s = lineToSplit.toWideStr;

        cTokensStorage* storage = cTokenizer_tokenize(_tokenizer, s.ptr, s.length);

        return new TokensStorage(storage);
    }
}

/// Owns a native `cTokensStorage` handle produced by `Tokenizer.tokenize`.
class TokensStorage
{
    private cTokensStorage* _storage;

    // Only code in this module can construct the wrapper.
    private this(cTokensStorage* s)
    {
        _storage = s;
    }

    /// Passes the native handle to `cTokensStorage_destroy`.
    ~this()
    {
        cTokensStorage_destroy(_storage);
    }
}

/// Owns a native `cDisambiguatedDataStorage` handle produced by
/// `Disambiguator.disambiguate`.
class DisambiguatedDataStorage
{
    private cDisambiguatedDataStorage* _storage;

    // Only code in this module can construct the wrapper.
    private this(cDisambiguatedDataStorage* s)
    {
        _storage = s;
    }

    /// Passes the native handle to `cDisambiguatedDataStorage_destroy`.
    ~this()
    {
        cDisambiguatedDataStorage_destroy(_storage);
    }
}

/// Owns a native `cDisambiguator` handle.
class Disambiguator
{
    private cDisambiguator* _disambiguator;

    /// Creates the native disambiguator for `language`.
    this(in Tools.Language language)
    {
        _disambiguator = cDisambiguator_create(language);
    }

    /// Passes the native handle to `cDisambiguator_destroy`.
    ~this()
    {
        cDisambiguator_destroy(_disambiguator);
    }

    /**
     * Runs native disambiguation on the tokens held by `ts`.
     *
     * Returns: a `DisambiguatedDataStorage` wrapping the native result
     */
    DisambiguatedDataStorage disambiguate(TokensStorage ts)
    {
        auto r = cDisambiguator_disambiguate(_disambiguator, ts._storage);

        return new DisambiguatedDataStorage(r);
    }
}

/// Owns a native `cSyntaxParser` handle.
class SyntaxParser
{
    private cSyntaxParser* _syntaxParser;

    /// Calls `PrepareConsole(language)` and then creates the native parser.
    this(in Tools.Language language)
    {
        PrepareConsole(language);

        _syntaxParser = cSyntaxParser_create(language);
    }

    /// Passes the native handle to `cSyntaxParser_destroy`.
    ~this()
    {
        cSyntaxParser_destroy(_syntaxParser);
    }

    /**
     * Runs the native syntax parser on the data held by `ds`.
     *
     * Returns: a `SyntaxTree` wrapping the native result
     */
    SyntaxTree parse(DisambiguatedDataStorage ds)
    {
        auto r = cSyntaxParser_parse(_syntaxParser, ds._storage);

        return new SyntaxTree(r);
    }
}

/// Owns a native `cSyntaxTree` handle and exposes its nodes by index.
class SyntaxTree
{
    private cSyntaxTree* _tree;

    // Only code in this module can construct the wrapper.
    private this(cSyntaxTree* t)
    {
        _tree = t;
    }

    /// Passes the native handle to `cSyntaxTree_destroy`.
    ~this()
    {
        cSyntaxTree_destroy(_tree);
    }

    /// Returns: the value of `cSyntaxTree_getRootIndex` for this tree
    int getRootIndex() const
    {
        return cSyntaxTree_getRootIndex(_tree);
    }

    /// Returns: the value of `cSyntaxTree_getParentIndex` for `nodeIndex`
    int getParentIndex(int nodeIndex) const
    {
        return cSyntaxTree_getParentIndex(_tree, nodeIndex);
    }

    /**
     * Wraps the native node at `idx` in a `SyntaxNode`.
     *
     * `idx` is implicitly converted to `size_t` for the native call, so a
     * negative value reaches the native side as a very large index.
     */
    SyntaxNode getNodeByIndex(int idx)
    {
        return SyntaxNode(
            cSyntaxTree_getNodeByIndex(_tree, idx)
        );
    }

    /// Renders the tree one node per line, each line prefixed with one
    /// space per depth level.
    override string toString()
    {
        import std.array : replicate;

        string ret;

        void dg(int currIdx, SyntaxNode* node, size_t depth)
        {
            ret ~= replicate(" ", depth) ~ node.toString ~ "\n";
        }

        recursiveTraversal(&dg);

        return ret;
    }

    /// Callback invoked by `recursiveTraversal` for every visited node.
    alias recursiveDg = void delegate(int currIdx, SyntaxNode* node, size_t depth);

    /// Visits the whole tree starting from the root index at depth 0.
    void recursiveTraversal(recursiveDg dg)
    {
        recursiveTraversal(getRootIndex, dg, 0);
    }

    /// Calls `dg` for the node at `currIdx`, then recurses into each of its
    /// children with `depth + 1` (pre-order).
    void recursiveTraversal(int currIdx, recursiveDg dg, size_t depth)
    {
        auto node = getNodeByIndex(currIdx);
        dg(currIdx, &node, depth);

        auto children = node.getChildrenIndexes;

        foreach(childIdx; children)
            recursiveTraversal(childIdx, dg, depth + 1);
    }
}

/// Value wrapper around a native `cSyntaxNode` pointer together with the
/// `cNodeData` fetched for it at construction time.
struct SyntaxNode
{
    private cSyntaxNode* _node;
    private cNodeData _nd;

    /// Stores `n` and reads its `cNodeData` once via `cSyntaxNode_get_cNodeData`.
    this(cSyntaxNode* n)
    {
        _node = n;
        _nd = cSyntaxNode_get_cNodeData(_node);
    }

    /// Returns: a D-owned copy of the native vector of children indexes
    int[] getChildrenIndexes() const
    {
        auto v = cSyntaxNode_getChildrenIndexes(_node);

        return v.cIntVector_getPtr[0 .. v.cIntVector_getLength].dup; // TODO: move to separate function
    }

    /// Token content converted to `T`.
    T content(T = string)() { return _nd.content.cws2wch.to!T; }

    /// Value of the `isNextSpace` field of the node data.
    bool isNextSpace() const { return _nd.isNextSpace; }

    /// Lemma converted to `T`.
    T lemma(T = string)() { return _nd.lemma.cws2wch.to!T; }

    /// Label converted to `T`.
    T label(T = string)() { return _nd.label.cws2wch.to!T; }

    /// Value of the `weight` field of the node data.
    double weight() const { return _nd.weight; }

    /// Value of the `lemmaId` field of the node data.
    int lemmaId() const { return _nd.lemmaId; }

    /// Returns: `punctuation_size` entries, each fetched through
    /// `cSyntaxNode_getPunctuationByIndex` and converted to `T`
    T[] punctuation(T = string)()
    {
        auto ret = new T[_nd.punctuation_size];

        foreach(i, ref str; ret)
            str = _node.cSyntaxNode_getPunctuationByIndex(i).cws2wch.to!T;

        return ret;
    }

    /// Returns: all node fields as space-separated `name=value` pairs
    string toString()
    {
        return
            "content="~content.to!string~
            " punctuation="~punctuation.to!string~
            " lemma="~lemma.to!string~
            " label="~label.to!string~
            " weight="~weight.to!string~
            " lemmaId="~lemmaId.to!string~
            " isNextSpace="~isNextSpace.to!string;
    }
}

/// Convert cWstring to wchar_t array
///
/// The result is a slice over the pointer returned by `cWstring_getPtr`;
/// no copy is made here.
private wchar_t[] cws2wch(cWstring* cws)
{
    return cws.cWstring_getPtr[0 .. cws.cWstring_getLength];
}

/// Converts any string-like value to an immutable wide string via `std.conv.to`.
private const (immutable(wchar_t)[]) toWideStr(Tstr)(Tstr s)
{
    return s.to!( immutable(wchar_t)[] );
}

extern(C++, Tools) @nogc
{
    /// Language selector passed to the native `*_create` functions.
    public enum Language
    {
        RU,
        EN,
        EN_FAST
    }

    private void PrepareConsole(Language language);
}

private @nogc
{
    extern(C++, ccoda)
    {
        // Opaque native types; only pointers to them are used on the D side.
        struct cSentenceSplitter;
        struct cTokenizer;
        struct cTokensStorage;
        struct cDisambiguator;
        struct cDisambiguatedDataStorage;
        struct cSyntaxParser;
        struct cSyntaxTree;
        struct cSyntaxNode;
        struct cIntVector;
        struct cWstring;

        // Returned by value from cSyntaxNode_get_cNodeData.
        struct cNodeData
        {
            cWstring* content;
            size_t punctuation_size;
            bool isNextSpace;
            cWstring* lemma; /**< initial form of the token*/
            cWstring* label; /**< morphology label of the token*/
            double weight; /**< weight assigned to the label by the classifier*/
            int lemmaId; /**< index of lemma in database*/
        }
    }

    extern(C++)
    {
        void free_mem(void* buf_ptr); // TODO: remove it

        size_t cIntVector_getLength(const(cIntVector)* iv);
        int* cIntVector_getPtr(const(cIntVector)* iv);

        size_t cWstring_getLength(const cWstring* v);
        wchar_t* cWstring_getPtr(cWstring* v);

        cSentenceSplitter* cSentenceSplitter_create(Tools.Language);
        void cSentenceSplitter_destroy(cSentenceSplitter*);
        size_t cSentenceSplitter_split(cSentenceSplitter* splitter, const(wchar_t)* line_to_split, size_t line_length, size_t** borders);

        cTokenizer* cTokenizer_create(Tools.Language language);
        void cTokenizer_destroy(cTokenizer* tokenizer);
        cTokensStorage* cTokenizer_tokenize(cTokenizer* tokenizer, const(wchar_t)* line_to_split, size_t line_length);

        void cTokensStorage_destroy(cTokensStorage* ts);

        cDisambiguator* cDisambiguator_create(Tools.Language language);
        void cDisambiguator_destroy(cDisambiguator* d);
        cDisambiguatedDataStorage* cDisambiguator_disambiguate(cDisambiguator* d, cTokensStorage* parsedTokens);

        void cDisambiguatedDataStorage_destroy(cDisambiguatedDataStorage* ds);

        cSyntaxParser* cSyntaxParser_create(Tools.Language language);
        void cSyntaxParser_destroy(cSyntaxParser* sp);
        cSyntaxTree* cSyntaxParser_parse(cSyntaxParser* syntax_parser, cDisambiguatedDataStorage* dds);

        void cSyntaxTree_destroy(cSyntaxTree* t);

        cSyntaxNode* cSyntaxTree_getNodeByIndex(cSyntaxTree* tree, size_t idx);
        int cSyntaxTree_getRootIndex(const(cSyntaxTree)* tree);
        int cSyntaxTree_getParentIndex(const(cSyntaxTree)* tree, int nodeIndex);

        cIntVector* cSyntaxNode_getChildrenIndexes(const(cSyntaxNode)* node);
        cNodeData cSyntaxNode_get_cNodeData(cSyntaxNode* node);
        cWstring* cSyntaxNode_getPunctuationByIndex(cSyntaxNode* node, size_t idx);
    }
}

// End-to-end check of the pipeline: sentence splitting, tokenizing,
// disambiguation and syntax parsing for Russian input.
unittest
{
    {
        import core.stdc.locale;

        setlocale(LC_ALL, "");

        auto splitter = new SentenceSplitter(Tools.Language.RU);

        string input = "Мальчик квадратный ковер выбивает. Дедушка круглый арбуз доедает... Тов. лейтенант, принесите 2 кг. арбузов!";

        auto res = splitter.split(input);

        assert(res.length == 3);
        assert(res[0] == "Мальчик квадратный ковер выбивает.");
        assert(res[1] == " Дедушка круглый арбуз доедает...");
        assert(res[2] == " Тов. лейтенант, принесите 2 кг. арбузов!");

        auto tokenizer = new Tokenizer(Tools.Language.RU);
        auto tokens = tokenizer.tokenize("Ежихи, постойте!");

        auto disambiguator = new Disambiguator(Tools.Language.RU);
        auto disambiguated = disambiguator.disambiguate(tokens);

        auto syntax_parser = new SyntaxParser(Tools.Language.RU);
        auto tree = syntax_parser.parse(disambiguated);
        auto root = tree.getRootIndex;
        assert(root == 1);

        auto rootNode = tree.getNodeByIndex(root);
        assert(rootNode.lemma == "постоять");

        auto childrenIdxs = rootNode.getChildrenIndexes;
        assert(childrenIdxs == [0]);

        auto childNode = tree.getNodeByIndex(childrenIdxs[0]);
        assert(childNode.lemma == "ежиха");
        assert(childNode.label == "S@МН@ЖЕН@ИМ@ОД");
        assert(childNode.punctuation == [","]);

        auto parentIdx = tree.getParentIndex(childrenIdxs[0]);
        assert(root == parentIdx);
    }
}