1 module coda;
2 
import core.stdc.stddef: wchar_t;
import std.conv: to;
import std.string: toStringz;
6 
/**
 * Sentence splitter backed by a native `cSentenceSplitter` instance.
 *
 * The native handle is created in the constructor and destroyed in the
 * destructor.
 */
class SentenceSplitter
{
    /// Native splitter handle; created in the constructor, destroyed in the destructor.
    private cSentenceSplitter* _splitter;

    /// Creates the native splitter for `language`.
    this(in Tools.Language language)
    {
        _splitter = cSentenceSplitter_create(language);
    }

    /// Destroys the native splitter.
    ~this()
    {
        cSentenceSplitter_destroy(_splitter);
    }

    /**
     * Splits `line` into sentences.
     *
     * The text is converted to a wide string, borders are obtained from the
     * border-returning overload, and each piece between two consecutive
     * borders is converted to `Rstring`. Text located after the last reported
     * border is not part of the result.
     *
     * Params:
     *   line = text to split
     *
     * Returns: Sentences array
     */
    Rstring[] split(Tstring, Rstring = string)(in Tstring line)
    {
        const wideString = line.to!( immutable(wchar_t)[] );
        const borders = split(wideString);

        Rstring[] ret;
        // The number of sentences is known up front, so reserve the storage once.
        ret.reserve(borders.length);

        size_t prev = 0;

        foreach(b; borders)
        {
            // A border is the index of the last symbol of a sentence,
            // so the exclusive end of the slice is one past it.
            const curr = b + 1;
            ret ~= wideString[prev .. curr].to!Rstring;
            prev = curr;
        }

        return ret;
    }

    /**
     * Finds sentence borders in a wide string.
     *
     * The borders produced by the native splitter are copied into a
     * GC-allocated array and the native buffer is handed back to `free_mem`.
     *
     * Params:
     *   line = wide string to analyse
     *
     * Returns: Borders
     */
    size_t[] split(in immutable(wchar_t)[] line)
    {
        size_t* borders;
        const num = cSentenceSplitter_split(_splitter, line.ptr, line.length, &borders);

        // Hand the native buffer back on every exit path, including a
        // failure of the copy below.
        scope(exit) free_mem(borders);

        assert(num == 0 || borders !is null);

        return borders[0 .. num].dup;
    }
}
63 
/**
 * Tokenizer backed by a native `cTokenizer` instance.
 *
 * The native handle is created in the constructor and destroyed in the
 * destructor.
 */
class Tokenizer
{
    /// Native tokenizer handle.
    private cTokenizer* _tokenizer;

    /// Creates the native tokenizer for `language`.
    this(in Tools.Language language)
    {
        _tokenizer = cTokenizer_create(language);
    }

    /// Destroys the native tokenizer.
    ~this()
    {
        cTokenizer_destroy(_tokenizer);
    }

    /**
     * Tokenizes a line of text.
     *
     * Params:
     *   lineToSplit = text to tokenize; it is converted to a wide string
     *                 before being passed to the native tokenizer
     *
     * Returns: a `TokensStorage` wrapping the native result
     */
    TokensStorage tokenize(Tstring)(Tstring lineToSplit)
    {
        const wide = toWideStr(lineToSplit);

        return new TokensStorage(
            cTokenizer_tokenize(_tokenizer, wide.ptr, wide.length)
        );
    }
}
87 
/**
 * Holder of a native `cTokensStorage` pointer.
 *
 * Instances are created inside this module only (the constructor is private);
 * the wrapped pointer is passed to `cTokensStorage_destroy` in the destructor.
 */
class TokensStorage
{
    /// Wrapped native pointer.
    private cTokensStorage* _storage;

    /// Wraps `handle`; from this point the destructor will destroy it.
    private this(cTokensStorage* handle)
    {
        this._storage = handle;
    }

    /// Destroys the wrapped native storage.
    ~this()
    {
        cTokensStorage_destroy(this._storage);
    }
}
102 
/**
 * Holder of a native `cDisambiguatedDataStorage` pointer.
 *
 * Instances are created inside this module only (the constructor is private);
 * the wrapped pointer is passed to `cDisambiguatedDataStorage_destroy` in the
 * destructor.
 */
class DisambiguatedDataStorage
{
    /// Wrapped native pointer.
    private cDisambiguatedDataStorage* _storage;

    /// Wraps `handle`; from this point the destructor will destroy it.
    private this(cDisambiguatedDataStorage* handle)
    {
        this._storage = handle;
    }

    /// Destroys the wrapped native storage.
    ~this()
    {
        cDisambiguatedDataStorage_destroy(this._storage);
    }
}
117 
/**
 * Disambiguator backed by a native `cDisambiguator` instance.
 *
 * The native handle is created in the constructor and destroyed in the
 * destructor.
 */
class Disambiguator
{
    /// Native disambiguator handle.
    private cDisambiguator* _disambiguator;

    /// Creates the native disambiguator for `language`.
    this(in Tools.Language language)
    {
        _disambiguator = cDisambiguator_create(language);
    }

    /// Destroys the native disambiguator.
    ~this()
    {
        cDisambiguator_destroy(_disambiguator);
    }

    /**
     * Runs the native disambiguator over previously produced tokens.
     *
     * Params:
     *   ts = tokens storage whose native pointer is passed to the disambiguator
     *
     * Returns: a `DisambiguatedDataStorage` wrapping the native result
     */
    DisambiguatedDataStorage disambiguate(TokensStorage ts)
    {
        // NOTE(review): whether the native result keeps referring to the
        // tokens storage is not visible here - confirm before letting `ts`
        // be collected while the result is still in use.
        return new DisambiguatedDataStorage(
            cDisambiguator_disambiguate(_disambiguator, ts._storage)
        );
    }
}
139 
/**
 * Syntax parser backed by a native `cSyntaxParser` instance.
 *
 * The native handle is created in the constructor and destroyed in the
 * destructor.
 */
class SyntaxParser
{
    /// Native parser handle.
    private cSyntaxParser* _syntaxParser;

    /// Calls `PrepareConsole` for `language`, then creates the native parser.
    this(in Tools.Language language)
    {
        // Kept in this order: console preparation happens before the parser is created.
        PrepareConsole(language);

        _syntaxParser = cSyntaxParser_create(language);
    }

    /// Destroys the native parser.
    ~this()
    {
        cSyntaxParser_destroy(_syntaxParser);
    }

    /**
     * Parses disambiguated data into a syntax tree.
     *
     * Params:
     *   ds = disambiguated data whose native pointer is passed to the parser
     *
     * Returns: a `SyntaxTree` wrapping the native result
     */
    SyntaxTree parse(DisambiguatedDataStorage ds)
    {
        return new SyntaxTree(
            cSyntaxParser_parse(_syntaxParser, ds._storage)
        );
    }
}
163 
/**
 * Wrapper around a native `cSyntaxTree` pointer.
 *
 * Instances are created inside this module only (the constructor is private);
 * the wrapped pointer is passed to `cSyntaxTree_destroy` in the destructor.
 */
class SyntaxTree
{
    /// Wrapped native tree pointer.
    private cSyntaxTree* _tree;

    /// Wraps `t`; from this point the destructor will destroy it.
    private this(cSyntaxTree* t)
    {
        _tree = t;
    }

    /// Destroys the wrapped native tree.
    ~this()
    {
        cSyntaxTree_destroy(_tree);
    }

    /// Returns: index of the root node as reported by the native tree.
    int getRootIndex() const
    {
        return cSyntaxTree_getRootIndex(_tree);
    }

    /// Returns: index of the parent of node `nodeIndex` as reported by the native tree.
    int getParentIndex(int nodeIndex) const
    {
        return cSyntaxTree_getParentIndex(_tree, nodeIndex);
    }

    /**
     * Fetches the node stored at `idx`.
     *
     * Params:
     *   idx = node index; must not be negative
     *
     * Returns: a `SyntaxNode` wrapping the native node
     */
    SyntaxNode getNodeByIndex(int idx)
    {
        // The native accessor takes a size_t: a negative int would be
        // implicitly converted to a huge unsigned index.
        assert(idx >= 0, "SyntaxTree.getNodeByIndex: negative node index");

        return SyntaxNode(
            cSyntaxTree_getNodeByIndex(_tree, idx)
        );
    }

    /**
     * Renders the tree as text: one line per node, in traversal order,
     * indented by two spaces per depth level.
     */
    override string toString()
    {
        import std.array : replicate;

        string ret;

        void dg(int currIdx, SyntaxNode* node, size_t depth)
        {
            ret ~= "  ".replicate(depth);
            ret ~= node.toString;
            ret ~= "\n";
        }

        recursiveTraversal(&dg);

        return ret;
    }

    /// Callback invoked for every visited node with its index, the node and its depth.
    alias recursiveDg = void delegate(int currIdx, SyntaxNode* node, size_t depth);

    /// Visits every node starting from the root (depth 0), parents before children.
    void recursiveTraversal(recursiveDg dg)
    {
        recursiveTraversal(getRootIndex, dg, 0);
    }

    /**
     * Visits the node `currIdx` and then, recursively, each of its children.
     *
     * Params:
     *   currIdx = index of the node to start from
     *   dg      = callback invoked for each visited node
     *   depth   = depth value reported for `currIdx`; children get `depth + 1`
     */
    void recursiveTraversal(int currIdx, recursiveDg dg, size_t depth)
    {
        auto node = getNodeByIndex(currIdx);
        // The callback receives a pointer to this local copy of the node,
        // valid only for the duration of the call.
        dg(currIdx, &node, depth);

        auto children = node.getChildrenIndexes;

        foreach(childIdx; children)
            recursiveTraversal(childIdx, dg, depth + 1);
    }
}
232 
/**
 * Lightweight view of a native `cSyntaxNode`.
 *
 * Stores the native pointer together with the `cNodeData` value obtained from
 * it at construction time.
 */
struct SyntaxNode
{
    /// Native node pointer.
    private cSyntaxNode* _node;
    /// Node data fetched once in the constructor.
    private cNodeData _nd;

    /// Wraps `n` and fetches its `cNodeData`.
    this(cSyntaxNode* n)
    {
        _node = n;
        _nd = cSyntaxNode_get_cNodeData(n);
    }

    /// Returns: a GC-owned copy of the indexes of this node's children.
    int[] getChildrenIndexes() const
    {
        auto vec = cSyntaxNode_getChildrenIndexes(_node);

        // TODO: move to separate function
        int* first = cIntVector_getPtr(vec);
        const count = cIntVector_getLength(vec);

        return first[0 .. count].dup;
    }

    /// Returns: the `content` field of the node data converted to `T`.
    T content(T = string)()
    {
        return _nd.content.cws2wch.to!T;
    }

    /// Returns: the `isNextSpace` field of the node data.
    bool isNextSpace() const
    {
        return _nd.isNextSpace;
    }

    /// Returns: the `lemma` field of the node data converted to `T`.
    T lemma(T = string)()
    {
        return _nd.lemma.cws2wch.to!T;
    }

    /// Returns: the `label` field of the node data converted to `T`.
    T label(T = string)()
    {
        return _nd.label.cws2wch.to!T;
    }

    /// Returns: the `weight` field of the node data.
    double weight() const
    {
        return _nd.weight;
    }

    /// Returns: the `lemmaId` field of the node data.
    int lemmaId() const
    {
        return _nd.lemmaId;
    }

    /// Returns: all punctuation strings of the node, each converted to `T`.
    T[] punctuation(T = string)()
    {
        auto items = new T[_nd.punctuation_size];

        foreach(idx; 0 .. items.length)
            items[idx] = cSyntaxNode_getPunctuationByIndex(_node, idx).cws2wch.to!T;

        return items;
    }

    /// Returns: a single-line `key=value` dump of all node properties.
    string toString()
    {
        string ret = "content=" ~ content.to!string;

        ret ~= " punctuation=" ~ punctuation.to!string;
        ret ~= " lemma=" ~ lemma.to!string;
        ret ~= " label=" ~ label.to!string;
        ret ~= " weight=" ~ weight.to!string;
        ret ~= " lemmaId=" ~ lemmaId.to!string;
        ret ~= " isNextSpace=" ~ isNextSpace.to!string;

        return ret;
    }
}
280 
/// Convert cWstring to wchar_t array.
///
/// The result is a slice over the pointer returned by `cWstring_getPtr`;
/// no copy is made here.
private wchar_t[] cws2wch(cWstring* cws)
{
    wchar_t* first = cWstring_getPtr(cws);
    const count = cWstring_getLength(cws);

    return first[0 .. count];
}
286 
/// Converts any value accepted by `std.conv.to` into an immutable `wchar_t` string.
private const (immutable(wchar_t)[]) toWideStr(Tstr)(Tstr s)
{
    alias WideString = immutable(wchar_t)[];

    return to!WideString(s);
}
291 
/// Declarations bound to the C++ `Tools` namespace.
extern(C++, Tools) @nogc
{
    /// Language selector passed to the native `*_create` functions and to `PrepareConsole`.
    /// NOTE(review): member meanings are inferred from their names only - confirm against the native headers.
    public enum Language
    {
        RU,
        EN,
        EN_FAST
    }

    /// Native console preparation routine; takes the language selector.
    private void PrepareConsole(Language language);
}
303 
// Native declarations used by the wrappers in this module.
private @nogc
{
    // Types declared in the C++ `ccoda` namespace.
    extern(C++, ccoda)
    {
        // Opaque types: this module only ever handles pointers to them.
        struct cDisambiguatedDataStorage;
        struct cDisambiguator;
        struct cIntVector;
        struct cSentenceSplitter;
        struct cSyntaxNode;
        struct cSyntaxParser;
        struct cSyntaxTree;
        struct cTokenizer;
        struct cTokensStorage;
        struct cWstring;

        // Per-node data returned by value from `cSyntaxNode_get_cNodeData`.
        // Field order is part of the binary layout shared with the native side.
        struct cNodeData
        {
            cWstring* content;
            size_t punctuation_size;
            bool isNextSpace;
            cWstring* lemma;  /// initial form of the token
            cWstring* label;  /// morphology label of the token
            double weight;    /// weight assigned to the label by the classifier
            int lemmaId;      /// index of lemma in database
        }
    }

    // Free functions with C++ linkage (global namespace).
    extern(C++)
    {
        // --- memory ---
        void free_mem(void* buf_ptr); // TODO: remove it

        // --- cIntVector accessors ---
        size_t cIntVector_getLength(const(cIntVector)* iv);
        int* cIntVector_getPtr(const(cIntVector)* iv);

        // --- cWstring accessors ---
        size_t cWstring_getLength(const cWstring* v);
        wchar_t* cWstring_getPtr(cWstring* v);

        // --- sentence splitter ---
        cSentenceSplitter* cSentenceSplitter_create(Tools.Language language);
        void cSentenceSplitter_destroy(cSentenceSplitter* splitter);
        size_t cSentenceSplitter_split(
            cSentenceSplitter* splitter,
            const(wchar_t)* line_to_split,
            size_t line_length,
            size_t** borders
        );

        // --- tokenizer ---
        cTokenizer* cTokenizer_create(Tools.Language language);
        void cTokenizer_destroy(cTokenizer* tokenizer);
        cTokensStorage* cTokenizer_tokenize(
            cTokenizer* tokenizer,
            const(wchar_t)* line_to_split,
            size_t line_length
        );

        void cTokensStorage_destroy(cTokensStorage* ts);

        // --- disambiguator ---
        cDisambiguator* cDisambiguator_create(Tools.Language language);
        void cDisambiguator_destroy(cDisambiguator* d);
        cDisambiguatedDataStorage* cDisambiguator_disambiguate(
            cDisambiguator* d,
            cTokensStorage* parsedTokens
        );

        void cDisambiguatedDataStorage_destroy(cDisambiguatedDataStorage* ds);

        // --- syntax parser ---
        cSyntaxParser* cSyntaxParser_create(Tools.Language language);
        void cSyntaxParser_destroy(cSyntaxParser* sp);
        cSyntaxTree* cSyntaxParser_parse(
            cSyntaxParser* syntax_parser,
            cDisambiguatedDataStorage* dds
        );

        // --- syntax tree ---
        void cSyntaxTree_destroy(cSyntaxTree* t);

        cSyntaxNode* cSyntaxTree_getNodeByIndex(cSyntaxTree* tree, size_t idx);
        int cSyntaxTree_getRootIndex(const(cSyntaxTree)* tree);
        int cSyntaxTree_getParentIndex(const(cSyntaxTree)* tree, int nodeIndex);

        // --- syntax node ---
        cIntVector* cSyntaxNode_getChildrenIndexes(const(cSyntaxNode)* node);
        cNodeData cSyntaxNode_get_cNodeData(cSyntaxNode* node);
        cWstring* cSyntaxNode_getPunctuationByIndex(cSyntaxNode* node, size_t idx);
    }
}
372 
// End-to-end check of the whole pipeline on Russian text:
// sentence splitting, tokenizing, disambiguation and syntax parsing.
unittest
{
    import core.stdc.locale;

    setlocale(LC_ALL, "");

    // --- sentence splitting ---
    auto sentenceSplitter = new SentenceSplitter(Tools.Language.RU);

    string text = "Мальчик квадратный ковер выбивает. Дедушка круглый арбуз доедает... Тов. лейтенант, принесите 2 кг. арбузов!";

    auto sentences = sentenceSplitter.split(text);

    assert(sentences.length == 3);
    assert(sentences[0] == "Мальчик квадратный ковер выбивает.");
    assert(sentences[1] == " Дедушка круглый арбуз доедает...");
    assert(sentences[2] == " Тов. лейтенант, принесите 2 кг. арбузов!");

    // --- tokenize -> disambiguate -> parse ---
    auto tokenizer = new Tokenizer(Tools.Language.RU);
    auto tokens = tokenizer.tokenize("Ежихи, постойте!");

    auto disambiguator = new Disambiguator(Tools.Language.RU);
    auto disambiguated = disambiguator.disambiguate(tokens);

    auto parser = new SyntaxParser(Tools.Language.RU);
    auto tree = parser.parse(disambiguated);

    // --- root node ---
    auto rootIdx = tree.getRootIndex;
    assert(rootIdx == 1);

    auto rootNode = tree.getNodeByIndex(rootIdx);
    assert(rootNode.lemma == "постоять");

    // --- the only child of the root ---
    auto childIdxs = rootNode.getChildrenIndexes;
    assert(childIdxs == [0]);

    auto childNode = tree.getNodeByIndex(childIdxs[0]);
    assert(childNode.lemma == "ежиха");
    assert(childNode.label == "S@МН@ЖЕН@ИМ@ОД");
    assert(childNode.punctuation == [","]);

    // The child's parent must be the root again.
    auto parentIdx = tree.getParentIndex(childIdxs[0]);
    assert(rootIdx == parentIdx);
}