ddoc.lexer source code

1 /**
2  * This file is a forked part of libddoc:
3  * https://github.com/dlang-community/libddoc
4  * Copyright: © 2014 Economic Modeling Specialists, Intl.
5  * Main author of libddoc: Brian Schott
6  * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt Boost License 1.0)
7  * 
8  * Forked and modified in 2021 for hgen project by Eugene 'Vindex' Stulin.
9  * The 'hgen' project: https://gitlab.com/vindexbit/hgen
10  * Author: Eugene 'Vindex' Stulin <tech.vindex@gmail.com>
11  * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt Boost License 1.0)
12  */
13 module ddoc.lexer;
14 
15 import std.algorithm : startsWith;
16 import std.array : appender;
17 import std.conv : to;
18 import std.exception : enforce;
19 import std.utf : validate;
20 
21 
/**
 * Kinds of tokens the DDoc lexer can produce.
 *
 * The underlying type is `ubyte`; members are numbered in declaration order.
 */
enum Type : ubyte {
    /// $(LPAREN)
    lParen,
    /// $(RPAREN)
    rParen,
    /// $
    dollar,
    /// A run of spaces and/or tabs
    whitespace,
    /// A line break
    newline,
    /// Embedded D code (a block delimited by lines of dashes)
    embedded,
    /// Backtick-inlined code
    inlined,
    /// ,
    comma,
    /// =
    equals,
    /// Section header
    header,
    /// Anything else
    word,
}
38 
39 
/**
 * A single lexical unit of a DDoc comment: the matched text together with
 * its classification.
 */
struct Token {
    /// The text carried by the token.
    string text;
    /// The kind of token.
    Type type;
}
47 
/**
 * Lexer for DDoc comments.
 *
 * Provides the input-range primitives `empty`, `front` and `popFront`
 * over the tokens of the text given to the constructor.
 */
struct Lexer {
    /**
     * Stores the text and lexes the first token.
     *
     * Params:
     *     text = the _text to lex; if it is not valid UTF, an empty string
     *            is lexed instead
     *     skipHeader = if `true`, a word followed by a colon at the start
     *                  of a line is not turned into a `Type.header` token
     * Throws:
     *     DdocException if the first token is an unterminated code block.
     */
    this(string text, bool skipHeader = false) {
        if (!isValidUTFString(text)) {
            this.text = "";
        } else {
            this.text = text;
        }
        this.parseHeader = !skipHeader;
        popFront();
    }

    /// Returns `true` if `s` passes `std.utf.validate`, `false` otherwise.
    bool isValidUTFString(string s) {
        try {
            validate(s);
        } catch (Exception e) {
            return false;
        }
        return true;
    }

    /// `true` once `popFront` has been called with no input left.
    bool empty() const @property {
        return _empty;
    }

    /// The most recently lexed token.
    const(Token) front() const @property {
        return current;
    }

    /**
     * Lexes the next token into `front`, or marks the range as empty when
     * the whole text has been consumed.
     *
     * Throws:
     *     DdocException if an embedded code block is not terminated.
     */
    void popFront() {
        if (offset >= text.length) {
            _empty = true;
            return;
        }
        switch (text[offset]) {
        case '`':
            offset++;
            immutable size_t inlineCode = inlineCodeIndex();
            if (inlineCode == size_t.max) {
                // No closing backtick on this line: the backtick is a word.
                current.text = "`";
                current.type = Type.word;
            } else {
                current.text = text[offset .. inlineCode];
                current.type = Type.inlined;
                offset = inlineCode + 1;
            }
            return;
        case ',':
            current.text = text[offset .. offset + 1];
            current.type = Type.comma;
            offset++;
            return;
        case '=':
            current.text = text[offset .. offset + 1];
            current.type = Type.equals;
            offset++;
            return;
        case '$':
            current.text = text[offset .. offset + 1];
            current.type = Type.dollar;
            offset++;
            return;
        case '(':
            current.text = text[offset .. offset + 1];
            current.type = Type.lParen;
            offset++;
            return;
        case ')':
            current.text = text[offset .. offset + 1];
            current.type = Type.rParen;
            offset++;
            return;
        case '\r':
            // A "\r\n" pair yields one newline token whose text is "\n".
            // A '\r' that is not followed by '\n' (including a '\r' that is
            // the last byte of the input) is a newline token by itself;
            // skipping it unconditionally would slice past the end of the
            // text or swallow the following character.
            if (offset + 1 < text.length && text[offset + 1] == '\n') {
                offset++;
            }
            goto case;
        case '\n':
            current.text = text[offset .. offset + 1];
            current.type = Type.newline;
            offset++;
            return;
        case '-':
            bool threeHyphens = text[offset .. $].startsWith("---");
            if (prevIsNewline(offset, text) && threeHyphens) {
                current.type = Type.embedded;
                // It's a string because user could mix spaces and tabs.
                string indent = getIndent(offset, text);
                // skip opening dashes
                while (offset < text.length && text[offset] == '-') {
                    offset++;
                }
                if (offset < text.length && text[offset] == '\r') {
                    offset++;
                }
                if (offset < text.length && text[offset] == '\n') {
                    offset++;
                    // Drop the indentation of the first code line as well.
                    if (text.length > (offset + indent.length)
                            && text[offset .. offset + indent.length] == indent)
                    {
                        offset += indent.length;
                    }
                }
                // Loops until we find the closing '---'.
                // Note that some more checking should be put into this
                // to avoid accidentally matching '---' sequences.
                // If 'indent' is empty, then we can just take a slice.
                // However, in most cases, there will be some indent, and we
                // need to remove it for the code to look nice.
                size_t sliceBegin = offset;
                auto app = appender!string;
                while (true) {
                    enforce!DdocException(
                        offset < text.length, "Unterminated code block\n" ~ text
                    );

                    bool hyphen = text[offset] == '-';
                    threeHyphens = text[offset .. $].startsWith("---");
                    bool wasNewLine = prevIsNewline(offset, text);
                    if (indent.length != 0 && text[offset] == '\n') {
                        // Flush the finished line (newline included).
                        app.put(text[sliceBegin .. ++offset]);
                        sliceBegin = offset;
                        // We need to check if the indentation is the same
                        if (text[sliceBegin .. $].startsWith(indent)) {
                            sliceBegin += indent.length;
                            offset += indent.length;
                        }
                    }
                    // Check for the end.
                    else if (hyphen && wasNewLine && threeHyphens) {
                        if (sliceBegin >= offset) {
                            current.text = indent.length==0 ? null : app.data;
                        } else {
                            // 'offset - 1' leaves out the character right
                            // before the closing dashes.
                            auto slice = text[sliceBegin .. offset - 1];
                            if (indent.length == 0) {
                                current.text = slice;
                            } else {
                                app.put(slice);
                                current.text = app.data;
                            }
                        }
                        // skip closing dashes
                        while (offset < text.length && text[offset] == '-') {
                            offset++;
                        }
                        break;
                    } else {
                        offset++;
                    }
                }
            } else {
                current.type = Type.word;
                current.text = "-";
                offset++;
            }
            return;
        case ' ':
        case '\t':
            // Collapse a run of spaces and tabs into one whitespace token.
            size_t oldOffset = offset;
            while (offset < text.length
                    && (text[offset] == ' ' || text[offset] == '\t'))
            {
                offset++;
            }
            current.type = Type.whitespace;
            current.text = text[oldOffset .. offset];
            return;
        default:
            lexWord();
            return;
        }
    }

    /**
     * Lexes a word starting at `offset`.
     *
     * The first code point is always consumed; further code points are
     * consumed while they are alphabetic, numeric or '_'. When header
     * parsing is enabled, the word starts a line and is directly followed
     * by ':', the token becomes a `Type.header` and the colon is consumed
     * (it is not part of the token text).
     */
    private void lexWord() {
        import std.utf : decode;
        import std.uni : isNumber, isAlpha;

        size_t oldOffset = offset;
        while (true) {
            text.decode(offset);
            if (offset >= text.length)
                break;
            // Peek at the next code point without advancing 'offset'.
            size_t o = offset;
            dchar c = text.decode(o);
            if (!(isAlpha(c) || isNumber(c)) && c != '_')
                break;
        }
        current.type = Type.word;
        current.text = text[oldOffset .. offset];
        bool isColon = offset < text.length && text[offset] == ':';
        if (parseHeader && prevIsNewline(oldOffset, text) && isColon) {
            current.type = Type.header;
            offset++;
        }
    }

    /**
     * Searches for a backtick starting at `offset`.
     *
     * Returns: the index of the first backtick found, or `size_t.max` if a
     * line terminator ("\r", "\n", U+2028, U+2029) or the end of the text
     * is reached first.
     */
    size_t inlineCodeIndex() const {
        size_t o = offset;
        while (o < text.length) {
            if (text[o .. $].startsWith("\r", "\n", "\u2028", "\u2029")) {
                return size_t.max;
            } else if (text[o] == '`') {
                return o;
            } else {
                o++;
            }
        }
        return size_t.max;
    }

    /// Token returned by `front`.
    Token current;
    /// Index into `text` of the next byte to lex.
    size_t offset;
    /// The text being lexed.
    string text;
    /// Set by `popFront` when it is called with no input left.
    bool _empty;
    /// Whether a line-leading "word:" is lexed as a `Type.header`.
    bool parseHeader;
}
267 
268 
unittest {
    import std.algorithm : equal, map;
    import std.array : array;

    // Token types expected, in order, for the multi-line sample below.
    auto sampleTypes = [
        Type.whitespace, Type.dollar, Type.lParen, Type.word,
        Type.whitespace, Type.word, Type.comma, Type.whitespace,
        Type.word, Type.rParen, Type.whitespace, Type.word,
        Type.whitespace, Type.word, Type.newline, Type.embedded,
        Type.newline, Type.header, Type.newline, Type.whitespace,
        Type.word, Type.whitespace, Type.equals, Type.whitespace,
        Type.dollar, Type.lParen, Type.word, Type.whitespace,
        Type.word, Type.rParen, Type.newline, Type.header,
        Type.newline, Type.whitespace, Type.word, Type.whitespace,
        Type.word, Type.whitespace, Type.word
    ];
    Lexer sampleLexer = Lexer(` $(D something, else) is *a
------------
test
/** this is some test code */
assert (whatever);
---------
Params:
    a = $(A param)
Returns:
    nothing of consequence`c);
    assert(equal(sampleLexer.map!(tok => tok.type), sampleTypes));

    // Backtick-inlined code followed by a plain identifier.
    Lexer inlineLexer = Lexer("`inlined code` identifier");
    auto inlineTokens = inlineLexer.array();
    auto inlineTypes = [Type.inlined, Type.whitespace, Type.word];
    auto inlineTexts = ["inlined code", " ", "identifier"];
    assert (equal(inlineTokens.map!(tok => tok.type), inlineTypes));
    assert (equal(inlineTokens.map!(tok => tok.text), inlineTexts));
}
306 
307 
/*******************************************************************************
 * Exception class of the library.
 *
 * Most often, this is thrown when a Ddoc document is malformed
 * (unmatched parentheses, too many arguments to a macro...).
 */
class DdocException : Exception {
    /// Forwards every argument to the constructor of `Exception`.
    this(string msg,
         string file = __FILE__,
         size_t line = __LINE__,
         Throwable next = null) @safe pure nothrow
    {
        super(msg, file, line, next);
    }

    /***************************************************************************
     * Stores `s` as the snippet and returns this exception, which allows
     * method chaining:
     * throw new DdocException("message").snippet(lexer.text);
     */
    @property DdocException snippet(string s) @safe @nogc pure nothrow {
        this.m_snippet = s;
        return this;
    }

    /// Returns the stored snippet (`null` if none was set).
    @property string snippet() const @safe @nogc pure nothrow {
        return this.m_snippet;
    }

    /// Backing field of the `snippet` property.
    private string m_snippet;
}
338 
339 
/**
 * A `DdocException` that is created together with a code snippet.
 *
 * The `code` argument is stored through the inherited `snippet` property.
 */
class DdocParseException : DdocException {
    /// Forwards `msg`, `file`, `line` and `next` to `DdocException`.
    this(string msg,
         string code,
         string file = __FILE__,
         size_t line = __LINE__,
         Throwable next = null) @safe pure nothrow
    {
        super(msg, file, line, next);
        this.snippet = code;
    }
}
351 
352 
/// Tells whether the position `offset` in `text` is at the start of a line.
///
/// Spaces and tabs directly before `offset` are skipped backwards; the
/// result is `true` if the character found there is '\n', or if `offset`
/// is 0. If the backward scan reaches index 0, the character at index 0 is
/// the one compared with '\n', so a text that holds only spaces or tabs
/// before `offset` yields `false`.
bool prevIsNewline(size_t offset, immutable string text) pure nothrow {
    if (offset == 0) {
        return true;
    }
    size_t idx = offset - 1;
    for (; idx > 0; --idx) {
        immutable char ch = text[idx];
        if (ch != ' ' && ch != '\t') {
            break;
        }
    }
    return text[idx] == '\n';
}
362 
363 
/// Returns the run of spaces and tabs that ends right before `offset`.
/// `offset` should point just past the indentation.
/// e.g. : for '\t\ttest', an offset of 2 (the index of 't') gives '\t\t',
///        and an offset of 1 gives '\t'.
/// Returns `null` when `offset` is 0 or the preceding character is neither
/// a space nor a tab.
string getIndent(size_t offset, string text) pure nothrow {
    // Walk backwards to the first character of the whitespace run.
    size_t start = offset;
    while (start > 0 && (text[start - 1] == ' ' || text[start - 1] == '\t')) {
        start--;
    }
    if (start == offset) {
        return null;
    }
    return text[start .. offset];
}
383 
unittest {
    // Partial and full indentation at the start of the text.
    assert(getIndent(1, "  test") == " ");
    assert(getIndent(2, "  test") == "  ");
    // No whitespace right before the offset: nothing is returned.
    assert(getIndent(3, "  test") is null);
    // Mixed tabs and spaces; the scan stops at a newline.
    assert(getIndent(3, "\t \ttest") == "\t \t");
    assert(getIndent(4, "\n\t  test") == "\t  ");
}