/**
 * This file is a forked part of libddoc:
 * https://github.com/dlang-community/libddoc
 * Copyright: © 2014 Economic Modeling Specialists, Intl.
 * Main author of libddoc: Brian Schott
 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt Boost License 1.0)
 *
 * Forked and modified in 2021 for hgen project by Eugene 'Vindex' Stulin.
 * The 'hgen' project: https://gitlab.com/vindexbit/hgen
 * Author: Eugene 'Vindex' Stulin <tech.vindex@gmail.com>
 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt Boost License 1.0)
 */
module ddoc.lexer;

import std.algorithm : startsWith;
import std.array : appender;
import std.conv : to;
import std.exception : enforce;
import std.utf : validate;


/**
 * Kinds of tokens produced by the DDoc lexer.
 *
 * The underlying type is `ubyte`; members keep their declaration order,
 * so `lParen` is 0 and `word` is the last value.
 */
enum Type : ubyte {
    /// Opening parenthesis: $(LPAREN)
    lParen,
    /// Closing parenthesis: $(RPAREN)
    rParen,
    /// Dollar sign: $
    dollar,
    /// A run of spaces and/or tabs
    whitespace,
    /// Line break
    newline,
    /// Embedded D code (a block delimited by lines of three or more dashes)
    embedded,
    /// Code inlined between two backticks on the same line
    inlined,
    /// Comma: ,
    comma,
    /// Equals sign: =
    equals,
    /// Section header: a word at the start of a line followed by a colon
    header,
    /// Anything else
    word,
}


/**
 * A single DDoc token: the lexed text together with its kind.
 */
struct Token {
    /// Text of the token.
    string text;
    /// Kind of the token.
    Type type;
}

/**
 * Lexer for DDoc comments.
*/
struct Lexer {
    /**
     * Creates the lexer and immediately lexes the first token.
     *
     * Params:
     *     text = the _text to lex; if it is not valid UTF, an empty
     *            string is lexed instead
     *     skipHeader = when `true`, section headers are not recognized
     *                  and are lexed as ordinary words
     */
    this(string text, bool skipHeader = false) {
        this.text = isValidUTFString(text) ? text : "";
        this.parseHeader = !skipHeader;
        popFront();
    }

    /**
     * Checks whether a string is valid UTF.
     *
     * Returns: `false` if `std.utf.validate` throws, `true` otherwise.
     */
    bool isValidUTFString(string s) {
        try {
            validate(s);
        } catch (Exception e) {
            return false;
        }
        return true;
    }

    /// Input range primitive: `true` once all tokens have been consumed.
    bool empty() const @property {
        return _empty;
    }

    /// Input range primitive: the most recently lexed token.
    const(Token) front() const @property {
        return current;
    }

    /**
     * Input range primitive: lexes the next token into `current`,
     * or marks the lexer as empty when the end of the text is reached.
     *
     * Throws: `DdocException` if an embedded code block is not terminated.
     */
    void popFront() {
        if (offset >= text.length) {
            _empty = true;
            return;
        }
        switch (text[offset]) {
        case '`':
            offset++;
            immutable size_t closing = inlineCodeIndex();
            if (closing == size_t.max) {
                // No closing backtick on this line: a literal backtick.
                current.text = "`";
                current.type = Type.word;
            } else {
                current.text = text[offset .. closing];
                current.type = Type.inlined;
                offset = closing + 1;
            }
            return;
        case ',':
            lexChar(Type.comma);
            return;
        case '=':
            lexChar(Type.equals);
            return;
        case '$':
            lexChar(Type.dollar);
            return;
        case '(':
            lexChar(Type.lParen);
            return;
        case ')':
            lexChar(Type.rParen);
            return;
        case '\r':
            // "\r\n" is a single line break whose token text is "\n".
            // A lone '\r' (including one at the very end of the text)
            // is a line break on its own; it must neither read past
            // the end of the text nor swallow the following character.
            if (offset + 1 < text.length && text[offset + 1] == '\n') {
                offset++;
            }
            lexChar(Type.newline);
            return;
        case '\n':
            lexChar(Type.newline);
            return;
        case '-':
            // The cheap comparison goes first; both checks are pure.
            if (text[offset .. $].startsWith("---")
                    && prevIsNewline(offset, text)) {
                lexCodeBlock();
            } else {
                current.type = Type.word;
                current.text = "-";
                offset++;
            }
            return;
        case ' ':
        case '\t':
            immutable size_t begin = offset;
            while (offset < text.length
                    && (text[offset] == ' ' || text[offset] == '\t')) {
                offset++;
            }
            current.type = Type.whitespace;
            current.text = text[begin .. offset];
            return;
        default:
            lexWord();
            return;
        }
    }

    /// Emits the single character at `offset` as a token of the given type.
    private void lexChar(Type type) {
        current.text = text[offset .. offset + 1];
        current.type = type;
        offset++;
    }

    /**
     * Lexes an embedded code block. `offset` must point at the first dash
     * of the opening delimiter line.
     *
     * The token text is the code between the delimiter lines. If the
     * opening delimiter is indented, the same indentation is removed from
     * every code line that starts with it.
     *
     * Throws: `DdocException` if the closing delimiter is missing.
     */
    private void lexCodeBlock() {
        current.type = Type.embedded;
        // It's a string because user could mix spaces and tabs.
        string indent = getIndent(offset, text);
        // Skip opening dashes.
        while (offset < text.length && text[offset] == '-') {
            offset++;
        }
        if (offset < text.length && text[offset] == '\r') {
            offset++;
        }
        if (offset < text.length && text[offset] == '\n') {
            offset++;
            if (text.length > (offset + indent.length)
                    && text[offset .. offset + indent.length] == indent) {
                offset += indent.length;
            }
        }
        // Loops until we find the closing '---'.
        // Note that some more checking should be put into this
        // to avoid accidentally matching '---' sequences.
        // If there is no indent, then we can just take a slice. However, in
        // most cases, there will be some indent, and we need
        // to remove it for the code to look nice.
        size_t sliceBegin = offset;
        auto app = appender!string;
        while (true) {
            enforce!DdocException(
                offset < text.length, "Unterminated code block\n" ~ text
            );

            if (indent.length != 0 && text[offset] == '\n') {
                // Flush the finished line, including its newline.
                app.put(text[sliceBegin .. ++offset]);
                sliceBegin = offset;
                // Strip the indentation if the next line starts with it.
                if (text[sliceBegin .. $].startsWith(indent)) {
                    sliceBegin += indent.length;
                    offset += indent.length;
                }
            } else if (text[offset .. $].startsWith("---")
                    && prevIsNewline(offset, text)) {
                // Closing delimiter found.
                if (sliceBegin >= offset) {
                    current.text = indent.length == 0 ? null : app.data;
                } else {
                    // Drop the character right before the dashes.
                    auto slice = text[sliceBegin .. offset - 1];
                    if (indent.length == 0) {
                        current.text = slice;
                    } else {
                        app.put(slice);
                        current.text = app.data;
                    }
                }
                // Skip closing dashes.
                while (offset < text.length && text[offset] == '-') {
                    offset++;
                }
                break;
            } else {
                offset++;
            }
        }
    }

    /**
     * Lexes a word. The first code point is always consumed; the word
     * then continues while the next code point is a letter, a number or
     * an underscore.
     *
     * If header parsing is enabled, the word starts a line and it is
     * directly followed by a colon, the token becomes a header and
     * the colon is consumed (but not included in the token text).
     */
    private void lexWord() {
        import std.utf : decode;
        import std.uni : isNumber, isAlpha;

        size_t oldOffset = offset;
        while (true) {
            text.decode(offset);
            if (offset >= text.length)
                break;
            // Peek at the next code point without consuming it.
            size_t o = offset;
            dchar c = text.decode(o);
            if (!(isAlpha(c) || isNumber(c)) && c != '_')
                break;
        }
        current.type = Type.word;
        current.text = text[oldOffset .. offset];
        bool isColon = offset < text.length && text[offset] == ':';
        if (parseHeader && prevIsNewline(oldOffset, text) && isColon) {
            current.type = Type.header;
            offset++;
        }
    }

    /**
     * Looks for the backtick that closes inline code starting at `offset`.
     *
     * Returns: the index of the closing backtick, or `size_t.max` if a line
     * break or the end of the text comes first.
     */
    size_t inlineCodeIndex() const {
        import std.utf : stride;

        size_t o = offset;
        while (o < text.length) {
            if (text[o .. $].startsWith("\r", "\n", "\u2028", "\u2029")) {
                return size_t.max;
            } else if (text[o] == '`') {
                return o;
            } else {
                // Advance by a whole code point: the slice checked above
                // must always start on a code point boundary.
                o += stride(text, o);
            }
        }
        return size_t.max;
    }

    /// The most recently lexed token.
    Token current;
    /// Index in `text` of the first code unit that has not been lexed yet.
    size_t offset;
    /// The text being lexed.
    string text;
    /// Set by `popFront` when the end of the text has been reached.
    bool _empty;
    /// Whether section headers are recognized.
    bool parseHeader;
}


unittest {
    import std.algorithm : map, equal;
    import std.array : array;

    auto expected = [
        Type.whitespace, Type.dollar, Type.lParen, Type.word,
        Type.whitespace, Type.word, Type.comma, Type.whitespace,
        Type.word, Type.rParen, Type.whitespace, Type.word,
        Type.whitespace, Type.word, Type.newline, Type.embedded,
        Type.newline, Type.header, Type.newline, Type.whitespace,
        Type.word, Type.whitespace, Type.equals, Type.whitespace,
        Type.dollar, Type.lParen, Type.word, Type.whitespace,
        Type.word, Type.rParen, Type.newline, Type.header,
        Type.newline, Type.whitespace, Type.word, Type.whitespace,
        Type.word, Type.whitespace, Type.word
    ];
    Lexer l = Lexer(` $(D something, else) is *a
------------
test
/** this is some test code */
assert (whatever);
---------
Params:
    a = $(A param)
Returns:
    nothing of consequence`c);
    // foreach (t; l)
    //     writeln(t);
    assert(equal(l.map!(a => a.type), expected));

    auto expectedTexts2 = ["inlined code", " ", "identifier"];
    auto expectedTypes2 = [Type.inlined, Type.whitespace, Type.word];
    Lexer l2 = Lexer("`inlined code` identifier");
    auto tokens = l2.array();
    assert (equal(tokens.map!(a => a.type), expectedTypes2));
    assert (equal(tokens.map!(a => a.text), expectedTexts2));

    // "\r\n" is one newline token whose text is "\n".
    auto crlf = Lexer("a\r\nb").array();
    assert (equal(crlf.map!(a => a.type),
                  [Type.word, Type.newline, Type.word]));
    assert (equal(crlf.map!(a => a.text), ["a", "\n", "b"]));

    // A lone carriage return at the end of the text is a newline token.
    auto cr = Lexer("a\r").array();
    assert (equal(cr.map!(a => a.type), [Type.word, Type.newline]));

    // A lone carriage return does not swallow the next character.
    auto crMid = Lexer("a\rb").array();
    assert (equal(crMid.map!(a => a.text), ["a", "\r", "b"]));

    // Inline code may contain multi-byte characters.
    auto inl = Lexer("`\u00e9`").array();
    assert (inl.length == 1);
    assert (inl[0].type == Type.inlined);
    assert (inl[0].text == "\u00e9");
}


/*******************************************************************************
 * Class for library exception.
 *
 * Most often, this is thrown when a Ddoc document is misformatted
 * (unmatching parenthesis, too much arguments to a macro...).
*/
class DdocException : Exception {
    /**
     * Params:
     *     msg = error message
     *     file = source file where the exception is created
     *     line = source line where the exception is created
     *     next = next exception in the chain, if any
     */
    this(string msg,
         string file = __FILE__,
         size_t line = __LINE__,
         Throwable next = null)
    nothrow pure @safe {
        super(msg, file, line, next);
    }

    /***************************************************************************
     * Stores a snippet of the text the error refers to.
     *
     * Returns `this` to allow method chaining:
     * throw new DdocException("message").snippet(lexer.text);
     */
    @property DdocException snippet(string s) nothrow pure @safe @nogc {
        m_snippet = s;
        return this;
    }

    /// Returns the stored snippet (`null` if none was set).
    @property string snippet() const nothrow pure @safe @nogc {
        return m_snippet;
    }

    private string m_snippet;
}


/**
 * A `DdocException` whose snippet is set at construction time.
 */
class DdocParseException : DdocException {
    /**
     * Params:
     *     msg = error message
     *     code = snippet stored in the exception
     *     file = source file where the exception is created
     *     line = source line where the exception is created
     *     next = next exception in the chain, if any
     */
    this(string msg,
         string code,
         string file = __FILE__,
         size_t line = __LINE__,
         Throwable next = null)
    nothrow pure @safe {
        super(msg, file, line, next);
        this.snippet = code;
    }
}


/**
 * Tells whether `offset` is at the start of a line, ignoring indentation.
 *
 * Walks backwards from `offset` over spaces and tabs. The position counts
 * as a line start if the walk stops at a '\n' or reaches the beginning of
 * the text; an indented first line is therefore treated like any other
 * indented line.
 *
 * Params:
 *     offset = index into `text` (must not exceed `text.length`)
 *     text = the text being examined
 *
 * Returns: `true` if only spaces and tabs separate `offset` from the
 *          previous '\n' or from the beginning of the text.
 */
bool prevIsNewline(size_t offset, immutable string text) pure nothrow {
    while (offset > 0) {
        immutable char c = text[offset - 1];
        if (c == '\n') {
            return true;
        }
        if (c != ' ' && c != '\t') {
            return false;
        }
        offset--;
    }
    // Nothing but blanks before the offset: start of the text.
    return true;
}


/// Return the indentation present before the given offset.
/// offset should point past the indentation.
/// e.g. : '\t\ttest' => Offset should be 2 (the index of 't'),
/// and getIndent will return '\t\t'. If offset is 1,
/// getIndent returns '\t'.
/// Returns null if the character before the offset is not a space or a tab.
string getIndent(size_t offset, string text) pure nothrow {
    // Move back to the first character of the run of blanks.
    size_t begin = offset;
    while (begin > 0 && (text[begin - 1] == ' ' || text[begin - 1] == '\t')) {
        begin--;
    }
    return begin == offset ? null : text[begin .. offset];
}

unittest {
    assert(" " == getIndent(1, " test"));
    assert("  " == getIndent(2, "  test"));
    assert(getIndent(3, "  test") is null);
    assert("\t \t" == getIndent(3, "\t \ttest"));
    assert("\t  " == getIndent(4, "\n\t  test"));
}

unittest {
    assert(prevIsNewline(0, "abc"));
    assert(prevIsNewline(2, "a\nb"));
    assert(prevIsNewline(4, "a\n \t---"));
    assert(!prevIsNewline(1, "ab"));
    assert(!prevIsNewline(3, "a  b"));
    // Indentation at the very beginning of the text is a line start too.
    assert(prevIsNewline(2, "  ---"));
}