Feature: Lexer improvements (#18)

* Created branch

* Created branch (removed placeholder)

* Lexer

- Added tab handling so tabs are accepted wherever spaces would be
- Added unit tests for the new tab processing
- Resolved issues where whitespace was allowed before and/or after the
  '.' character
- Renamed isSpliter to isSplitter
- Some code styling
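The tab-handling change above can be sketched as follows. This is an illustrative Python sketch only (the project itself is written in D); the splitter set and function name are assumptions mirroring the `isSplitter` predicate described later in the diff:

```python
# Characters that split tokens (assumed subset, mirroring the D isSplitter)
SPLITTERS = set(";,()[]{}=.:%")

def is_splitter(c: str) -> bool:
    # Tabs and newlines are treated exactly like spaces
    return c in SPLITTERS or c in (" ", "\t", "\n")
```

The key point is that `'\t'` takes the same code path as `' '`, rather than being an unhandled character.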

* Check

- Added two new `SymbolType`s for comments
- `SINGLE_LINE_COMMENT` (for `//`) and `MULTI_LINE_COMMENT` (for `/*`)
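A minimal sketch of how such a token classification works (Python for illustration only; the real mapping is the D `getSymbolType` shown in the diff below, and the string names here simply echo the enum members):

```python
def classify_comment(token_text):
    # A token starting with "//" is a single-line comment,
    # one starting with "/*" is a multi-line comment.
    if token_text.startswith("//"):
        return "SINGLE_LINE_COMMENT"
    if token_text.startswith("/*"):
        return "MULTI_LINE_COMMENT"
    return None  # not a comment token
```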

* Parser

- Added a bogus `parseComment()` which returns nothing, prints out the comment, consumes the `Token` and returns
- `parseStatement()` now supports `parseComment()` whenever a single-line or multi-line comment is detected

* Parser

- Fixed token consumption code in `parseComment()`

* BasicLexer

- Fixed style mishaps

* ArrLexer

- Implemented dummy lexer

* Parser

- Added some comment related functions (for testing)
- Added `pushComment(Token)`, `hasCommentsOnStack()` and `getCommentCount()`
- `parseComment()` now pushes the current comment-based `Token` onto the comment-stack
- Added a comment stack
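The comment-stack mechanism above can be sketched like this (a hedged Python illustration; the actual implementation is the D `SList`-backed stack in the `Parser` diff below, and the method names here are loose analogues of `pushComment`, `hasCommentsOnStack` and `getCommentCount`):

```python
class CommentStack:
    """Illustrative stand-in for the parser's comment stack."""

    def __init__(self):
        self._stack = []

    def push_comment(self, token):
        # parseComment() pushes each comment token it consumes
        self._stack.append(token)

    def has_comments(self):
        return len(self._stack) != 0

    def comment_count(self):
        return len(self._stack)
```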

Unit tests

- Added testing for various examples of comment-type `Token`s

* Lexer
- Replaced raw character literals with an enumerated type
- Working comment lexing, single- and multi-line
- Working escape codes for strings
- Working signage and size encoder indicators

- Removed floatMode in favour of float lexing function
- Added doComment for the comment lexing instead of comment mode
- Added doEscapeCode for escape codes in string
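The escape-code handling that `doEscapeCode` performs can be sketched as a lookup over the characters that may follow a backslash (Python sketch; the escape table below is an assumption based on the `LexerSymbols` constants in the diff, not the exact D implementation):

```python
# Escape characters valid after a backslash, per the ESC_* constants
ESCAPES = {
    "n": "\n", "t": "\t", "r": "\r", "a": "\a", "0": "\0",
    "\\": "\\", '"': '"', "'": "'",
}

def do_escape_code(c):
    # Resolve the character following '\' to its escaped value
    if c not in ESCAPES:
        raise ValueError("invalid escape: \\" + c)
    return ESCAPES[c]
```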

Testing
- Added unit tests for comments
- Added unit tests for numerical encoders

TODO
- ADD unit tests for all valid escape sequences and some invalid

* Lexer
- Removed stringMode in favour of `doString()`

TODO
- Decide on multi-line strings; currently not supported

* Parser

- Test comments which appear at statement level rather than module level

* Parser

- Changed comment tests to use `BasicLexer`, seeing as it is now implemented therein

* Basic

- Added `roll()` and `shourt()` to mark unittests

* Basic

- `shout()` now adds line number to print out

* Lexer rewrite
- flush
- underscores in numbers
- escape codes
- character escapes
- bug fixes
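The "underscores in numbers" item above refers to allowing readability separators in numeric literals. A minimal sketch of the idea (Python for illustration; the function name is assumed and the real lexing lives in the D `doNumber`):

```python
def lex_number(text):
    # Underscores are readability separators and carry no value,
    # so they are simply discarded before conversion.
    digits = text.replace("_", "")
    if not digits.isdigit():
        raise ValueError("bad numeric literal: " + repr(text))
    return int(digits)
```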

* Basic

- Fixed `shourt(int)`

* Basic

- Removed the (on purpose) crashing unittest

* Resolved bug where isSplitter evaluated to true every time

* Basic

- Removed `goggaWithLineInfo(...)`

* Basic

- Updated `shout()` to remove rolling
- Removed `roll()`
- Added function and module name as well
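The updated `shout()` behaviour described above (printing module, function and line number) can be sketched in Python using frame inspection; this is an assumed analogue for illustration, not the D implementation:

```python
import inspect

def shout(message):
    # Prefix the message with the caller's module, function
    # and line number, as the changelog describes.
    frame = inspect.stack()[1]
    mod = frame.frame.f_globals.get("__name__", "?")
    return "[%s:%s:%d] %s" % (mod, frame.function, frame.lineno, message)
```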

* Basic

- Documented `shout()`

* Lexer Done and 100% coverage

* LexerSymbols

- Documented
- Formatted

* Lexer (module)

- Added `LS` alias
- Added `isOperator(char c)`, `isSplitter(char c)`, `isNumericalEncoder_Size(char character)`, `isNumericalEncoder_Signage(char character)` and `isValidEscape_String(char character)`

* BasicLexer

- Documented constructor `hasToken()`, `performLex()`, `doIdentOrPath()`, `doChar()`, `doString()`, `doComment()`, `doEscapeCode()`, `doNumber()`, `doEncoder()`, `doFloat()`, `flush()`, `buildAdvance()`, `improvedAdvance()`, `advanceLine()`, `isOperator(char)`, `isSplitter(char)`, `isValidDotPrecede(char character)`, `isNumericalEncoder(char character)`, `isNumericalEncoder_Size(char character)`, `isNumericalEncoder_Signage(char character)` and `isValidEscape_String(char character)`
- Tried reformatting some of `doChar()`, `doString()`, `flush()`, `buildAdvance()`, `improvedAdvance()`, `advanceLine()`, `isOperator(char)`, `isSplitter(char)`

* Basic

- Removed `LS` alias

Lexer

- Made `LS` alias public

* BasicLexer

- Removed methods `isValidEscape_String(char character)`, `isNumericalEncoder_Signage(char character)`, `isNumericalEncoder_Size(char character)`, `isNumericalEncoder(char character)`, `isSplitter(char c)` and `isOperator(char c)`

Lexer

- Added method `isNumericalEncoder(char character)`

* BasicLexer

- Documented `isValidDotPrecede(char character)`

* Lexer

- Added method `isValidDotPrecede(char character)`

* BasicLexer

- Removed method `isValidDotPrecede(char character)`

* BasicLexer (unittests)

- Documented the unittests
- Fixed formatting

* BasicLexer

- Typo fixes

* BasicLexer (unittests)

- Only compile-in `shourt(...)` when in unittest build mode

* BasicLexer

- Documented `isForward()` and `isBackward()`
- Made `isBackward()` private

---------

Co-authored-by: GMeyer <21568499@sun.ac.za>
Co-authored-by: GMeyer <gustav.meyer1999@gmail.com>
Tristan B. Velloza Kildaire 2023-12-27 08:18:17 +02:00 committed by GitHub
parent 4c3a72b026
commit ee537f2b25
5 changed files with 2007 additions and 575 deletions


@@ -4,6 +4,7 @@
module tlang.compiler.lexer.core.lexer;
import tlang.compiler.lexer.core.tokens : Token;
import std.ascii : isDigit, isAlpha, isWhite;
/**
* Defines the interface a lexer must provide
@@ -73,4 +74,163 @@ public interface LexerInterface
* Returns: a `Token[]` containing all tokens
*/
public Token[] getTokens();
}
/**
* Human-readable names assigned
* to commonly used character
* constants
*/
public enum LexerSymbols : char
{
L_PAREN = '(',
R_PAREN = ')',
SEMI_COLON = ';',
COMMA = ',',
L_BRACK = '[',
R_BRACK = ']',
PLUS = '+',
MINUS = '-',
FORWARD_SLASH = '/',
PERCENT = '%',
STAR = '*',
AMPERSAND = '&',
L_BRACE = '{',
R_BRACE = '}',
EQUALS = '=',
SHEFFER_STROKE = '|',
CARET = '^',
EXCLAMATION = '!',
TILDE = '~',
DOT = '.',
COLON = ':',
SPACE = ' ',
TAB = '\t',
NEWLINE = '\n',
DOUBLE_QUOTE = '"',
SINGLE_QUOTE = '\'',
BACKSLASH = '\\',
UNDERSCORE = '_',
LESS_THAN = '<',
BIGGER_THAN = '>',
ESC_NOTHING = '0',
ESC_CARRIAGE_RETURN = 'r',
ESC_TAB = 't',
ESC_NEWLINE = 'n',
ESC_BELL = 'a',
ENC_BYTE = 'B',
ENC_INT = 'I',
ENC_LONG = 'L',
ENC_WORD = 'W',
ENC_UNSIGNED = 'U',
ENC_SIGNED = 'S',
}
/**
* Alias to `LexerSymbols`
*/
public alias LS = LexerSymbols;
/**
* Checks if the provided character is an operator
*
* Params:
* c = the character to check
* Returns: `true` if it is an operator, `false`
* otherwise
*/
public bool isOperator(char c)
{
return c == LS.PLUS || c == LS.TILDE || c == LS.MINUS ||
c == LS.STAR || c == LS.FORWARD_SLASH || c == LS.AMPERSAND ||
c == LS.CARET || c == LS.EXCLAMATION || c == LS.SHEFFER_STROKE ||
c == LS.LESS_THAN || c == LS.BIGGER_THAN;
}
/**
* Checks if the provided character is a splitter
*
* Params:
* c = the character to check
* Returns: `true` if it is a splitter, `false`
* otherwise
*/
public bool isSplitter(char c)
{
return c == LS.SEMI_COLON || c == LS.COMMA || c == LS.L_PAREN ||
c == LS.R_PAREN || c == LS.L_BRACK || c == LS.R_BRACK ||
c == LS.PERCENT || c == LS.L_BRACE || c == LS.R_BRACE ||
c == LS.EQUALS || c == LS.DOT || c == LS.COLON ||
isOperator(c) || isWhite(c);
}
/**
* Checks if the provided character is a
* numerical size encoder
*
* Params:
* character = the character to check
* Returns: `true` if so, `false` otherwise
*/
public bool isNumericalEncoder_Size(char character)
{
return character == LS.ENC_BYTE || character == LS.ENC_WORD ||
character == LS.ENC_INT || character == LS.ENC_LONG;
}
/**
* Checks if the provided character is a
* numerical signage encoder
*
* Params:
* character = the character to check
* Returns: `true` if so, `false` otherwise
*/
public bool isNumericalEncoder_Signage(char character)
{
return character == LS.ENC_SIGNED || character == LS.ENC_UNSIGNED;
}
/**
* Checks if the provided character is
* either a numerical size encoder
* or signage encoder
*
* Params:
* character = the character to check
* Returns: `true` if so, `false` otherwise
*/
public bool isNumericalEncoder(char character)
{
return isNumericalEncoder_Size(character) ||
isNumericalEncoder_Signage(character);
}
/**
* Checks if the given character is a valid
* escape character (something which would
* have followed a `\`)
*
* Params:
* character = the character to check
* Returns: `true` if so, `false` otherwise
*/
public bool isValidEscape_String(char character)
{
return character == LS.BACKSLASH || character == LS.DOUBLE_QUOTE || character == LS.SINGLE_QUOTE ||
character == LS.ESC_NOTHING || character == LS.ESC_NEWLINE || character == LS.ESC_CARRIAGE_RETURN ||
character == LS.ESC_TAB || character == LS.ESC_BELL;
}
/**
* Given a character, returns whether it is
* valid to precede a '.'.
*
* Returns: `true` if so, otherwise `false`
*/
public bool isValidDotPrecede(char character)
{
return character == LS.R_PAREN || character == LS.R_BRACK; // || isAlpha(character) || isDigit(character);
}


@@ -0,0 +1,124 @@
module tlang.compiler.lexer.kinds.arr;
import tlang.compiler.lexer.core;
/**
* An array-based tokenizer which takes a
* provided array of `Token[]`. Useful
* for testing parser-only related things
* with concrete tokens
*/
public final class ArrLexer : LexerInterface
{
/**
* The concrete token source
*/
private Token[] tokens;
/**
* Position in the `tokens` array
*/
private ulong tokenPtr = 0;
/**
* Constructs a new `ArrLexer` (dummy lexer) with
* the tokens already in concrete form in the
* provided array.
*
* Params:
* tokens = the `Token[]`
*/
this(Token[] tokens)
{
this.tokens = tokens;
}
/**
* Returns the token at the current cursor
* position
*
* Returns: the `Token`
*/
public Token getCurrentToken()
{
return tokens[tokenPtr];
}
/**
* Moves the cursor one token forward
*/
public void nextToken()
{
tokenPtr++;
}
/**
* Moves the cursor one token backwards
*/
public void previousToken()
{
tokenPtr--;
}
/**
* Sets the position of the cursor
*
* Params:
* cursor = the new position
*/
public void setCursor(ulong cursor)
{
this.tokenPtr = cursor;
}
/**
* Retrieves the cursor's current position
*
* Returns: the position
*/
public ulong getCursor()
{
return this.tokenPtr;
}
/**
* Checks whether more tokens are available
* or not
*
* Returns: true if more tokens are available, false otherwise
*/
public bool hasTokens()
{
return tokenPtr < tokens.length;
}
/**
* Get the line position of the lexer in the source text
*
* Returns: the position
*/
public ulong getLine()
{
return 0; // TODO: anything meaningful?
}
/**
* Get the column position of the lexer in the source text
*
* Returns: the position
*/
public ulong getColumn()
{
return 0; // TODO: anything meaningful?
}
/**
* Exhaustively provide a list of all tokens
*
* Returns: a `Token[]` containing all tokens
*/
public Token[] getTokens()
{
return tokens;
}
}

File diff suppressed because it is too large


@@ -2008,6 +2008,124 @@ public final class Parser
return statement;
}
import std.container.slist : SList;
private SList!(Token) commentStack;
private void pushComment(Token commentToken)
{
// Sanity check
assert(getSymbolType(commentToken) == SymbolType.SINGLE_LINE_COMMENT ||
getSymbolType(commentToken) == SymbolType.MULTI_LINE_COMMENT
);
// Push it onto top of stack
commentStack.insertFront(commentToken);
}
//TODO: Add a popToken() (also think if we want a stack-based mechanism)
private bool hasCommentsOnStack()
{
return getCommentCount() != 0;
}
private ulong getCommentCount()
{
import std.range : walkLength;
return walkLength(commentStack[]);
}
private void parseComment()
{
gprintln("parseComment(): Enter", DebugType.WARNING);
Token curCommentToken = lexer.getCurrentToken();
pushComment(curCommentToken);
// TODO: Do something here like placing it on some kind of stack
gprintln("Comment is: '"~curCommentToken.getToken()~"'");
lexer.nextToken(); // Move off comment
gprintln("parseComment(): Leave", DebugType.WARNING);
}
/**
* Tests the handling of comments
*/
unittest
{
import tlang.compiler.lexer.kinds.arr : ArrLexer;
string sourceCode = `module myCommentModule;
// Hello`;
LexerInterface currentLexer = new BasicLexer(sourceCode);
(cast(BasicLexer)currentLexer).performLex();
Parser parser = new Parser(currentLexer);
try
{
Module modulle = parser.parse();
assert(parser.hasCommentsOnStack());
assert(parser.getCommentCount() == 1);
}
catch(TError e)
{
assert(false);
}
sourceCode = `module myCommntedModule;
/*Hello */
/* Hello*/`;
currentLexer = new BasicLexer(sourceCode);
(cast(BasicLexer)currentLexer).performLex();
parser = new Parser(currentLexer);
try
{
Module modulle = parser.parse();
assert(parser.hasCommentsOnStack());
assert(parser.getCommentCount() == 2);
}
catch(TError e)
{
assert(false);
}
sourceCode = `module myCommentedModule;
void function()
{
/*Hello */
/* Hello */
// Hello
//Hello
}
`;
currentLexer = new BasicLexer(sourceCode);
(cast(BasicLexer)currentLexer).performLex();
parser = new Parser(currentLexer);
try
{
Module modulle = parser.parse();
assert(parser.hasCommentsOnStack());
assert(parser.getCommentCount() == 4);
}
catch(TError e)
{
assert(false);
}
}
// TODO: We need to add `parseComment()`
// support here (see issue #84)
// TODO: This is currently dead code and ought to be used/implemented
private Statement parseStatement(SymbolType terminatingSymbol = SymbolType.SEMICOLON)
{
gprintln("parseStatement(): Enter", DebugType.WARNING);
@@ -2080,6 +2198,12 @@ public final class Parser
{
statement = parseDerefAssignment();
}
/* If it is a kind-of comment */
else if(symbol == SymbolType.SINGLE_LINE_COMMENT || symbol == SymbolType.MULTI_LINE_COMMENT)
{
gprintln("COMMENTS NOT YET PROPERLY SUPPORTED", DebugType.ERROR);
parseComment();
}
/* Error out */
else
{
@@ -2303,6 +2427,12 @@ public final class Parser
modulle.addStatement(externStatement);
}
/* If it is a kind-of comment */
else if(symbol == SymbolType.SINGLE_LINE_COMMENT || symbol == SymbolType.MULTI_LINE_COMMENT)
{
gprintln("COMMENTS NOT YET PROPERLY SUPPORTED", DebugType.ERROR);
parseComment();
}
else
{
expect("parse(): Unknown '" ~ tok.getToken() ~ "'");


@@ -290,6 +290,16 @@ public enum SymbolType
*/
GENERIC_TYPE_DECLARE,
/**
* Multi-line comment (forward-slash-star)
*/
MULTI_LINE_COMMENT,
/**
* Single-line comment (forward-slash-slash)
*/
SINGLE_LINE_COMMENT,
/**
* Unknown symbol
*/
@@ -780,6 +790,16 @@ public SymbolType getSymbolType(Token tokenIn)
{
return SymbolType.STAR;
}
/* Multi-line comment (forward-slash-star) check */
else if(token[0] == '/' && token.length >= 2 && token[1] == '*')
{
return SymbolType.MULTI_LINE_COMMENT;
}
/* Single-line comment (forward-slash-slash) check */
else if(token[0] == '/' && token.length >= 2 && token[1] == '/')
{
return SymbolType.SINGLE_LINE_COMMENT;
}
/* Divide `/` operator check */
else if(token[0] == '/')
{