From 93d234cd48404867b1d80a727d17a4b4a0726e1b Mon Sep 17 00:00:00 2001
From: Aryadev Chavali <aryadev@aryadevchavali.com>
Date: Wed, 1 Nov 2023 14:38:59 +0000
Subject: Lexer now returns more descriptive tokens

The lexer now emits more useful tokens, in particular one for each
possible opcode.  This makes parsing a simpler task to reason about, as
now we're just checking against an enum rather than doing a string
check in linear time.

It makes more sense to do this in the tokeniser, as the data being
compared will most likely already be in the cache because the buffer
is contiguously allocated.  Linear time checks on strings will always
be slow, but when doing them in the parser we have to check strings
that may be allocated in a variety of different places, which makes
good cache behaviour harder to achieve.  With this approach we're less
likely to have cache misses as long as the buffer stays in the cache.
---
 asm/lexer.c | 378 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 351 insertions(+), 27 deletions(-)

(limited to 'asm/lexer.c')

diff --git a/asm/lexer.c b/asm/lexer.c
index a02c8e1..a4d9f5a 100644
--- a/asm/lexer.c
+++ b/asm/lexer.c
@@ -10,11 +10,14 @@
  * Description: Lexer for assembly language
  */
 
+#include <assert.h>
 #include <ctype.h>
 #include <stdbool.h>
 #include <stdio.h>
 #include <string.h>
 
+#include <lib/inst.h>
+
 #include "./lexer.h"
 
 const char *token_type_as_cstr(token_type_t type)
@@ -25,6 +28,46 @@ const char *token_type_as_cstr(token_type_t type)
     return "LITERAL_NUMBER";
   case TOKEN_LITERAL_CHAR:
     return "LITERAL_CHAR";
+  case TOKEN_NOOP:
+    return "NOOP";
+  case TOKEN_HALT:
+    return "HALT";
+  case TOKEN_PUSH:
+    return "PUSH";
+  case TOKEN_POP:
+    return "POP";
+  case TOKEN_PUSH_REG:
+    return "PUSH_REG";
+  case TOKEN_MOV:
+    return "MOV";
+  case TOKEN_DUP:
+    return "DUP";
+  case TOKEN_NOT:
+    return "NOT";
+  case TOKEN_OR:
+    return "OR";
+  case TOKEN_AND:
+    return "AND";
+  case TOKEN_XOR:
+    return "XOR";
+  case TOKEN_EQ:
+    return "EQ";
+  case TOKEN_LT:
+    return "LT";
+  case TOKEN_LTE:
+    return "LTE";
+  case TOKEN_GT:
+    return "GT";
+  case TOKEN_GTE:
+    return "GTE";
+  case TOKEN_PLUS:
+    return "PLUS";
+  case TOKEN_PRINT:
+    return "PRINT";
+  case TOKEN_JUMP:
+    return "JUMP";
+  case TOKEN_JUMP_IF:
+    return "JUMP_IF";
   case TOKEN_SYMBOL:
     return "SYMBOL";
   }
@@ -52,32 +95,179 @@ size_t space_left(buffer_t *buffer)
   return buffer->available - 1 - buffer->used;
 }
 
+char uppercase(char c) // maps 'a'..'z' onto 'A'..'Z', without using <ctype.h>
+{
+  if (c >= 'a' && c <= 'z')
+    return (c - 'a') + 'A'; // offset within the alphabet, rebased onto 'A'
+  return c; // every other character is returned unchanged
+}
+
 bool is_symbol(char c)
 {
   return isalpha(c) || c == '-' || c == '_' || c == '.';
 }
 
-char uppercase(char c)
+bool is_valid_hex_char(char c) // true iff c is one of 0-9, a-f or A-F
 {
-  if (c >= 'a' && c <= 'z')
-    return (c - 'a') + 'A';
-  return c;
+  return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') ||
+         (c >= 'A' && c <= 'F');
 }
 
 token_t tokenise_symbol(buffer_t *buffer, size_t *column)
 {
-  token_t token = {.type = TOKEN_SYMBOL, .str_size = 0, .column = *column};
-  for (; token.str_size < space_left(buffer) &&
-         is_symbol(buffer->data[buffer->used + token.str_size]);
-       ++token.str_size)
-    buffer->data[buffer->used + token.str_size] =
-        uppercase(buffer->data[buffer->used + token.str_size]);
-  token.str = calloc(token.str_size + 1, 1);
-  memcpy(token.str, buffer->data + buffer->used, token.str_size);
-  token.str[token.str_size] = '\0';
-  buffer->used += token.str_size;
-  *column += token.str_size;
-  return token;
+  // Lexes the symbol at buffer->data[buffer->used] and classifies it.
+  // The run of is_symbol() characters is upper-cased in place (so the
+  // buffer itself is mutated) and compared against the opcode names.
+  // NOOP and HALT must match exactly; every other opcode matches as a
+  // prefix, with the unmatched tail copied into the token's str.  The
+  // longer name of an overlapping pair (PUSH.REG/PUSH, LTE/LT, GTE/GT,
+  // JUMP.IF/JUMP) is tested first so the shorter one cannot shadow it.
+  // Anything else becomes a TOKEN_SYMBOL holding the whole symbol.
+  // The returned str is heap allocated and NUL terminated; buffer->used
+  // and *column are advanced past the whole symbol.
+  static_assert(NUMBER_OF_OPCODES == 70, "tokenise_symbol: Out of date!");
+
+  size_t sym_size = 0;
+  for (; sym_size < space_left(buffer) &&
+         is_symbol(buffer->data[buffer->used + sym_size]);
+       ++sym_size)
+    buffer->data[buffer->used + sym_size] =
+        uppercase(buffer->data[buffer->used + sym_size]);
+
+  token_t ret  = {0};
+  char *opcode = (char *)buffer->data + buffer->used;
+
+  bool is_opcode    = true;
+  token_type_t type = 0;
+  size_t offset     = 0; // length of the opcode name that matched
+
+  if (sym_size == 4 && strncmp(opcode, "NOOP", 4) == 0)
+  {
+    offset = 4;
+    type   = TOKEN_NOOP;
+  }
+  else if (sym_size == 4 && strncmp(opcode, "HALT", 4) == 0)
+  {
+    offset = 4;
+    type   = TOKEN_HALT;
+  }
+  else if (sym_size >= 8 && strncmp(opcode, "PUSH.REG", 8) == 0)
+  {
+    offset = 8;
+    type   = TOKEN_PUSH_REG;
+  }
+  else if (sym_size >= 4 && strncmp(opcode, "PUSH", 4) == 0)
+  {
+    offset = 4;
+    type   = TOKEN_PUSH;
+  }
+  else if (sym_size >= 3 && strncmp(opcode, "POP", 3) == 0)
+  {
+    offset = 3;
+    type   = TOKEN_POP;
+  }
+  else if (sym_size >= 3 && strncmp(opcode, "MOV", 3) == 0)
+  {
+    offset = 3;
+    type   = TOKEN_MOV;
+  }
+  else if (sym_size >= 3 && strncmp(opcode, "DUP", 3) == 0)
+  {
+    offset = 3;
+    type   = TOKEN_DUP;
+  }
+  else if (sym_size >= 3 && strncmp(opcode, "NOT", 3) == 0)
+  {
+    offset = 3;
+    type   = TOKEN_NOT;
+  }
+  else if (sym_size >= 2 && strncmp(opcode, "OR", 2) == 0)
+  {
+    offset = 2;
+    type   = TOKEN_OR;
+  }
+  else if (sym_size >= 3 && strncmp(opcode, "AND", 3) == 0)
+  {
+    offset = 3;
+    type   = TOKEN_AND;
+  }
+  else if (sym_size >= 3 && strncmp(opcode, "XOR", 3) == 0)
+  {
+    offset = 3;
+    type   = TOKEN_XOR;
+  }
+  else if (sym_size >= 2 && strncmp(opcode, "EQ", 2) == 0)
+  {
+    offset = 2;
+    type   = TOKEN_EQ;
+  }
+  else if (sym_size >= 3 && strncmp(opcode, "LTE", 3) == 0)
+  {
+    offset = 3;
+    type   = TOKEN_LTE;
+  }
+  else if (sym_size >= 2 && strncmp(opcode, "LT", 2) == 0)
+  {
+    offset = 2;
+    type   = TOKEN_LT;
+  }
+  else if (sym_size >= 3 && strncmp(opcode, "GTE", 3) == 0)
+  {
+    offset = 3;
+    type   = TOKEN_GTE;
+  }
+  else if (sym_size >= 2 && strncmp(opcode, "GT", 2) == 0)
+  {
+    offset = 2;
+    type   = TOKEN_GT;
+  }
+  else if (sym_size >= 4 && strncmp(opcode, "PLUS", 4) == 0)
+  {
+    offset = 4;
+    type   = TOKEN_PLUS;
+  }
+  else if (sym_size >= 5 && strncmp(opcode, "PRINT", 5) == 0)
+  {
+    offset = 5;
+    type   = TOKEN_PRINT;
+  }
+  else if (sym_size >= 7 && strncmp(opcode, "JUMP.IF", 7) == 0)
+  {
+    offset = 7;
+    type   = TOKEN_JUMP_IF;
+  }
+  else if (sym_size >= 4 && strncmp(opcode, "JUMP", 4) == 0)
+  {
+    offset = 4;
+    type   = TOKEN_JUMP;
+  }
+  else
+    is_opcode = false;
+
+  if (!is_opcode)
+  {
+    // Just a symbol, so no further manipulation
+    char *sym = malloc(sym_size + 1); // NOTE(review): result is not checked
+    memcpy(sym, opcode, sym_size);
+    sym[sym_size] = '\0';
+    ret           = (token_t){.type     = TOKEN_SYMBOL,
+                              .str      = sym,
+                              .column   = *column,
+                              .str_size = sym_size};
+  }
+  else
+  {
+    ret.type   = type;
+    ret.column = *column;
+    // ret.str is whatever follows the opcode name (possibly nothing).
+    // calloc zero-fills, so the copy is always NUL terminated.
+    ret.str = calloc(sym_size - offset + 1, 1);
+    memcpy(ret.str, opcode + offset, sym_size - offset);
+    ret.str_size = sym_size - offset;
+  }
+  *column += sym_size;
+  buffer->used += sym_size;
+  return ret;
 }
 
 token_t tokenise_number_literal(buffer_t *buffer, size_t *column)
@@ -98,12 +288,6 @@ token_t tokenise_number_literal(buffer_t *buffer, size_t *column)
   return token;
 }
 
-bool is_valid_hex_char(char c)
-{
-  return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') ||
-         (c >= 'A' && c <= 'F');
-}
-
 token_t tokenise_hex_literal(buffer_t *buffer, size_t *column)
 {
   // For the x part of the literal
@@ -125,7 +309,6 @@ token_t tokenise_hex_literal(buffer_t *buffer, size_t *column)
 
   // Setup the first two characters
   token.str_size += 2;
-  printf("hex_literal: %s, %lu\n", token.str, token.str_size);
   return token;
 }
 
@@ -133,8 +316,9 @@ token_t tokenise_char_literal(buffer_t *buffer, size_t *column)
 {
   token_t token = {
       .type = TOKEN_LITERAL_CHAR, .str_size = 1, .column = *column};
-  token.str    = calloc(1, 1);
+  token.str    = calloc(2, 1);
   token.str[0] = buffer->data[buffer->used + 1];
+  token.str[1] = '\0';
   buffer->used += 3;
   *column += 3;
   return token;
@@ -184,7 +368,146 @@ lerr_t tokenise_buffer(buffer_t *buffer, token_stream_t *tokens_ptr)
              is_valid_hex_char(buffer->data[buffer->used + 1]))
       t = tokenise_hex_literal(buffer, &column);
     else if (is_symbol(c))
-      t = tokenise_symbol(buffer, &column);
+    {
+      // Classify the lexed symbol by opcode name.  The longer name of an
+      // overlapping pair is tested first so the shorter cannot shadow it.
+      // NOTE(review): tokenise_symbol performs the same classification.
+      static_assert(NUMBER_OF_OPCODES == 70, "tokenise_buffer: Out of date!");
+      token_t token = tokenise_symbol(buffer, &column);
+      char *opcode  = token.str;
+
+      bool is_opcode    = true;
+      token_type_t type = 0;
+      size_t offset     = 0;
+
+      if (token.str_size == 4 && strncmp(opcode, "NOOP", 4) == 0)
+      {
+        offset = 4;
+        type   = TOKEN_NOOP;
+      }
+      else if (token.str_size == 4 && strncmp(opcode, "HALT", 4) == 0)
+      {
+        offset = 4;
+        type   = TOKEN_HALT;
+      }
+      else if (token.str_size >= 8 && strncmp(opcode, "PUSH.REG", 8) == 0)
+      {
+        offset = 8;
+        type   = TOKEN_PUSH_REG;
+      }
+      else if (token.str_size >= 4 && strncmp(opcode, "PUSH", 4) == 0)
+      {
+        offset = 4;
+        type   = TOKEN_PUSH;
+      }
+      else if (token.str_size >= 3 && strncmp(opcode, "POP", 3) == 0)
+      {
+        offset = 3;
+        type   = TOKEN_POP;
+      }
+      else if (token.str_size >= 3 && strncmp(opcode, "MOV", 3) == 0)
+      {
+        offset = 3;
+        type   = TOKEN_MOV;
+      }
+      else if (token.str_size >= 3 && strncmp(opcode, "DUP", 3) == 0)
+      {
+        offset = 3;
+        type   = TOKEN_DUP;
+      }
+      else if (token.str_size >= 3 && strncmp(opcode, "NOT", 3) == 0)
+      {
+        offset = 3;
+        type   = TOKEN_NOT;
+      }
+      else if (token.str_size >= 2 && strncmp(opcode, "OR", 2) == 0)
+      {
+        offset = 2;
+        type   = TOKEN_OR;
+      }
+      else if (token.str_size >= 3 && strncmp(opcode, "AND", 3) == 0)
+      {
+        offset = 3;
+        type   = TOKEN_AND;
+      }
+      else if (token.str_size >= 3 && strncmp(opcode, "XOR", 3) == 0)
+      {
+        offset = 3;
+        type   = TOKEN_XOR;
+      }
+      else if (token.str_size >= 2 && strncmp(opcode, "EQ", 2) == 0)
+      {
+        offset = 2;
+        type   = TOKEN_EQ;
+      }
+      else if (token.str_size >= 3 && strncmp(opcode, "LTE", 3) == 0)
+      {
+        offset = 3;
+        type   = TOKEN_LTE;
+      }
+      else if (token.str_size >= 2 && strncmp(opcode, "LT", 2) == 0)
+      {
+        offset = 2;
+        type   = TOKEN_LT;
+      }
+      else if (token.str_size >= 3 && strncmp(opcode, "GTE", 3) == 0)
+      {
+        offset = 3;
+        type   = TOKEN_GTE;
+      }
+      else if (token.str_size >= 2 && strncmp(opcode, "GT", 2) == 0)
+      {
+        offset = 2;
+        type   = TOKEN_GT;
+      }
+      else if (token.str_size >= 4 && strncmp(opcode, "PLUS", 4) == 0)
+      {
+        offset = 4;
+        type   = TOKEN_PLUS;
+      }
+      else if (token.str_size >= 5 && strncmp(opcode, "PRINT", 5) == 0)
+      {
+        offset = 5;
+        type   = TOKEN_PRINT;
+      }
+      else if (token.str_size >= 7 && strncmp(opcode, "JUMP.IF", 7) == 0)
+      {
+        offset = 7;
+        type   = TOKEN_JUMP_IF;
+      }
+      else if (token.str_size >= 4 && strncmp(opcode, "JUMP", 4) == 0)
+      {
+        offset = 4;
+        type   = TOKEN_JUMP;
+      }
+      else
+        is_opcode = false;
+
+      if (!is_opcode)
+        // Just a symbol, so no further manipulation
+        t = token;
+      else
+      {
+        t.type   = type;
+        t.column = token.column;
+        if (offset == token.str_size)
+        {
+          // There's no more to the string
+          t.str    = malloc(1);
+          t.str[0] = '\0';
+        }
+        else
+        {
+          // t.str is the remaining part of the string after the
+          // opcode
+          t.str = calloc(token.str_size - offset + 1, 1);
+          memcpy(t.str, token.str + offset, token.str_size - offset);
+          t.str[token.str_size - offset] = '\0';
+        }
+        t.str_size = token.str_size - offset;
+        free(token.str);
+      }
+    }
     else if (c == '\'')
     {
       if (space_left(buffer) < 2)
@@ -222,12 +545,13 @@ lerr_t tokenise_buffer(buffer_t *buffer, token_stream_t *tokens_ptr)
         }
 
         t = (token_t){.type     = TOKEN_LITERAL_CHAR,
-                      .str      = malloc(1),
+                      .str      = malloc(2), // one character plus the NUL
                       .str_size = 1,
                       .column   = column};
-        column += 4;
+        column += 4; // must advance as far as buffer->used does below
         buffer->used += 4;
         t.str[0] = escape;
+        t.str[1] = '\0'; // terminate so str is usable as a C string
       }
       else
         t = tokenise_char_literal(buffer, &column);
-- 
cgit v1.2.3-13-gbd6f