Lexer now returns more descriptive tokens

Adds more useful tokens, in particular one for each possible opcode. This makes parsing a simpler task to reason about, as we're now just checking against an enum rather than doing a string check in linear time. It makes more sense to do this in the tokeniser, as the local data from the buffer will most likely be in the cache because the buffer is contiguously allocated. While linear time checks on strings will always be slow, doing them in the parser means checking strings that may be allocated in a variety of different places. This makes caching a harder task, whereas with this approach we're less likely to have cache misses as long as the buffer stays there.
author: Aryadev Chavali <aryadev@aryadevchavali.com> 2023-11-01 14:38:59 +0000
committer: Aryadev Chavali <aryadev@aryadevchavali.com> 2023-11-01 15:09:47 +0000
commit: 93d234cd48404867b1d80a727d17a4b4a0726e1b (patch)
tree: b9ff5d75b4e2d6d9b481d149d6103254dbc46d73 /asm/lexer.c
parent: 727081f99a5065787dd3d035f92ef9f502f3622c (diff)
download: ovm-93d234cd48404867b1d80a727d17a4b4a0726e1b.tar.gz
ovm-93d234cd48404867b1d80a727d17a4b4a0726e1b.tar.bz2
ovm-93d234cd48404867b1d80a727d17a4b4a0726e1b.zip
1 files changed, 351 insertions, 27 deletions
diff --git a/asm/lexer.c b/asm/lexer.c
index a02c8e1..a4d9f5a 100644
--- a/asm/lexer.c
+++ b/asm/lexer.c
@@ -10,11 +10,14 @@
  * Description: Lexer for assembly language
  */
 
+#include <assert.h>
 #include <ctype.h>
 #include <stdbool.h>
 #include <stdio.h>
 #include <string.h>
 
+#include <lib/inst.h>
+
 #include "./lexer.h"
 
 const char *token_type_as_cstr(token_type_t type)
@@ -25,6 +28,46 @@ const char *token_type_as_cstr(token_type_t type)
     return "LITERAL_NUMBER";
   case TOKEN_LITERAL_CHAR:
     return "LITERAL_CHAR";
+  case TOKEN_NOOP:
+    return "NOOP";
+  case TOKEN_HALT:
+    return "HALT";
+  case TOKEN_PUSH:
+    return "PUSH";
+  case TOKEN_POP:
+    return "POP";
+  case TOKEN_PUSH_REG:
+    return "PUSH_REG";
+  case TOKEN_MOV:
+    return "MOV";
+  case TOKEN_DUP:
+    return "DUP";
+  case TOKEN_NOT:
+    return "NOT";
+  case TOKEN_OR:
+    return "OR";
+  case TOKEN_AND:
+    return "AND";
+  case TOKEN_XOR:
+    return "XOR";
+  case TOKEN_EQ:
+    return "EQ";
+  case TOKEN_LT:
+    return "LT";
+  case TOKEN_LTE:
+    return "LTE";
+  case TOKEN_GT:
+    return "GT";
+  case TOKEN_GTE:
+    return "GTE";
+  case TOKEN_PLUS:
+    return "PLUS";
+  case TOKEN_PRINT:
+    return "PRINT";
+  case TOKEN_JUMP:
+    return "JUMP";
+  case TOKEN_JUMP_IF:
+    return "JUMP_IF";
   case TOKEN_SYMBOL:
     return "SYMBOL";
   }
@@ -52,32 +95,179 @@ size_t space_left(buffer_t *buffer)
   return buffer->available - 1 - buffer->used;
 }
 
+char uppercase(char c)
+{
+  if (c >= 'a' && c <= 'z')
+    return (c - 'a') + 'A';
+  return c;
+}
+
 bool is_symbol(char c)
 {
   return isalpha(c) || c == '-' || c == '_' || c == '.';
 }
 
-char uppercase(char c)
+bool is_valid_hex_char(char c)
 {
-  if (c >= 'a' && c <= 'z')
-    return (c - 'a') + 'A';
-  return c;
+  return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') ||
+         (c >= 'A' && c <= 'F');
 }
 
 token_t tokenise_symbol(buffer_t *buffer, size_t *column)
 {
-  token_t token = {.type = TOKEN_SYMBOL, .str_size = 0, .column = *column};
-  for (; token.str_size < space_left(buffer) &&
-         is_symbol(buffer->data[buffer->used + token.str_size]);
-       ++token.str_size)
-    buffer->data[buffer->used + token.str_size] =
-        uppercase(buffer->data[buffer->used + token.str_size]);
-  token.str = calloc(token.str_size + 1, 1);
-  memcpy(token.str, buffer->data + buffer->used, token.str_size);
-  token.str[token.str_size] = '\0';
-  buffer->used += token.str_size;
-  *column += token.str_size;
-  return token;
+  static_assert(NUMBER_OF_OPCODES == 70, "tokenise_buffer: Out of date!");
+
+  size_t sym_size = 0;
+  for (; sym_size < space_left(buffer) &&
+         is_symbol(buffer->data[buffer->used + sym_size]);
+       ++sym_size)
+    buffer->data[buffer->used + sym_size] =
+        uppercase(buffer->data[buffer->used + sym_size]);
+
+  token_t ret  = {0};
+  char *opcode = (char *)buffer->data + buffer->used;
+
+  bool is_opcode    = true;
+  token_type_t type = 0;
+  size_t offset     = 0;
+
+  if (sym_size == 4 && strncmp(opcode, "NOOP", 4) == 0)
+  {
+    offset = 4;
+    type   = TOKEN_NOOP;
+  }
+  else if (sym_size == 4 && strncmp(opcode, "HALT", 4) == 0)
+  {
+    offset = 4;
+    type   = TOKEN_HALT;
+  }
+  else if (sym_size >= 8 && strncmp(opcode, "PUSH.REG", 8) == 0)
+  {
+    offset = 8;
+    type   = TOKEN_PUSH_REG;
+  }
+  else if (sym_size >= 4 && strncmp(opcode, "PUSH", 4) == 0)
+  {
+    offset = 4;
+    type   = TOKEN_PUSH;
+  }
+  else if (sym_size >= 3 && strncmp(opcode, "POP", 3) == 0)
+  {
+    offset = 3;
+    type   = TOKEN_POP;
+  }
+  else if (sym_size >= 3 && strncmp(opcode, "MOV", 3) == 0)
+  {
+    offset = 3;
+    type   = TOKEN_MOV;
+  }
+  else if (sym_size >= 3 && strncmp(opcode, "DUP", 3) == 0)
+  {
+    offset = 3;
+    type   = TOKEN_DUP;
+  }
+  else if (sym_size >= 3 && strncmp(opcode, "NOT", 3) == 0)
+  {
+    offset = 3;
+    type   = TOKEN_NOT;
+  }
+  else if (sym_size >= 2 && strncmp(opcode, "OR", 2) == 0)
+  {
+    offset = 2;
+    type   = TOKEN_OR;
+  }
+  else if (sym_size >= 3 && strncmp(opcode, "AND", 3) == 0)
+  {
+    offset = 3;
+    type   = TOKEN_AND;
+  }
+  else if (sym_size >= 3 && strncmp(opcode, "XOR", 3) == 0)
+  {
+    offset = 3;
+    type   = TOKEN_XOR;
+  }
+  else if (sym_size >= 2 && strncmp(opcode, "EQ", 2) == 0)
+  {
+    offset = 2;
+    type   = TOKEN_EQ;
+  }
+  else if (sym_size >= 3 && strncmp(opcode, "LTE", 3) == 0)
+  {
+    offset = 3;
+    type   = TOKEN_LTE;
+  }
+  else if (sym_size >= 2 && strncmp(opcode, "LT", 2) == 0)
+  {
+    offset = 2;
+    type   = TOKEN_LT;
+  }
+  else if (sym_size >= 3 && strncmp(opcode, "GTE", 3) == 0)
+  {
+    offset = 3;
+    type   = TOKEN_GTE;
+  }
+  else if (sym_size >= 2 && strncmp(opcode, "GT", 2) == 0)
+  {
+    offset = 2;
+    type   = TOKEN_GT;
+  }
+  else if (sym_size >= 4 && strncmp(opcode, "PLUS", 4) == 0)
+  {
+    offset = 4;
+    type   = TOKEN_PLUS;
+  }
+  else if (sym_size >= 5 && strncmp(opcode, "PRINT", 5) == 0)
+  {
+    offset = 5;
+    type   = TOKEN_PRINT;
+  }
+  else if (sym_size >= 7 && strncmp(opcode, "JUMP.IF", 7) == 0)
+  {
+    offset = 7;
+    type   = TOKEN_JUMP_IF;
+  }
+  else if (sym_size >= 4 && strncmp(opcode, "JUMP", 4) == 0)
+  {
+    offset = 4;
+    type   = TOKEN_JUMP;
+  }
+  else
+    is_opcode = false;
+
+  if (!is_opcode)
+  {
+    // Just a symbol, so no further manipulation
+    char *sym = malloc(sym_size + 1);
+    memcpy(sym, opcode, sym_size);
+    sym[sym_size] = '\0';
+    ret           = (token_t){.type     = TOKEN_SYMBOL,
+                              .str      = sym,
+                              .column   = *column,
+                              .str_size = sym_size};
+  }
+  else
+  {
+    ret.type   = type;
+    ret.column = *column;
+    if (offset == sym_size)
+    {
+      // There's no more to the string
+      ret.str    = malloc(1);
+      ret.str[0] = '\0';
+    }
+    else
+    {
+      // t.str is the remaining part of the string after the
+      // opcode
+      ret.str = calloc(sym_size - offset + 1, 1);
+      memcpy(ret.str, opcode + offset, sym_size - offset);
+      ret.str[sym_size - offset] = '\0';
+    }
+    ret.str_size = sym_size - offset;
+  }
+  *column += sym_size;
+  buffer->used += sym_size;
+  return ret;
 }
 
 token_t tokenise_number_literal(buffer_t *buffer, size_t *column)
@@ -98,12 +288,6 @@ token_t tokenise_number_literal(buffer_t *buffer, size_t *column)
   return token;
 }
 
-bool is_valid_hex_char(char c)
-{
-  return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') ||
-         (c >= 'A' && c <= 'F');
-}
-
 token_t tokenise_hex_literal(buffer_t *buffer, size_t *column)
 {
   // For the x part of the literal
@@ -125,7 +309,6 @@ token_t tokenise_hex_literal(buffer_t *buffer, size_t *column)
 
   // Setup the first two characters
   token.str_size += 2;
-  printf("hex_literal: %s, %lu\n", token.str, token.str_size);
   return token;
 }
 
@@ -133,8 +316,9 @@ token_t tokenise_char_literal(buffer_t *buffer, size_t *column)
 {
   token_t token = {
       .type = TOKEN_LITERAL_CHAR, .str_size = 1, .column = *column};
-  token.str    = calloc(1, 1);
+  token.str    = calloc(2, 1);
   token.str[0] = buffer->data[buffer->used + 1];
+  token.str[1] = '\0';
   buffer->used += 3;
   *column += 3;
   return token;
@@ -184,7 +368,146 @@ lerr_t tokenise_buffer(buffer_t *buffer, token_stream_t *tokens_ptr)
              is_valid_hex_char(buffer->data[buffer->used + 1]))
       t = tokenise_hex_literal(buffer, &column);
     else if (is_symbol(c))
-      t = tokenise_symbol(buffer, &column);
+    {
+      static_assert(NUMBER_OF_OPCODES == 70, "tokenise_buffer: Out of date!");
+      token_t token = tokenise_symbol(buffer, &column);
+      char *opcode  = token.str;
+
+      bool is_opcode    = true;
+      token_type_t type = 0;
+      size_t offset     = 0;
+
+      if (token.str_size == 4 && strncmp(opcode, "NOOP", 4) == 0)
+      {
+        offset = 4;
+        type   = TOKEN_NOOP;
+      }
+      else if (token.str_size == 4 && strncmp(opcode, "HALT", 4) == 0)
+      {
+        offset = 4;
+        type   = TOKEN_HALT;
+      }
+      else if (token.str_size >= 8 && strncmp(opcode, "PUSH.REG", 8) == 0)
+      {
+        offset = 8;
+        type   = TOKEN_PUSH;
+      }
+      else if (token.str_size >= 4 && strncmp(opcode, "PUSH", 4) == 0)
+      {
+        offset = 4;
+        type   = TOKEN_PUSH;
+      }
+      else if (token.str_size >= 3 && strncmp(opcode, "POP", 3) == 0)
+      {
+        offset = 3;
+        type   = TOKEN_POP;
+      }
+      else if (token.str_size >= 3 && strncmp(opcode, "MOV", 3) == 0)
+      {
+        offset = 3;
+        type   = TOKEN_MOV;
+      }
+      else if (token.str_size >= 3 && strncmp(opcode, "DUP", 3) == 0)
+      {
+        offset = 3;
+        type   = TOKEN_DUP;
+      }
+      else if (token.str_size >= 3 && strncmp(opcode, "NOT", 3) == 0)
+      {
+        offset = 3;
+        type   = TOKEN_NOT;
+      }
+      else if (token.str_size >= 2 && strncmp(opcode, "OR", 2) == 0)
+      {
+        offset = 2;
+        type   = TOKEN_OR;
+      }
+      else if (token.str_size >= 3 && strncmp(opcode, "AND", 3) == 0)
+      {
+        offset = 3;
+        type   = TOKEN_AND;
+      }
+      else if (token.str_size >= 3 && strncmp(opcode, "XOR", 3) == 0)
+      {
+        offset = 3;
+        type   = TOKEN_XOR;
+      }
+      else if (token.str_size >= 2 && strncmp(opcode, "EQ", 2) == 0)
+      {
+        offset = 2;
+        type   = TOKEN_EQ;
+      }
+      else if (token.str_size >= 3 && strncmp(opcode, "LTE", 3) == 0)
+      {
+        offset = 3;
+        type   = TOKEN_LTE;
+      }
+      else if (token.str_size >= 2 && strncmp(opcode, "LT", 2) == 0)
+      {
+        offset = 2;
+        type   = TOKEN_LT;
+      }
+      else if (token.str_size >= 3 && strncmp(opcode, "GTE", 3) == 0)
+      {
+        offset = 3;
+        type   = TOKEN_GTE;
+      }
+      else if (token.str_size >= 2 && strncmp(opcode, "GT", 2) == 0)
+      {
+        offset = 2;
+        type   = TOKEN_GT;
+      }
+      else if (token.str_size >= 4 && strncmp(opcode, "PLUS", 4) == 0)
+      {
+        offset = 4;
+        type   = TOKEN_PLUS;
+      }
+      else if (token.str_size >= 5 && strncmp(opcode, "PRINT", 5) == 0)
+      {
+        offset = 5;
+        type   = TOKEN_PRINT;
+      }
+      else if (token.str_size >= 1 && strncmp(opcode, "JUMP.IF", 7) == 0)
+      {
+        offset = 7;
+        type   = TOKEN_JUMP_IF;
+      }
+      else if (token.str_size >= 6 && strncmp(opcode, "JUMP", 6) == 0)
+      {
+        offset = 6;
+        type   = TOKEN_JUMP;
+      }
+      else
+      {
+        is_opcode = false;
+        t         = token;
+      }
+
+      if (!is_opcode)
+        // Just a symbol, so no further manipulation
+        t = token;
+      else
+      {
+        t.type   = type;
+        t.column = token.column;
+        if (offset == token.str_size)
+        {
+          // There's no more to the string
+          t.str    = malloc(1);
+          t.str[0] = '\0';
+        }
+        else
+        {
+          // t.str is the remaining part of the string after the
+          // opcode
+          t.str = calloc(token.str_size - offset + 1, 1);
+          memcpy(t.str, token.str + offset, token.str_size - offset);
+          t.str[token.str_size - offset] = '\0';
+        }
+        t.str_size = token.str_size - offset;
+        free(token.str);
+      }
+    }
     else if (c == '\'')
     {
       if (space_left(buffer) < 2)
@@ -222,12 +545,13 @@ lerr_t tokenise_buffer(buffer_t *buffer, token_stream_t *tokens_ptr)
         }
 
         t = (token_t){.type     = TOKEN_LITERAL_CHAR,
-                      .str      = malloc(1),
+                      .str      = malloc(2),
                       .str_size = 1,
                       .column   = column};
-        column += 4;
+        column += 2;
         buffer->used += 4;
         t.str[0] = escape;
+        t.str[1] = '\0';
       }
       else
         t = tokenise_char_literal(buffer, &column);
author	Aryadev Chavali <aryadev@aryadevchavali.com>	2023-11-01 14:38:59 +0000
committer	Aryadev Chavali <aryadev@aryadevchavali.com>	2023-11-01 15:09:47 +0000
commit	93d234cd48404867b1d80a727d17a4b4a0726e1b (patch)
tree	b9ff5d75b4e2d6d9b481d149d6103254dbc46d73 /asm/lexer.c
parent	727081f99a5065787dd3d035f92ef9f502f3622c (diff)
download	ovm-93d234cd48404867b1d80a727d17a4b4a0726e1b.tar.gz ovm-93d234cd48404867b1d80a727d17a4b4a0726e1b.tar.bz2 ovm-93d234cd48404867b1d80a727d17a4b4a0726e1b.zip