1 files changed, 0 insertions, 623 deletions
diff --git a/asm/lexer.c b/asm/lexer.c
deleted file mode 100644
index a4905fb..0000000
--- a/asm/lexer.c
+++ /dev/null
@@ -1,623 +0,0 @@
-/* Copyright (C) 2023 Aryadev Chavali
-
- * You may distribute and modify this code under the terms of the
- * GPLv2 license.  You should have received a copy of the GPLv2
- * license with this file.  If not, please write to:
- * aryadev@aryadevchavali.com.
-
- * Created: 2023-10-24
- * Author: Aryadev Chavali
- * Description: Lexer for assembly language
- */
-
-#include <assert.h>
-#include <ctype.h>
-#include <stdbool.h>
-#include <stdio.h>
-#include <string.h>
-
-#include <lib/inst.h>
-
-#include "./lexer.h"
-
-const char *token_type_as_cstr(token_type_t type)
-{
-  switch (type)
-  {
-  case TOKEN_PP_USE:
-    return "PP_USE";
-  case TOKEN_PP_CONST:
-    return "PP_CONST";
-  case TOKEN_PP_END:
-    return "PP_END";
-  case TOKEN_PP_REFERENCE:
-    return "PP_REFERENCE";
-  case TOKEN_GLOBAL:
-    return "GLOBAL";
-  case TOKEN_STAR:
-    return "STAR";
-  case TOKEN_LITERAL_STRING:
-    return "LITERAL_STRING";
-  case TOKEN_LITERAL_NUMBER:
-    return "LITERAL_NUMBER";
-  case TOKEN_LITERAL_CHAR:
-    return "LITERAL_CHAR";
-  case TOKEN_NOOP:
-    return "NOOP";
-  case TOKEN_HALT:
-    return "HALT";
-  case TOKEN_PUSH:
-    return "PUSH";
-  case TOKEN_POP:
-    return "POP";
-  case TOKEN_PUSH_REG:
-    return "PUSH_REG";
-  case TOKEN_MOV:
-    return "MOV";
-  case TOKEN_DUP:
-    return "DUP";
-  case TOKEN_MALLOC:
-    return "MALLOC";
-  case TOKEN_MALLOC_STACK:
-    return "MALLOC_STACK";
-  case TOKEN_MSET:
-    return "MSET";
-  case TOKEN_MSET_STACK:
-    return "MSET_STACK";
-  case TOKEN_MGET:
-    return "MGET";
-  case TOKEN_MGET_STACK:
-    return "MGET_STACK";
-  case TOKEN_MDELETE:
-    return "MDELETE";
-  case TOKEN_MSIZE:
-    return "MSIZE";
-  case TOKEN_NOT:
-    return "NOT";
-  case TOKEN_OR:
-    return "OR";
-  case TOKEN_AND:
-    return "AND";
-  case TOKEN_XOR:
-    return "XOR";
-  case TOKEN_EQ:
-    return "EQ";
-  case TOKEN_LT:
-    return "LT";
-  case TOKEN_LTE:
-    return "LTE";
-  case TOKEN_GT:
-    return "GT";
-  case TOKEN_GTE:
-    return "GTE";
-  case TOKEN_PLUS:
-    return "PLUS";
-  case TOKEN_SUB:
-    return "SUB";
-  case TOKEN_MULT:
-    return "MULT";
-  case TOKEN_PRINT:
-    return "PRINT";
-  case TOKEN_JUMP_ABS:
-    return "JUMP_ABS";
-  case TOKEN_JUMP_STACK:
-    return "JUMP_STACK";
-  case TOKEN_JUMP_IF:
-    return "JUMP_IF";
-  case TOKEN_CALL:
-    return "CALL";
-  case TOKEN_CALL_STACK:
-    return "CALL_STACK";
-  case TOKEN_RET:
-    return "RET";
-  case TOKEN_SYMBOL:
-    return "SYMBOL";
-  }
-  return "";
-}
-
-const char *lerr_as_cstr(lerr_t lerr)
-{
-  switch (lerr)
-  {
-  case LERR_OK:
-    return "OK";
-  case LERR_INVALID_CHAR_LITERAL:
-    return "INVALID_CHAR_LITERAL";
-  case LERR_INVALID_PREPROCESSOR_DIRECTIVE:
-    return "INVALID_PREPROCESSOR_DIRECTIVE";
-  }
-  return "";
-}
-
-token_t token_copy(token_t t)
-{
-  token_t new = t;
-  new.str     = malloc(t.str_size + 1);
-  memcpy(new.str, t.str, t.str_size);
-  new.str[t.str_size] = '\0';
-  return new;
-}
-
-size_t space_left(buffer_t *buffer)
-{
-  if (buffer->available == buffer->used)
-    return 0;
-  return buffer->available - 1 - buffer->used;
-}
-
-char uppercase(char c)
-{
-  if (c >= 'a' && c <= 'z')
-    return (c - 'a') + 'A';
-  return c;
-}
-
-bool is_symbol(char c)
-{
-  return isalpha(c) || isdigit(c) || c == '-' || c == '_' || c == '.' ||
-         c == ':' || c == '(' || c == ')' || c == '%' || c == '$';
-}
-
-bool is_valid_hex_char(char c)
-{
-  return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') ||
-         (c >= 'A' && c <= 'F');
-}
-
-lerr_t tokenise_symbol(buffer_t *buffer, size_t *column, token_t *token)
-{
-  static_assert(NUMBER_OF_OPCODES == 98, "tokenise_buffer: Out of date!");
-
-  size_t sym_size = 0;
-  for (; sym_size < space_left(buffer) &&
-         is_symbol(buffer->data[buffer->used + sym_size]);
-       ++sym_size)
-    buffer->data[buffer->used + sym_size] =
-        uppercase(buffer->data[buffer->used + sym_size]);
-
-  token_t ret  = {0};
-  char *opcode = (char *)buffer->data + buffer->used;
-
-  bool is_opcode    = true;
-  token_type_t type = 0;
-  size_t offset     = 0;
-
-  if (sym_size > 1 && strncmp(opcode, "%", 1) == 0)
-  {
-    // Some preprocessing directive
-    if (sym_size > 6 && strncmp(opcode + 1, "CONST", 5) == 0)
-    {
-      type   = TOKEN_PP_CONST;
-      offset = 6;
-    }
-    else if (sym_size == 4 && strncmp(opcode + 1, "USE", 3) == 0)
-    {
-      type   = TOKEN_PP_USE;
-      offset = 4;
-    }
-    else if (sym_size == 4 && strncmp(opcode + 1, "END", 3) == 0)
-    {
-      type   = TOKEN_PP_END;
-      offset = 4;
-    }
-    else
-      return LERR_INVALID_PREPROCESSOR_DIRECTIVE;
-  }
-  else if (sym_size > 1 && strncmp(opcode, "$", 1) == 0)
-  {
-    // A reference to a preprocessing constant
-    offset = 1;
-    type   = TOKEN_PP_REFERENCE;
-  }
-  else if (sym_size == 4 && strncmp(opcode, "NOOP", 4) == 0)
-  {
-    offset = 4;
-    type   = TOKEN_NOOP;
-  }
-  else if (sym_size == 4 && strncmp(opcode, "HALT", 4) == 0)
-  {
-    offset = 4;
-    type   = TOKEN_HALT;
-  }
-  else if (sym_size > 9 && strncmp(opcode, "PUSH.REG.", 9) == 0)
-  {
-    offset = 9;
-    type   = TOKEN_PUSH_REG;
-  }
-  else if (sym_size > 5 && strncmp(opcode, "PUSH.", 5) == 0)
-  {
-    offset = 5;
-    type   = TOKEN_PUSH;
-  }
-  else if (sym_size > 4 && strncmp(opcode, "POP.", 4) == 0)
-  {
-    offset = 4;
-    type   = TOKEN_POP;
-  }
-  else if (sym_size > 4 && strncmp(opcode, "MOV.", 4) == 0)
-  {
-    offset = 4;
-    type   = TOKEN_MOV;
-  }
-  else if (sym_size > 4 && strncmp(opcode, "DUP.", 4) == 0)
-  {
-    offset = 4;
-    type   = TOKEN_DUP;
-  }
-  else if (sym_size > 13 && strncmp(opcode, "MALLOC.STACK.", 13) == 0)
-  {
-    offset = 13;
-    type   = TOKEN_MALLOC_STACK;
-  }
-  else if (sym_size > 7 && strncmp(opcode, "MALLOC.", 7) == 0)
-  {
-    offset = 7;
-    type   = TOKEN_MALLOC;
-  }
-  else if (sym_size > 11 && strncmp(opcode, "MSET.STACK.", 11) == 0)
-  {
-    offset = 11;
-    type   = TOKEN_MSET_STACK;
-  }
-  else if (sym_size > 5 && strncmp(opcode, "MSET.", 5) == 0)
-  {
-    offset = 5;
-    type   = TOKEN_MSET;
-  }
-  else if (sym_size > 11 && strncmp(opcode, "MGET.STACK.", 11) == 0)
-  {
-    offset = 11;
-    type   = TOKEN_MGET_STACK;
-  }
-  else if (sym_size > 5 && strncmp(opcode, "MGET.", 5) == 0)
-  {
-    offset = 5;
-    type   = TOKEN_MGET;
-  }
-  else if (sym_size == 7 && strncmp(opcode, "MDELETE", 7) == 0)
-  {
-    offset = 7;
-    type   = TOKEN_MDELETE;
-  }
-  else if (sym_size == 5 && strncmp(opcode, "MSIZE", 5) == 0)
-  {
-    offset = 5;
-    type   = TOKEN_MSIZE;
-  }
-  else if (sym_size > 4 && strncmp(opcode, "NOT.", 4) == 0)
-  {
-    offset = 4;
-    type   = TOKEN_NOT;
-  }
-  else if (sym_size > 3 && strncmp(opcode, "OR.", 3) == 0)
-  {
-    offset = 3;
-    type   = TOKEN_OR;
-  }
-  else if (sym_size > 4 && strncmp(opcode, "AND.", 4) == 0)
-  {
-    offset = 4;
-    type   = TOKEN_AND;
-  }
-  else if (sym_size > 4 && strncmp(opcode, "XOR.", 4) == 0)
-  {
-    offset = 4;
-    type   = TOKEN_XOR;
-  }
-  else if (sym_size >= 3 && strncmp(opcode, "EQ.", 3) == 0)
-  {
-    offset = 3;
-    type   = TOKEN_EQ;
-  }
-  else if (sym_size > 4 && strncmp(opcode, "LTE.", 4) == 0)
-  {
-    offset = 4;
-    type   = TOKEN_LTE;
-  }
-  else if (sym_size > 3 && strncmp(opcode, "LT.", 3) == 0)
-  {
-    offset = 3;
-    type   = TOKEN_LT;
-  }
-  else if (sym_size > 4 && strncmp(opcode, "GTE.", 4) == 0)
-  {
-    offset = 4;
-    type   = TOKEN_GTE;
-  }
-  else if (sym_size > 3 && strncmp(opcode, "GT.", 3) == 0)
-  {
-    offset = 3;
-    type   = TOKEN_GT;
-  }
-  else if (sym_size > 4 && strncmp(opcode, "SUB.", 4) == 0)
-  {
-    offset = 4;
-    type   = TOKEN_SUB;
-  }
-  else if (sym_size > 5 && strncmp(opcode, "PLUS.", 5) == 0)
-  {
-    offset = 5;
-    type   = TOKEN_PLUS;
-  }
-  else if (sym_size > 5 && strncmp(opcode, "MULT.", 5) == 0)
-  {
-    offset = 5;
-    type   = TOKEN_MULT;
-  }
-  else if (sym_size > 6 && strncmp(opcode, "PRINT.", 6) == 0)
-  {
-    offset = 6;
-    type   = TOKEN_PRINT;
-  }
-  else if (sym_size == 8 && strncmp(opcode, "JUMP.ABS", 8) == 0)
-  {
-    offset = 8;
-    type   = TOKEN_JUMP_ABS;
-  }
-  else if (sym_size == 10 && strncmp(opcode, "JUMP.STACK", 10) == 0)
-  {
-    offset = 10;
-    type   = TOKEN_JUMP_STACK;
-  }
-  else if (sym_size > 8 && strncmp(opcode, "JUMP.IF.", 8) == 0)
-  {
-    offset = 8;
-    type   = TOKEN_JUMP_IF;
-  }
-  else if (sym_size == 10 && strncmp(opcode, "CALL.STACK", 10) == 0)
-  {
-    offset = 10;
-    type   = TOKEN_CALL_STACK;
-  }
-  else if (sym_size == 4 && strncmp(opcode, "CALL", 4) == 0)
-  {
-    offset = 4;
-    type   = TOKEN_CALL;
-  }
-  else if (sym_size == 3 && strncmp(opcode, "RET", 3) == 0)
-  {
-    offset = 3;
-    type   = TOKEN_RET;
-  }
-  else if (sym_size == 6 && strncmp(opcode, "GLOBAL", 6) == 0)
-  {
-    offset = 6;
-    type   = TOKEN_GLOBAL;
-  }
-  else
-    is_opcode = false;
-
-  if (!is_opcode)
-  {
-    // Just a symbol, so no further manipulation
-    char *sym = malloc(sym_size + 1);
-    memcpy(sym, opcode, sym_size);
-    sym[sym_size] = '\0';
-    ret           = (token_t){.type     = TOKEN_SYMBOL,
-                              .str      = sym,
-                              .column   = *column,
-                              .str_size = sym_size};
-  }
-  else
-  {
-    ret.type   = type;
-    ret.column = *column;
-    if (offset == sym_size)
-    {
-      // There's no more to the string
-      ret.str    = malloc(1);
-      ret.str[0] = '\0';
-    }
-    else
-    {
-      // t.str is the remaining part of the string after the
-      // opcode
-      ret.str = calloc(sym_size - offset + 1, 1);
-      memcpy(ret.str, opcode + offset, sym_size - offset);
-      ret.str[sym_size - offset] = '\0';
-    }
-    ret.str_size = sym_size - offset;
-  }
-  *column += sym_size - 1;
-  buffer->used += sym_size;
-  *token = ret;
-  return LERR_OK;
-}
-
-token_t tokenise_number_literal(buffer_t *buffer, size_t *column)
-{
-  token_t token = {
-      .type = TOKEN_LITERAL_NUMBER, .str_size = 0, .column = *column};
-  if (buffer->data[buffer->used] == '-')
-    ++token.str_size;
-  for (; token.str_size < space_left(buffer) &&
-         isdigit(buffer->data[buffer->used + token.str_size]);
-       ++token.str_size)
-    continue;
-  token.str = calloc(token.str_size + 1, 1);
-  memcpy(token.str, buffer->data + buffer->used, token.str_size);
-  token.str[token.str_size] = '\0';
-  buffer->used += token.str_size;
-  *column += token.str_size;
-  return token;
-}
-
-token_t tokenise_hex_literal(buffer_t *buffer, size_t *column)
-{
-  // For the x part of the literal
-  ++buffer->used;
-  token_t token = {
-      .type = TOKEN_LITERAL_NUMBER, .str_size = 0, .column = *column};
-  for (; token.str_size < space_left(buffer) &&
-         is_valid_hex_char(buffer->data[buffer->used + token.str_size]);
-       ++token.str_size)
-    continue;
-  // Setup a proper C hex literal
-  token.str    = calloc(token.str_size + 3, 1);
-  token.str[0] = '0';
-  token.str[1] = 'x';
-  memcpy(token.str + 2, buffer->data + buffer->used, token.str_size);
-  token.str[token.str_size + 2] = '\0';
-  buffer->used += token.str_size;
-  *column += token.str_size;
-
-  // Setup the first two characters
-  token.str_size += 2;
-  return token;
-}
-
-token_t tokenise_char_literal(buffer_t *buffer, size_t *column)
-{
-  token_t token = {
-      .type = TOKEN_LITERAL_CHAR, .str_size = 1, .column = *column};
-  token.str    = calloc(2, 1);
-  token.str[0] = buffer->data[buffer->used + 1];
-  token.str[1] = '\0';
-  buffer->used += 3;
-  *column += 3;
-  return token;
-}
-
-token_t tokenise_string_literal(buffer_t *buffer, size_t *column)
-{
-  ++buffer->used;
-  size_t string_size;
-  for (string_size = 0; string_size + buffer->used < buffer->available &&
-                        buffer->data[buffer->used + string_size] != '\"';
-       ++string_size)
-    continue;
-  token_t t = {.type     = TOKEN_LITERAL_STRING,
-               .column   = *column,
-               .str      = malloc(string_size + 1),
-               .str_size = string_size};
-  memcpy(t.str, buffer->data + buffer->used, string_size);
-  t.str[string_size] = '\0';
-  *column += string_size + 1;
-  buffer->used += string_size + 1;
-  return t;
-}
-
-lerr_t tokenise_buffer(buffer_t *buffer, token_stream_t *tokens_ptr)
-{
-  size_t column = 0, line = 1;
-  token_stream_t tokens = {0};
-  darr_init(&tokens, sizeof(token_t));
-  while (space_left(buffer) != 0)
-  {
-    bool is_token = true;
-    token_t t     = {0};
-    char c        = buffer->data[buffer->used];
-    if (isspace(c) || c == '\0')
-    {
-      // Clean whitespace
-      for (; space_left(buffer) > 0 && (isspace(c) || c == '\0');
-           ++buffer->used, c = buffer->data[buffer->used])
-      {
-        ++column;
-        if (c == '\n')
-        {
-          column = 0;
-          ++line;
-        }
-      }
-      ++column;
-      is_token = false;
-    }
-    else if (c == ';')
-    {
-      // Start lexing at next line
-      for (; space_left(buffer) > 0 && c != '\n';
-           ++buffer->used, c = buffer->data[buffer->used])
-        continue;
-      column = 0;
-      ++line;
-      ++buffer->used;
-      is_token = false;
-    }
-    else if (c == '*')
-    {
-      t        = (token_t){.type     = TOKEN_STAR,
-                           .column   = column,
-                           .str      = malloc(1),
-                           .str_size = 1};
-      t.str[0] = '\0';
-      ++buffer->used;
-    }
-    else if (c == '\"')
-      t = tokenise_string_literal(buffer, &column);
-    else if (isdigit(c) || (space_left(buffer) > 1 && c == '-' &&
-                            isdigit(buffer->data[buffer->used + 1])))
-      t = tokenise_number_literal(buffer, &column);
-    else if (c == 'x' && space_left(buffer) > 1 &&
-             is_valid_hex_char(buffer->data[buffer->used + 1]))
-      t = tokenise_hex_literal(buffer, &column);
-    else if (is_symbol(c))
-    {
-      lerr_t lerr = tokenise_symbol(buffer, &column, &t);
-      if (lerr)
-      {
-        free(tokens.data);
-        return lerr;
-      }
-    }
-    else if (c == '\'')
-    {
-      if (space_left(buffer) < 2)
-      {
-        free(tokens.data);
-        return LERR_INVALID_CHAR_LITERAL;
-      }
-      else if (buffer->data[buffer->used + 1] == '\\')
-      {
-        char escape = '\0';
-        if (space_left(buffer) < 3 || buffer->data[buffer->used + 3] != '\'')
-        {
-          free(tokens.data);
-          return LERR_INVALID_CHAR_LITERAL;
-        }
-        switch (buffer->data[buffer->used + 2])
-        {
-        case 'n':
-          escape = '\n';
-          break;
-        case 't':
-          escape = '\t';
-          break;
-        case 'r':
-          escape = '\r';
-          break;
-        case '\\':
-          escape = '\\';
-          break;
-        default:
-          column += 2;
-          free(tokens.data);
-          return LERR_INVALID_CHAR_LITERAL;
-          break;
-        }
-
-        t = (token_t){.type     = TOKEN_LITERAL_CHAR,
-                      .str      = malloc(2),
-                      .str_size = 1,
-                      .column   = column};
-        column += 2;
-        buffer->used += 4;
-        t.str[0] = escape;
-        t.str[1] = '\0';
-      }
-      else
-        t = tokenise_char_literal(buffer, &column);
-    }
-
-    if (is_token)
-    {
-      t.line = line;
-      darr_append_bytes(&tokens, (byte *)&t, sizeof(t));
-    }
-  }
-  tokens.available = tokens.used / sizeof(token_t);
-  tokens.used      = 0;
-  *tokens_ptr      = tokens;
-  return LERR_OK;
-}