Parser now uses updated lexer

Much simpler: the parser now uses a switch statement over token types, which is a much faster way of doing the parsing. Though roughly equivalent in terms of LOC, I feel that this is more extensible.
author: Aryadev Chavali <aryadev@aryadevchavali.com> 2023-11-01 15:09:56 +0000
committer: Aryadev Chavali <aryadev@aryadevchavali.com> 2023-11-01 15:09:56 +0000
commit: 6a270eda1e444a68d5db9102b11af9153fa58a1c (patch)
tree: 6ed880ac17b12c94b3bfa3a656acf3126c80c0eb /asm
parent: 93d234cd48404867b1d80a727d17a4b4a0726e1b (diff)
download: ovm-6a270eda1e444a68d5db9102b11af9153fa58a1c.tar.gz
ovm-6a270eda1e444a68d5db9102b11af9153fa58a1c.tar.bz2
ovm-6a270eda1e444a68d5db9102b11af9153fa58a1c.zip
2 files changed, 164 insertions, 163 deletions
diff --git a/asm/parser.c b/asm/parser.c
index 62f36d8..31cf36c 100644
--- a/asm/parser.c
+++ b/asm/parser.c
@@ -31,6 +31,8 @@ const char *perr_as_cstr(perr_t perr)
     return "NOT_A_NUMBER";
   case PERR_EXPECTED_TYPE:
     return "EXPECTED_TYPE";
+  case PERR_EXPECTED_UTYPE:
+    return "EXPECTED_UTYPE";
   case PERR_EXPECTED_SYMBOL:
     return "EXPECTED_SYMBOL";
   case PERR_EXPECTED_OPERAND:
@@ -42,34 +44,6 @@ const char *perr_as_cstr(perr_t perr)
   }
 }
 
-opcode_t get_typed_opcode(opcode_t base_code, data_type_t type)
-{
-  switch (type)
-  {
-  case DATA_TYPE_BYTE:
-    return base_code;
-  case DATA_TYPE_HWORD:
-    return base_code + 1;
-  case DATA_TYPE_WORD:
-    return base_code + 2;
-  case DATA_TYPE_NIL:
-  default:
-    return 0;
-  }
-}
-
-data_type_t parse_data_type(const char *cstr, size_t length)
-{
-  if (length >= 4 && strncmp(cstr, "BYTE", 4) == 0)
-    return DATA_TYPE_BYTE;
-  else if (length >= 5 && strncmp(cstr, "HWORD", 5) == 0)
-    return DATA_TYPE_HWORD;
-  else if (length >= 4 && strncmp(cstr, "WORD", 4) == 0)
-    return DATA_TYPE_WORD;
-  else
-    return DATA_TYPE_NIL;
-}
-
 perr_t parse_word(token_t token, word *ret)
 {
   if (token.type == TOKEN_LITERAL_NUMBER)
@@ -114,181 +88,206 @@ perr_t parse_word(token_t token, word *ret)
     return PERR_NOT_A_NUMBER;
 }
 
-perr_t parse_inst_with_type(token_stream_t *stream, inst_t *ret,
-                            size_t oplength)
+enum Type
+{
+  T_NIL = -1,
+  T_BYTE,
+  T_CHAR,
+  T_HWORD,
+  T_INT,
+  T_LONG,
+  T_WORD,
+} parse_details_to_type(token_t details)
+{
+  if (details.str_size == 5 && strncmp(details.str, ".BYTE", 5) == 0)
+    return T_BYTE;
+  else if (details.str_size == 5 && strncmp(details.str, ".CHAR", 5) == 0)
+    return T_CHAR;
+  else if (details.str_size == 6 && strncmp(details.str, ".HWORD", 6) == 0)
+    return T_HWORD;
+  else if (details.str_size == 4 && strncmp(details.str, ".INT", 4) == 0)
+    return T_INT;
+  else if (details.str_size == 5 && strncmp(details.str, ".LONG", 5) == 0)
+    return T_LONG;
+  else if (details.str_size == 5 && strncmp(details.str, ".WORD", 5) == 0)
+    return T_WORD;
+  else
+    return T_NIL;
+}
+
+enum UType
+{
+  U_NIL = -1,
+  U_BYTE,
+  U_HWORD,
+  U_WORD,
+} convert_type_to_utype(enum Type type)
+{
+  if (type == T_CHAR || type == T_INT || type == T_LONG)
+    return U_NIL;
+  switch (type)
+  {
+  case T_NIL:
+  case T_LONG:
+  case T_INT:
+  case T_CHAR:
+    return U_NIL;
+  case T_BYTE:
+    return U_BYTE;
+  case T_HWORD:
+    return U_HWORD;
+  case T_WORD:
+    return U_WORD;
+  }
+  return 0;
+}
+
+perr_t parse_utype_inst(token_stream_t *stream, inst_t *ret)
+{
+  if (stream->used + 1 > stream->available)
+    return PERR_EXPECTED_OPERAND;
+  enum UType type = convert_type_to_utype(
+      parse_details_to_type(TOKEN_STREAM_AT(stream->data, stream->used)));
+  if (type == U_NIL)
+    return PERR_EXPECTED_UTYPE;
+  ret->opcode += type;
+  return PERR_OK;
+}
+
+perr_t parse_type_inst(token_stream_t *stream, inst_t *ret)
 {
-  // Assume the base type OP_*_BYTE is in ret->opcode
-  token_t token    = TOKEN_STREAM_AT(stream->data, stream->used);
-  char *opcode     = token.str;
-  data_type_t type = parse_data_type(opcode + oplength,
-                                     WORD_SAFE_SUB(token.str_size, oplength));
-  if (type == DATA_TYPE_NIL)
+  if (stream->used + 1 > stream->available)
+    return PERR_EXPECTED_OPERAND;
+  enum Type type =
+      parse_details_to_type(TOKEN_STREAM_AT(stream->data, stream->used));
+  if (type == T_NIL)
     return PERR_EXPECTED_TYPE;
-  ++stream->used;
-  ret->opcode = get_typed_opcode(ret->opcode, type);
+  ret->opcode += type;
   return PERR_OK;
 }
 
-perr_t parse_inst_with_operand(token_stream_t *stream, inst_t *ret)
+perr_t parse_utype_inst_with_operand(token_stream_t *stream, inst_t *ret)
 {
-  // Parse operand
-  perr_t word_parse_error = parse_word(
-      TOKEN_STREAM_AT(stream->data, stream->used), &ret->operand.as_word);
-  if (word_parse_error)
-    return word_parse_error;
+  perr_t inst_err = parse_utype_inst(stream, ret);
+  if (inst_err)
+    return inst_err;
   ++stream->used;
+  perr_t word_err = parse_word(TOKEN_STREAM_AT(stream->data, stream->used),
+                               &ret->operand.as_word);
+  if (word_err)
+    return word_err;
   return PERR_OK;
 }
 
-perr_t parse_inst_with_typed_operand(token_stream_t *stream, inst_t *ret,
-                                     size_t oplength)
+perr_t parse_type_inst_with_operand(token_stream_t *stream, inst_t *ret)
 {
-  perr_t type_parse_error = parse_inst_with_type(stream, ret, oplength);
-  if (type_parse_error)
-    return type_parse_error;
-
-  // Parse operand
-  perr_t word_parse_error = parse_word(
-      TOKEN_STREAM_AT(stream->data, stream->used), &ret->operand.as_word);
-  if (word_parse_error)
-    return word_parse_error;
+  perr_t inst_err = parse_type_inst(stream, ret);
+  if (inst_err)
+    return inst_err;
   ++stream->used;
+  perr_t word_err = parse_word(TOKEN_STREAM_AT(stream->data, stream->used),
+                               &ret->operand.as_word);
+  if (word_err)
+    return word_err;
   return PERR_OK;
 }
 
 perr_t parse_next_inst(token_stream_t *stream, inst_t *ret)
 {
+  static_assert(NUMBER_OF_OPCODES == 70, "parse_next_inst: Out of date!");
   const token_t token = TOKEN_STREAM_AT(stream->data, stream->used);
-  if (token.type != TOKEN_SYMBOL)
-    return PERR_EXPECTED_SYMBOL;
-  inst_t inst  = {0};
-  char *opcode = token.str;
-  if (token.str_size == 4 && strncmp(opcode, "NOOP", 4) == 0)
-  {
-    inst = INST_NOOP;
-    ++stream->used;
-  }
-  else if (token.str_size == 4 && strncmp(opcode, "HALT", 4) == 0)
-  {
-    inst = INST_HALT;
-    ++stream->used;
-  }
-  else if (token.str_size >= 4 && strncmp(opcode, "PUSH", 4) == 0)
-  {
-    size_t oplen = 5;
-    if (token.str_size >= 8 && strncmp(opcode, "PUSH.REG", 8) == 0)
-    {
-      oplen       = 9;
-      ret->opcode = OP_PUSH_REGISTER_BYTE;
-    }
-    else
-      ret->opcode = OP_PUSH_BYTE;
-    return parse_inst_with_typed_operand(stream, ret, oplen);
-  }
-  else if (token.str_size >= 3 && strncmp(opcode, "POP", 3) == 0)
+  switch (token.type)
   {
+  case TOKEN_LITERAL_NUMBER:
+  case TOKEN_LITERAL_CHAR:
+    return PERR_EXPECTED_SYMBOL;
+  case TOKEN_NOOP:
+    *ret = INST_NOOP;
+    break;
+  case TOKEN_HALT:
+    *ret = INST_HALT;
+    break;
+  case TOKEN_PUSH:
+    ret->opcode = OP_PUSH_BYTE;
+    return parse_utype_inst_with_operand(stream, ret);
+  case TOKEN_POP:
     ret->opcode = OP_POP_BYTE;
-    return parse_inst_with_type(stream, ret, 4);
-  }
-  else if (token.str_size >= 3 && strncmp(opcode, "MOV", 3) == 0)
-  {
+    return parse_utype_inst(stream, ret);
+  case TOKEN_PUSH_REG:
+    ret->opcode = OP_PUSH_REGISTER_BYTE;
+    return parse_utype_inst_with_operand(stream, ret);
+  case TOKEN_MOV:
     ret->opcode = OP_MOV_BYTE;
-    return parse_inst_with_typed_operand(stream, ret, 4);
-  }
-  else if (token.str_size >= 3 && strncmp(opcode, "DUP", 3) == 0)
-  {
+    return parse_utype_inst_with_operand(stream, ret);
+  case TOKEN_DUP:
     ret->opcode = OP_DUP_BYTE;
-    return parse_inst_with_typed_operand(stream, ret, 4);
-  }
-  else if (token.str_size >= 3 && strncmp(opcode, "NOT", 3) == 0)
-  {
+    return parse_utype_inst_with_operand(stream, ret);
+  case TOKEN_NOT:
     ret->opcode = OP_NOT_BYTE;
-    return parse_inst_with_type(stream, ret, 4);
-  }
-  else if (token.str_size >= 2 && strncmp(opcode, "OR", 2) == 0)
-  {
+    return parse_utype_inst(stream, ret);
+  case TOKEN_OR:
     ret->opcode = OP_OR_BYTE;
-    return parse_inst_with_type(stream, ret, 3);
-  }
-  else if (token.str_size >= 3 && strncmp(opcode, "AND", 3) == 0)
-  {
+    return parse_utype_inst(stream, ret);
+  case TOKEN_AND:
     ret->opcode = OP_AND_BYTE;
-    return parse_inst_with_type(stream, ret, 4);
-  }
-  else if (token.str_size >= 3 && strncmp(opcode, "XOR", 3) == 0)
-  {
+    return parse_utype_inst(stream, ret);
+  case TOKEN_XOR:
     ret->opcode = OP_XOR_BYTE;
-    return parse_inst_with_type(stream, ret, 4);
-  }
-  else if (token.str_size >= 2 && strncmp(opcode, "EQ", 2) == 0)
-  {
+    return parse_utype_inst(stream, ret);
+  case TOKEN_EQ:
     ret->opcode = OP_EQ_BYTE;
-    return parse_inst_with_type(stream, ret, 3);
-  }
-  else if (token.str_size >= 4 && strncmp(opcode, "PLUS", 4) == 0)
-  {
+    return parse_utype_inst(stream, ret);
+  case TOKEN_LT:
+    ret->opcode = OP_LT_BYTE;
+    return parse_utype_inst(stream, ret);
+  case TOKEN_LTE:
+    ret->opcode = OP_LTE_BYTE;
+    return parse_utype_inst(stream, ret);
+  case TOKEN_GT:
+    ret->opcode = OP_GT_BYTE;
+    return parse_utype_inst(stream, ret);
+  case TOKEN_GTE:
+    ret->opcode = OP_GTE_BYTE;
+    return parse_utype_inst(stream, ret);
+  case TOKEN_PLUS:
     ret->opcode = OP_PLUS_BYTE;
-    return parse_inst_with_type(stream, ret, 5);
-  }
-  else if (token.str_size >= 6 && strncmp(opcode, "PRINT.", 6) == 0)
-  {
-    const char *type       = opcode + 6;
-    const size_t type_size = WORD_SAFE_SUB(token.str_size, 6);
-    if (type_size == 4 && strncmp(type, "CHAR", 4) == 0)
-      inst.opcode = OP_PRINT_CHAR;
-    else if (type_size == 4 && strncmp(type, "BYTE", 4) == 0)
-      inst.opcode = OP_PRINT_BYTE;
-    else if (type_size == 3 && strncmp(type, "INT", 3) == 0)
-      inst.opcode = OP_PRINT_INT;
-    else if (type_size == 5 && strncmp(type, "HWORD", 5) == 0)
-      inst.opcode = OP_PRINT_HWORD;
-    else if (type_size == 4 && strncmp(type, "LONG", 4) == 0)
-      inst.opcode = OP_PRINT_LONG;
-    else if (type_size == 4 && strncmp(type, "WORD", 4) == 0)
-      inst.opcode = OP_PRINT_WORD;
-    else
-      return PERR_UNKNOWN_OPERATOR;
-    ++stream->used;
-  }
-  else if (token.str_size >= 5 && strncmp(opcode, "JUMP.", 5) == 0)
-  {
-    const char *type       = opcode + 5;
-    const size_t type_size = WORD_SAFE_SUB(token.str_size, 5);
-    if (type_size == 3 && strncmp(type, "ABS", 3) == 0)
+    return parse_utype_inst(stream, ret);
+  case TOKEN_PRINT:
+    ret->opcode = OP_PRINT_BYTE;
+    return parse_type_inst(stream, ret);
+  case TOKEN_JUMP: {
+    if (token.str_size == 4 && strncmp(token.str, ".ABS", 4) == 0)
     {
       ret->opcode = OP_JUMP_ABS;
       ++stream->used;
-      return parse_inst_with_operand(stream, ret);
+      if (stream->used >= stream->available)
+        return PERR_EXPECTED_OPERAND;
+      return parse_word(TOKEN_STREAM_AT(stream->data, stream->used),
+                        &ret->operand.as_word);
     }
-    else if (type_size == 5 && strncmp(type, "STACK", 5) == 0)
-      inst.opcode = OP_JUMP_STACK;
-    else if (type_size == 8 && strncmp(type, "REGISTER", 8) == 0)
+    else if (token.str_size == 9 && strncmp(token.str, ".REGISTER", 9) == 0)
     {
       ret->opcode = OP_JUMP_REGISTER;
       ++stream->used;
-      return parse_inst_with_operand(stream, ret);
-    }
-    else if (type_size >= 2 && strncmp(type, "IF", 2) == 0)
-    {
-      // Parse a typed operand JUMP.IF.<TYPE>
-      token_t prev    = TOKEN_STREAM_AT(stream->data, stream->used);
-      size_t prev_ptr = stream->used;
-
-      TOKEN_STREAM_AT(stream->data, stream->used).str      = (char *)type;
-      TOKEN_STREAM_AT(stream->data, stream->used).str_size = type_size;
-      ret->opcode                                          = OP_JUMP_IF_BYTE;
-      perr_t perr = parse_inst_with_typed_operand(stream, ret, 3);
-
-      TOKEN_STREAM_AT(stream->data, prev_ptr) = prev;
-      return perr;
+      if (stream->used >= stream->available)
+        return PERR_EXPECTED_OPERAND;
+      return parse_word(TOKEN_STREAM_AT(stream->data, stream->used),
+                        &ret->operand.as_word);
     }
+    else if (token.str_size == 6 && strncmp(token.str, ".STACK", 6) == 0)
+      ret->opcode = OP_JUMP_STACK;
     else
       return PERR_UNKNOWN_OPERATOR;
-    ++stream->used;
+    break;
   }
-  else
+  case TOKEN_JUMP_IF: {
+    ret->opcode = OP_JUMP_IF_BYTE;
+    return parse_utype_inst_with_operand(stream, ret);
+  }
+  case TOKEN_SYMBOL:
+  default:
     return PERR_UNKNOWN_OPERATOR;
-  *ret = inst;
+  }
   return PERR_OK;
 }
 
@@ -306,6 +305,7 @@ perr_t parse_stream(token_stream_t *stream, inst_t **ret, size_t *size)
       return err;
     }
     darr_append_bytes(&instructions, (byte *)&inst, sizeof(inst_t));
+    ++stream->used;
   }
   *size = instructions.used / sizeof(inst_t);
   *ret  = (inst_t *)instructions.data;
diff --git a/asm/parser.h b/asm/parser.h
index 1921c7e..ee12b40 100644
--- a/asm/parser.h
+++ b/asm/parser.h
@@ -22,6 +22,7 @@ typedef enum
   PERR_OK = 0,
   PERR_INTEGER_OVERFLOW,
   PERR_NOT_A_NUMBER,
+  PERR_EXPECTED_UTYPE,
   PERR_EXPECTED_TYPE,
   PERR_EXPECTED_SYMBOL,
   PERR_EXPECTED_OPERAND,
author	Aryadev Chavali <aryadev@aryadevchavali.com>	2023-11-01 15:09:56 +0000
committer	Aryadev Chavali <aryadev@aryadevchavali.com>	2023-11-01 15:09:56 +0000
commit	6a270eda1e444a68d5db9102b11af9153fa58a1c (patch)
tree	6ed880ac17b12c94b3bfa3a656acf3126c80c0eb /asm
parent	93d234cd48404867b1d80a727d17a4b4a0726e1b (diff)
download	ovm-6a270eda1e444a68d5db9102b11af9153fa58a1c.tar.gz ovm-6a270eda1e444a68d5db9102b11af9153fa58a1c.tar.bz2 ovm-6a270eda1e444a68d5db9102b11af9153fa58a1c.zip