From 4990d93a1c333c032e5ee0f1fc4aa3a02a7d41fc Mon Sep 17 00:00:00 2001
From: Aryadev Chavali <aryadev@aryadevchavali.com>
Date: Thu, 2 Nov 2023 20:31:55 +0000
Subject: Created a preprocessing unit presult_t and a function to process them

Essentially a presult_t contains one of these:

1) A label construction, which stores the label symbol into
`label` (PRES_LABEL)

2) An instruction that calls upon a label, storing the instruction
in `instruction` and the label name in `label` (PRES_LABEL_ADDRESS)

3) An instruction that uses a relative address offset, storing the
instruction in `instruction` and the requested offset in
`relative_address` (PRES_RELATIVE_ADDRESS)

4) An instruction that requires no further processing, storing the
instruction in `instruction` (PRES_COMPLETE_RESULT)

In the processing stage, we resolve all label and relative-address
references by iterating over the presults one by one while maintaining
an absolute instruction address.  Pretty nice, lots more machinery
involved in parsing now.
---
 asm/parser.c | 376 ++++++++++++++++++++++++++++++++++++++++++++++++-----------
 asm/parser.h |  19 ++-
 2 files changed, 323 insertions(+), 72 deletions(-)

(limited to 'asm')

diff --git a/asm/parser.c b/asm/parser.c
index 08e067c..f9eb975 100644
--- a/asm/parser.c
+++ b/asm/parser.c
@@ -37,6 +37,10 @@ const char *perr_as_cstr(perr_t perr)
     return "EXPECTED_SYMBOL";
   case PERR_EXPECTED_OPERAND:
     return "EXPECTED_OPERAND";
+  case PERR_UNKNOWN_LABEL:
+    return "UNKNOWN_LABEL";
+  case PERR_INVALID_RELATIVE_ADDRESS:
+    return "INVALID_RELATIVE_ADDRESS";
   case PERR_UNKNOWN_OPERATOR:
     return "UNKNOWN_OPERATOR";
   default:
@@ -88,6 +92,60 @@ perr_t parse_word(token_t token, word *ret)
     return PERR_NOT_A_NUMBER;
 }
 
+perr_t parse_sword(token_t token, i64 *ret)
+{
+  if (token.type == TOKEN_LITERAL_NUMBER)
+  {
+    char *end = NULL;
+    s_word i  = strtoll(token.str, &end, 0);
+    if (!(end && end[0] == '\0'))
+      return PERR_NOT_A_NUMBER;
+    else if (errno == ERANGE)
+    {
+      errno = 0;
+      return PERR_INTEGER_OVERFLOW;
+    }
+    *ret = i;
+    return PERR_OK;
+  }
+  else if (token.type == TOKEN_LITERAL_CHAR)
+  {
+    *ret = token.str[0];
+    return PERR_OK;
+  }
+  else
+    return PERR_NOT_A_NUMBER;
+}
+
+perr_t parse_word_label_or_relative(token_stream_t *stream, presult_t *res)
+{
+  token_t token = TOKEN_STREAM_AT(stream->data, stream->used);
+  if (token.type == TOKEN_SYMBOL)
+  {
+    res->type  = PRES_LABEL_ADDRESS;
+    res->label = calloc(token.str_size + 1, 1);
+    memcpy(res->label, token.str, token.str_size);
+    res->label[token.str_size] = '\0';
+    return PERR_OK;
+  }
+  else if (token.type == TOKEN_LITERAL_CHAR ||
+           token.type == TOKEN_LITERAL_NUMBER)
+  {
+    res->type = PRES_COMPLETE_RESULT;
+    return parse_word(token, &res->instruction.operand.as_word);
+  }
+  else if (token.type == TOKEN_STAR)
+  {
+    if (stream->used + 1 >= stream->available)
+      return PERR_EXPECTED_OPERAND;
+    res->type = PRES_RELATIVE_ADDRESS;
+    ++stream->used;
+    return parse_sword(TOKEN_STREAM_AT(stream->data, stream->used),
+                       &res->relative_address);
+  }
+  return PERR_EXPECTED_OPERAND;
+}
+
 enum Type
 {
   T_NIL = -1,
@@ -179,6 +237,18 @@ perr_t parse_utype_inst_with_operand(token_stream_t *stream, inst_t *ret)
   return PERR_OK;
 }
 
+perr_t parse_jump_inst_operand(token_stream_t *stream, presult_t *res)
+{
+  perr_t inst_err = parse_utype_inst(stream, &res->instruction);
+  if (inst_err)
+    return inst_err;
+  ++stream->used;
+  perr_t op_err = parse_word_label_or_relative(stream, res);
+  if (op_err)
+    return op_err;
+  return PERR_OK;
+}
+
 perr_t parse_type_inst_with_operand(token_stream_t *stream, inst_t *ret)
 {
   perr_t inst_err = parse_type_inst(stream, ret);
@@ -192,151 +262,315 @@ perr_t parse_type_inst_with_operand(token_stream_t *stream, inst_t *ret)
   return PERR_OK;
 }
 
-perr_t parse_next_inst(token_stream_t *stream, inst_t *ret)
+perr_t parse_next(token_stream_t *stream, presult_t *ret)
 {
   const token_t token = TOKEN_STREAM_AT(stream->data, stream->used);
+  perr_t perr         = PERR_OK;
   switch (token.type)
   {
   case TOKEN_LITERAL_NUMBER:
   case TOKEN_LITERAL_CHAR:
     return PERR_EXPECTED_SYMBOL;
   case TOKEN_NOOP:
-    *ret = INST_NOOP;
+    *ret = (presult_t){.instruction = INST_NOOP, .type = PRES_COMPLETE_RESULT};
     break;
   case TOKEN_HALT:
-    *ret = INST_HALT;
+    *ret = (presult_t){.instruction = INST_HALT, .type = PRES_COMPLETE_RESULT};
     break;
   case TOKEN_PUSH:
-    ret->opcode = OP_PUSH_BYTE;
-    return parse_utype_inst_with_operand(stream, ret);
+    *ret = (presult_t){.instruction = INST_PUSH(BYTE, 0),
+                       .type        = PRES_COMPLETE_RESULT};
+    perr = parse_utype_inst_with_operand(stream, &ret->instruction);
+    break;
   case TOKEN_POP:
-    ret->opcode = OP_POP_BYTE;
-    return parse_utype_inst(stream, ret);
+    *ret = (presult_t){.instruction = INST_POP(BYTE),
+                       .type        = PRES_COMPLETE_RESULT};
+    perr = parse_utype_inst(stream, &ret->instruction);
+    break;
   case TOKEN_PUSH_REG:
-    ret->opcode = OP_PUSH_REGISTER_BYTE;
-    return parse_utype_inst_with_operand(stream, ret);
+    *ret = (presult_t){.instruction = INST_PUSH_REG(BYTE, 0),
+                       .type        = PRES_COMPLETE_RESULT};
+    perr = parse_utype_inst_with_operand(stream, &ret->instruction);
+    break;
   case TOKEN_MOV:
-    ret->opcode = OP_MOV_BYTE;
-    return parse_utype_inst_with_operand(stream, ret);
+    *ret = (presult_t){.instruction = INST_MOV(BYTE, 0),
+                       .type        = PRES_COMPLETE_RESULT};
+    perr = parse_utype_inst_with_operand(stream, &ret->instruction);
+    break;
   case TOKEN_DUP:
-    ret->opcode = OP_DUP_BYTE;
-    return parse_utype_inst_with_operand(stream, ret);
+    *ret = (presult_t){.instruction = INST_DUP(BYTE, 0),
+                       .type        = PRES_COMPLETE_RESULT};
+    perr = parse_utype_inst_with_operand(stream, &ret->instruction);
+    break;
   case TOKEN_MALLOC:
-    ret->opcode = OP_MALLOC_BYTE;
-    return parse_utype_inst_with_operand(stream, ret);
+    *ret = (presult_t){.instruction = INST_MALLOC(BYTE, 0),
+                       .type        = PRES_COMPLETE_RESULT};
+    perr = parse_utype_inst_with_operand(stream, &ret->instruction);
+    break;
   case TOKEN_MSET:
-    ret->opcode = OP_MSET_BYTE;
-    return parse_utype_inst_with_operand(stream, ret);
+    *ret = (presult_t){.instruction = INST_MSET(BYTE, 0),
+                       .type        = PRES_COMPLETE_RESULT};
+    perr = parse_utype_inst_with_operand(stream, &ret->instruction);
+    break;
   case TOKEN_MGET:
-    ret->opcode = OP_MGET_BYTE;
-    return parse_utype_inst_with_operand(stream, ret);
+    *ret = (presult_t){.instruction = INST_MGET(BYTE, 0),
+                       .type        = PRES_COMPLETE_RESULT};
+    perr = parse_utype_inst_with_operand(stream, &ret->instruction);
+    break;
   case TOKEN_MALLOC_STACK:
-    ret->opcode = OP_MALLOC_STACK_BYTE;
-    return parse_utype_inst(stream, ret);
+    *ret = (presult_t){.instruction = INST_MALLOC_STACK(BYTE),
+                       .type        = PRES_COMPLETE_RESULT};
+    perr = parse_utype_inst(stream, &ret->instruction);
+    break;
   case TOKEN_MSET_STACK:
-    ret->opcode = OP_MSET_STACK_BYTE;
-    return parse_utype_inst(stream, ret);
+    *ret = (presult_t){.instruction = INST_MSET_STACK(BYTE),
+                       .type        = PRES_COMPLETE_RESULT};
+    perr = parse_utype_inst(stream, &ret->instruction);
+    break;
   case TOKEN_MGET_STACK:
-    ret->opcode = OP_MGET_STACK_BYTE;
-    return parse_utype_inst(stream, ret);
+    *ret = (presult_t){.instruction = INST_MGET_STACK(BYTE),
+                       .type        = PRES_COMPLETE_RESULT};
+    perr = parse_utype_inst(stream, &ret->instruction);
+    break;
   case TOKEN_MDELETE:
-    ret->opcode = OP_MDELETE;
+    *ret =
+        (presult_t){.instruction = INST_MDELETE, .type = PRES_COMPLETE_RESULT};
     break;
   case TOKEN_MSIZE:
-    ret->opcode = OP_MSIZE;
+    *ret = (presult_t){.instruction = INST_MSIZE, .type = PRES_COMPLETE_RESULT};
     break;
   case TOKEN_NOT:
-    ret->opcode = OP_NOT_BYTE;
-    return parse_utype_inst(stream, ret);
+    *ret = (presult_t){.instruction = INST_NOT(BYTE),
+                       .type        = PRES_COMPLETE_RESULT};
+    perr = parse_utype_inst(stream, &ret->instruction);
+    break;
   case TOKEN_OR:
-    ret->opcode = OP_OR_BYTE;
-    return parse_utype_inst(stream, ret);
+    *ret =
+        (presult_t){.instruction = INST_OR(BYTE), .type = PRES_COMPLETE_RESULT};
+    perr = parse_utype_inst(stream, &ret->instruction);
+    break;
   case TOKEN_AND:
-    ret->opcode = OP_AND_BYTE;
-    return parse_utype_inst(stream, ret);
+    *ret = (presult_t){.instruction = INST_AND(BYTE),
+                       .type        = PRES_COMPLETE_RESULT};
+    perr = parse_utype_inst(stream, &ret->instruction);
+    break;
   case TOKEN_XOR:
-    ret->opcode = OP_XOR_BYTE;
-    return parse_utype_inst(stream, ret);
+    *ret = (presult_t){.instruction = INST_XOR(BYTE),
+                       .type        = PRES_COMPLETE_RESULT};
+    perr = parse_utype_inst(stream, &ret->instruction);
+    break;
   case TOKEN_EQ:
-    ret->opcode = OP_EQ_BYTE;
-    return parse_utype_inst(stream, ret);
+    *ret =
+        (presult_t){.instruction = INST_EQ(BYTE), .type = PRES_COMPLETE_RESULT};
+    perr = parse_utype_inst(stream, &ret->instruction);
+    break;
   case TOKEN_LT:
-    ret->opcode = OP_LT_BYTE;
-    return parse_type_inst(stream, ret);
+    *ret =
+        (presult_t){.instruction = INST_LT(BYTE), .type = PRES_COMPLETE_RESULT};
+    perr = parse_type_inst(stream, &ret->instruction);
+    break;
   case TOKEN_LTE:
-    ret->opcode = OP_LTE_BYTE;
-    return parse_type_inst(stream, ret);
+    *ret = (presult_t){.instruction = INST_LTE(BYTE),
+                       .type        = PRES_COMPLETE_RESULT};
+    perr = parse_type_inst(stream, &ret->instruction);
+    break;
   case TOKEN_GT:
-    ret->opcode = OP_GT_BYTE;
-    return parse_type_inst(stream, ret);
+    *ret =
+        (presult_t){.instruction = INST_GT(BYTE), .type = PRES_COMPLETE_RESULT};
+    perr = parse_type_inst(stream, &ret->instruction);
+    break;
   case TOKEN_GTE:
-    ret->opcode = OP_GTE_BYTE;
-    return parse_type_inst(stream, ret);
+    *ret = (presult_t){.instruction = INST_GTE(BYTE),
+                       .type        = PRES_COMPLETE_RESULT};
+    perr = parse_type_inst(stream, &ret->instruction);
+    break;
   case TOKEN_PLUS:
-    ret->opcode = OP_PLUS_BYTE;
-    return parse_utype_inst(stream, ret);
+    *ret = (presult_t){.instruction = INST_PLUS(BYTE),
+                       .type        = PRES_COMPLETE_RESULT};
+    perr = parse_utype_inst(stream, &ret->instruction);
+    break;
   case TOKEN_SUB:
-    ret->opcode = OP_SUB_BYTE;
-    return parse_utype_inst(stream, ret);
+    *ret = (presult_t){.instruction = INST_SUB(BYTE),
+                       .type        = PRES_COMPLETE_RESULT};
+    perr = parse_utype_inst(stream, &ret->instruction);
+    break;
   case TOKEN_MULT:
-    ret->opcode = OP_MULT_BYTE;
-    return parse_utype_inst(stream, ret);
+    *ret = (presult_t){.instruction = INST_MULT(BYTE),
+                       .type        = PRES_COMPLETE_RESULT};
+    perr = parse_utype_inst(stream, &ret->instruction);
+    break;
   case TOKEN_PRINT:
-    ret->opcode = OP_PRINT_BYTE;
-    return parse_type_inst(stream, ret);
+    *ret = (presult_t){.instruction = INST_PRINT(BYTE),
+                       .type        = PRES_COMPLETE_RESULT};
+    perr = parse_type_inst(stream, &ret->instruction);
+    break;
   case TOKEN_JUMP: {
     if (token.str_size == 4 && strncmp(token.str, ".ABS", 4) == 0)
     {
-      ret->opcode = OP_JUMP_ABS;
+      *ret = (presult_t){.instruction = INST_JUMP_ABS(0)};
       ++stream->used;
       if (stream->used >= stream->available)
         return PERR_EXPECTED_OPERAND;
-      return parse_word(TOKEN_STREAM_AT(stream->data, stream->used),
-                        &ret->operand.as_word);
+      return parse_word_label_or_relative(stream, ret);
     }
     else if (token.str_size == 9 && strncmp(token.str, ".REGISTER", 9) == 0)
     {
-      ret->opcode = OP_JUMP_REGISTER;
+      *ret = (presult_t){.instruction = INST_JUMP_REGISTER(0),
+                         .type        = PRES_COMPLETE_RESULT};
       ++stream->used;
       if (stream->used >= stream->available)
         return PERR_EXPECTED_OPERAND;
       return parse_word(TOKEN_STREAM_AT(stream->data, stream->used),
-                        &ret->operand.as_word);
+                        &ret->instruction.operand.as_word);
     }
     else if (token.str_size == 6 && strncmp(token.str, ".STACK", 6) == 0)
-      ret->opcode = OP_JUMP_STACK;
+      *ret = (presult_t){.instruction = INST_JUMP_STACK,
+                         .type        = PRES_COMPLETE_RESULT};
     else
       return PERR_UNKNOWN_OPERATOR;
     break;
   }
   case TOKEN_JUMP_IF: {
-    ret->opcode = OP_JUMP_IF_BYTE;
-    return parse_utype_inst_with_operand(stream, ret);
+    *ret = (presult_t){.instruction = INST_JUMP_IF(BYTE, 0)};
+    return parse_jump_inst_operand(stream, ret);
   }
-  case TOKEN_SYMBOL:
+  case TOKEN_SYMBOL: {
+    size_t label_size = strcspn(token.str, ":");
+    if (label_size == strlen(token.str))
+      return PERR_UNKNOWN_OPERATOR;
+    *ret       = (presult_t){.type = PRES_LABEL};
+    ret->label = calloc(label_size + 1, 1);
+    memcpy(ret->label, token.str, label_size);
+    ret->label[label_size] = '\0';
+    break;
+  }
+  case TOKEN_STAR:
   default:
     return PERR_UNKNOWN_OPERATOR;
   }
+  return perr;
+}
+
+struct LabelPair
+{
+  char *label;
+  size_t label_size;
+  word addr;
+};
+
+perr_t process_presults(presult_t *results, size_t res_count,
+                        inst_t **instructions, size_t *inst_count)
+{
+  darr_t label_pairs = {0};
+  darr_init(&label_pairs, sizeof(struct LabelPair));
+  *inst_count = 0;
+  for (size_t i = 0; i < res_count; ++i)
+  {
+    presult_t res = results[i];
+    switch (res.type)
+    {
+    case PRES_LABEL: {
+      struct LabelPair pair = {0};
+      pair.label            = res.label;
+      pair.addr             = (*inst_count);
+      pair.label_size       = strlen(res.label);
+      darr_append_bytes(&label_pairs, (byte *)&pair, sizeof(pair));
+      break;
+    }
+    case PRES_RELATIVE_ADDRESS: {
+      s_word offset = res.relative_address;
+      if (offset < 0 && ((word)(-offset)) > *inst_count)
+      {
+        free(label_pairs.data);
+        return PERR_INVALID_RELATIVE_ADDRESS;
+      }
+      results[i].instruction.operand.as_word = ((s_word)*inst_count) + offset;
+      (*inst_count)++;
+      break;
+    }
+    case PRES_LABEL_ADDRESS:
+    case PRES_COMPLETE_RESULT:
+    default: {
+      (*inst_count)++;
+      break;
+    }
+    }
+  }
+
+  darr_t instr_darr = {0};
+  darr_init(&instr_darr, sizeof(**instructions));
+  for (size_t i = 0; i < res_count; ++i)
+  {
+    presult_t res = results[i];
+    switch (res.type)
+    {
+    case PRES_LABEL_ADDRESS: {
+      inst_t inst = {0};
+      for (size_t j = 0; j < (label_pairs.used / sizeof(struct LabelPair)); ++j)
+      {
+        struct LabelPair pair = ((struct LabelPair *)label_pairs.data)[j];
+        if (pair.label_size == strlen(res.label) &&
+            strncmp(pair.label, res.label, pair.label_size) == 0)
+        {
+          inst         = res.instruction;
+          inst.operand = DWORD(pair.addr);
+        }
+      }
+
+      if (inst.opcode == OP_NOOP)
+      {
+        free(instr_darr.data);
+        free(label_pairs.data);
+        return PERR_UNKNOWN_LABEL;
+      }
+      darr_append_bytes(&instr_darr, (byte *)&inst, sizeof(inst));
+      break;
+    }
+    case PRES_RELATIVE_ADDRESS:
+    case PRES_COMPLETE_RESULT:
+      darr_append_bytes(&instr_darr, (byte *)&res.instruction,
+                        sizeof(res.instruction));
+    case PRES_LABEL:
+      break;
+    }
+  }
+
+  free(label_pairs.data);
+  *instructions = (inst_t *)instr_darr.data;
   return PERR_OK;
 }
 
 perr_t parse_stream(token_stream_t *stream, inst_t **ret, size_t *size)
 {
-  darr_t instructions = {0};
-  darr_init(&instructions, sizeof(inst_t));
+  darr_t presults = {0};
+  darr_init(&presults, sizeof(presult_t));
   while (stream->used < stream->available)
   {
-    inst_t inst = INST_NOOP;
-    perr_t err  = parse_next_inst(stream, &inst);
+    presult_t pres = {0};
+    perr_t err     = parse_next(stream, &pres);
     if (err)
     {
-      free(instructions.data);
+      for (size_t i = 0; i < (presults.used / sizeof(presult_t)); ++i)
+      {
+        presult_t res = ((presult_t *)presults.data)[i];
+        if (res.type == PRES_LABEL_ADDRESS || res.type == PRES_LABEL)
+          free(res.label);
+      }
+      free(presults.data);
       return err;
     }
-    darr_append_bytes(&instructions, (byte *)&inst, sizeof(inst_t));
+    darr_append_bytes(&presults, (byte *)&pres, sizeof(presult_t));
     ++stream->used;
   }
-  *size = instructions.used / sizeof(inst_t);
-  *ret  = (inst_t *)instructions.data;
-  return PERR_OK;
+
+  perr_t perr = process_presults((presult_t *)presults.data,
+                                 presults.used / sizeof(presult_t), ret, size);
+  for (size_t i = 0; i < (presults.used / sizeof(presult_t)); ++i)
+  {
+    presult_t res = ((presult_t *)presults.data)[i];
+    if (res.type == PRES_LABEL_ADDRESS || res.type == PRES_LABEL)
+      free(res.label);
+  }
+  free(presults.data);
+  return perr;
 }
diff --git a/asm/parser.h b/asm/parser.h
index ee12b40..0a65310 100644
--- a/asm/parser.h
+++ b/asm/parser.h
@@ -27,11 +27,28 @@ typedef enum
   PERR_EXPECTED_SYMBOL,
   PERR_EXPECTED_OPERAND,
   PERR_UNKNOWN_OPERATOR,
+  PERR_INVALID_RELATIVE_ADDRESS,
+  PERR_UNKNOWN_LABEL,
 } perr_t;
 
 const char *perr_as_cstr(perr_t);
 
-perr_t parse_next_inst(token_stream_t *, inst_t *);
+typedef struct
+{
+  inst_t instruction;
+  char *label;
+  s_word relative_address;
+  enum PResult_Type
+  {
+    PRES_LABEL = 0,
+    PRES_LABEL_ADDRESS,
+    PRES_RELATIVE_ADDRESS,
+    PRES_COMPLETE_RESULT,
+  } type;
+} presult_t;
+
+perr_t parse_next(token_stream_t *, presult_t *);
+perr_t process_presults(presult_t *, size_t, inst_t **, size_t *);
 perr_t parse_stream(token_stream_t *, inst_t **, size_t *);
 
 #endif
-- 
cgit v1.2.3-13-gbd6f