Migrate virtual machine from OVM project and rewrite README

2024-04-16 18:21:05 +06:30
parent 2a1d006a88
commit 38d7c13287
15 changed files with 69 additions and 1801 deletions

.dir-locals.el
View File

@@ -7,8 +7,4 @@
 (c-mode . ((flycheck-gcc-include-path . (".."))
 (flycheck-clang-include-path . (".."))
 (company-clang-arguments . ("-I.."))
-(eval . (clang-format-mode t))))
-(c++-mode . ((flycheck-gcc-include-path . (".."))
-(flycheck-clang-include-path . (".."))
-(company-clang-arguments . ("-I.."))
-(eval . (clang-format-mode t)))))
+(eval . (clang-format-mode t)))))

Makefile
View File

@@ -1,12 +1,10 @@
 CC=gcc
-CPP=g++
 VERBOSE=0
-GENERAL-FLAGS=-Wall -Wextra -Werror -Wswitch-enum -I$(shell pwd)
+GENERAL-FLAGS:=-Wall -Wextra -Werror -Wswitch-enum -I$(shell pwd)
 DEBUG-FLAGS=-ggdb -fsanitize=address
 RELEASE-FLAGS=-O3
 CFLAGS:=$(GENERAL-FLAGS) -std=c11 $(DEBUG-FLAGS) -D VERBOSE=$(VERBOSE)
-CPPFLAGS:=$(GENERAL-FLAGS) -std=c++17 $(DEBUG-FLAGS) -D VERBOSE=$(VERBOSE)
 LIBS=-lm
 DIST=build
@@ -20,102 +18,50 @@ LIB_DIST=$(DIST)/lib
 LIB_SRC=lib
 LIB_CODE:=$(addprefix $(LIB_SRC)/, base.c darr.c heap.c inst.c)
 LIB_OBJECTS:=$(LIB_CODE:$(LIB_SRC)/%.c=$(LIB_DIST)/%.o)
-LIB_CFLAGS=$(CFLAGS)
 ## VM setup
 VM_DIST=$(DIST)/vm
 VM_SRC=vm
 VM_CODE:=$(addprefix $(VM_SRC)/, runtime.c)
 VM_OBJECTS:=$(VM_CODE:$(VM_SRC)/%.c=$(VM_DIST)/%.o)
-VM_CFLAGS:=$(CFLAGS)
-VM_OUT=$(DIST)/ovm.out
+VM_OUT=$(DIST)/avm.out
-## ASSEMBLY setup
-ASM_DIST=$(DIST)/asm
-ASM_SRC=asm
-ASM_CODE:=$(addprefix $(ASM_SRC)/, base.cpp lexer.cpp preprocesser.cpp)
-ASM_OBJECTS:=$(ASM_CODE:$(ASM_SRC)/%.cpp=$(ASM_DIST)/%.o)
-ASM_CFLAGS=$(CPPFLAGS)
-ASM_OUT=$(DIST)/asm.out
-## EXAMPLES setup
-EXAMPLES_DIST=$(DIST)/examples
-EXAMPLES_SRC=examples
-EXAMPLES=$(EXAMPLES_DIST)/instruction-test.out $(EXAMPLES_DIST)/fib.out $(EXAMPLES_DIST)/factorial.out $(EXAMPLES_DIST)/memory-print.out
 ## Dependencies
 DEPDIR:=$(DIST)/dependencies
 DEPFLAGS = -MT $@ -MMD -MP -MF
-DEPS:=$($(LIB_SRC):%.c=$(DEPDIR):%.o) $($(ASM_SRC):%.c=$(DEPDIR):%.o) $($(VM_SRC):%.c=$(DEPDIR):%.o)
+DEPS:=$($(LIB_SRC):%.c=$(DEPDIR):%.o) $($(VM_SRC):%.c=$(DEPDIR):%.o)
 # Things you want to build on `make`
-all: $(DIST) lib vm asm examples
+all: $(DIST) lib vm
 lib: $(LIB_OBJECTS)
 vm: $(VM_OUT)
-asm: $(ASM_OUT)
-examples: $(EXAMPLES)
 # Recipes
-## LIB Recipes
 $(LIB_DIST)/%.o: $(LIB_SRC)/%.c | $(LIB_DIST) $(DEPDIR)/lib
-	@$(CC) $(LIB_CFLAGS) $(DEPFLAGS) $(DEPDIR)/lib/$*.d -c $< -o $@ $(LIBS)
+	@$(CC) $(CFLAGS) $(DEPFLAGS) $(DEPDIR)/lib/$*.d -c $< -o $@ $(LIBS)
 	@echo "$(TERM_YELLOW)$@$(TERM_RESET): $<"
-## VM Recipes
 $(VM_OUT): $(LIB_OBJECTS) $(VM_OBJECTS) $(VM_DIST)/main.o
-	@$(CC) $(VM_CFLAGS) $^ -o $@ $(LIBS)
+	@$(CC) $(CFLAGS) $^ -o $@ $(LIBS)
 	@echo "$(TERM_GREEN)$@$(TERM_RESET): $^"
 $(VM_DIST)/%.o: $(VM_SRC)/%.c | $(VM_DIST) $(DEPDIR)/vm
-	@$(CC) $(VM_CFLAGS) $(DEPFLAGS) $(DEPDIR)/vm/$*.d -c $< -o $@ $(LIBS)
+	@$(CC) $(CFLAGS) $(DEPFLAGS) $(DEPDIR)/vm/$*.d -c $< -o $@ $(LIBS)
 	@echo "$(TERM_YELLOW)$@$(TERM_RESET): $<"
-## ASSEMBLY Recipes
-$(ASM_OUT): $(LIB_OBJECTS) $(ASM_OBJECTS) $(ASM_DIST)/main.o
-	@$(CPP) $(ASM_CFLAGS) $^ -o $@ $(LIBS)
-	@echo "$(TERM_GREEN)$@$(TERM_RESET): $^"
-$(ASM_DIST)/%.o: $(ASM_SRC)/%.cpp | $(ASM_DIST) $(DEPDIR)/asm
-	@$(CPP) $(ASM_CFLAGS) $(DEPFLAGS) $(DEPDIR)/asm/$*.d -c $< -o $@ $(LIBS)
-	@echo "$(TERM_YELLOW)$@$(TERM_RESET): $<"
-## EXAMPLES recipes
-$(EXAMPLES_DIST)/%.out: $(EXAMPLES_SRC)/%.asm $(ASM_OUT) | $(EXAMPLES_DIST)
-	@$(ASM_OUT) $< $@
-	@echo "$(TERM_GREEN)$@$(TERM_RESET): $<"
-.PHONY: run-examples
-run-examples: $(EXAMPLES)
-	@$(foreach example,$(EXAMPLES), echo "$(TERM_YELLOW)$(example)$(TERM_RESET)"; $(MAKE) -s interpret BYTECODE=$(example);)
-OUT=
-ARGS=
 .PHONY: run
-run: $(DIST)/$(OUT)
+run: $(DIST)/$(VM_OUT)
 	./$^ $(ARGS)
 .PHONY: clean
 clean:
 	rm -rfv $(DIST)/*
-SOURCE=
-BYTECODE=
-.PHONY: assemble
-assemble: $(ASM_OUT)
-	@$(ASM_OUT) $(SOURCE) $(BYTECODE)
 .PHONY: interpret
 interpret: $(VM_OUT)
 	@$(VM_OUT) $(BYTECODE)
-.PHONY: exec
-exec: $(ASM_OUT) $(VM_OUT)
-	@$(ASM_OUT) $(SOURCE) $(BYTECODE)
-	@$(VM_OUT) $(BYTECODE)
 # Directories
 $(DIST):
 	mkdir -p $@
@@ -126,18 +72,9 @@ $(LIB_DIST):
 $(VM_DIST):
 	mkdir -p $@
-$(ASM_DIST):
-	mkdir -p $@
-$(EXAMPLES_DIST):
-	mkdir -p $@
 $(DEPDIR)/lib:
 	mkdir -p $@
-$(DEPDIR)/asm:
-	mkdir -p $@
 $(DEPDIR)/vm:
 	mkdir -p $@

README.org
View File

@@ -1,4 +1,4 @@
-#+title: Oreo's Virtual Machine (OVM)
+#+title: Aryadev's Virtual Machine (AVM)
 #+author: Aryadev Chavali
 #+date: 2023-10-15
@@ -6,18 +6,14 @@ A stack based virtual machine in C11, with a dynamic register setup
 which acts as variable space. Deals primarily in bytes, doesn't make
 assertions about typing and is very simple to target.
-2024-04-16: Project will now be split into two components
-1) The runtime + base library
-2) The assembler
-This will focus each repository on separate issues and make it easier
-to organize. They will both derive from the same repositories
-i.e. I'm not making fresh repositories and just sticking the folders
-in but rather branching this repository into two different versions.
-The two versions will be hosted at:
-1) [[https://github.com/aryadev-software/avm]]
-2) [[https://github.com/aryadev-software/aal]]
+This repository contains both a library ([[file:lib/][lib folder]]) to
+(de)serialize bytecode and a program ([[file:vm/][vm folder]]) to
+execute bytecode.
+Along with this is an
+[[https://github.com/aryadev-software/aal][assembler]] program which
+can compile an assembly-like language to bytecode.
 * How to build
 Requires =GNU make= and a compliant C11 compiler. Code base has been
 tested against =gcc= and =clang=, but given how the project has been
@@ -26,85 +22,70 @@ issue to compile using something like =tcc= or another compiler (look
 at [[file:Makefile::CC=gcc][here]] to change the compiler).
 To build everything simply run ~make~. This will build:
-+ [[file:lib/inst.c][instruction bytecode system]] which provides
-  object files to target the VM
-+ [[file:vm/main.c][VM executable]] which executes bytecode
-+ [[file:asm/main.c][Assembler executable]] which assembles compliant
-  assembly code to VM bytecode
-+ [[file:examples/][Assembly examples]] which provide some source code
-  examples on common programs one may write. Use this to figure out
-  how to write compliant assembly. Also a good test of both the VM
-  and assembler.
++ [[file:lib/][instruction bytecode system]] which provides object
+  files to target the VM
++ [[file:vm/][VM executable]] which executes bytecode
 You may also build each component individually through the
 corresponding recipe:
 + ~make lib~
 + ~make vm~
-+ ~make asm~
-+ ~make examples~
-* Instructions to target the virtual machine
-You need to link with the object files for
-[[file:lib/base.c][base.c]], [[file:lib/darr.c][darr.c]] and
-[[file:lib/inst.c][inst.c]] to be able to properly target the OVM.
-The basic idea is to create some instructions via ~inst_t~,
-instantiating a ~prog_t~ structure which wraps those instructions
-(includes a header and other useful things for the runtime), then
-using ~prog_write_file~ to serialise and write bytecode to a file
-pointer.
-To execute directly compiled bytecode use the ~ovm.out~ executable on
-the bytecode file.
-For clarity, one may build ~lib~ (~make lib~) then use the resulting
-object files to link and create bytecode for the virtual machine.
+* How to target the virtual machine
+Link with the object files for [[file:lib/base.c][base.c]] and
+[[file:lib/inst.c][inst.c]] to be able to properly target the virtual
+machine. The general idea is to convert parse units into instances of
+~inst_t~. Once a collection of ~inst_t~'s has been made, it must
+be wrapped in a ~prog_t~ structure, which is a flexibly allocated
+structure with two components:
+1) A program header ~prog_header_t~ with some essential properties of
+   the program (start address, count, etc.)
+2) A buffer of type ~inst_t~ which should contain the ordered
+   collection constructed
+There are two ways to use this program structure: compilation or
+in-memory execution.
+** Compilation
+The ~prog_t~ structure can be fed to ~prog_write_file~ with a file
+pointer to write well-formed =AVM= bytecode into a file. To execute
+this bytecode, simply use the ~avm.out~ executable with the bytecode
+file name.
+This is the classical way I expect languages to target the virtual
+machine.
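A minimal sketch of this flow, assuming hypothetical opcode constants
and structure fields: only ~inst_t~, ~prog_t~, ~prog_header_t~ and
~prog_write_file~ are named above, so everything else here should be
checked against [[file:lib/inst.h]] and [[file:lib/prog.h]].
#+begin_src c
/* Sketch: push 'a', print it, halt, then serialise.  The opcode
 * constants (OP_*), the DBYTE operand macro and the header fields
 * (start, count) are assumptions, not the library's confirmed API. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <lib/inst.h>

int main(void)
{
  inst_t insts[] = {
      {.opcode = OP_PUSH_BYTE, .operand = DBYTE('a')},
      {.opcode = OP_PRINT_CHAR},
      {.opcode = OP_HALT},
  };
  const size_t count = sizeof(insts) / sizeof(insts[0]);

  /* prog_t is flexibly allocated: a header followed by a buffer of
   * inst_t, so allocate both in one block. */
  prog_t *program = malloc(sizeof(*program) + sizeof(insts));
  program->header = (prog_header_t){.start = 0, .count = count};
  memcpy(program->instructions, insts, sizeof(insts));

  /* Serialise well-formed AVM bytecode to a file pointer. */
  FILE *fp = fopen("out.bin", "wb");
  prog_write_file(program, fp);
  fclose(fp);
  free(program);
  return 0;
}
#+end_src
The resulting =out.bin= would then be executed with ~avm.out
out.bin~.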
 ** In memory virtual machine
-Instead of serialising and writing bytecode to a file, one may instead
-serialise bytecode in memory using ~prog_write_bytecode~ which writes
-bytecode to a dynamic byte buffer, so called *in memory compilation*.
-To execute this bytecode, deserialise the bytecode into a program then
-load it into a complete ~vm_t~ structure (linking with
-[[file:vm/runtime.c][runtime.c]]).
-In fact, you may skip the process of serialising entirely. You can
-emit a ~prog_t~ structure corresponding to source code, load it
-directly into the ~vm_t~ structure, then execute. To do so is a bit
-involved, so I recommend looking at [[file:vm/main.c]]. In rough
-steps:
-+ Create a virtual machine "from scratch" (load the necessary
-  components (the stack, heap and call stack) by hand)
-+ Load program into VM (~vm_load_program~)
-+ Run ~vm_execute_all~
-This is recommended if writing an interpreted language such as a Lisp,
-where on demand execution of code is more suitable.
+This method requires linking with [[file:vm/runtime.c]] to be able to
+construct a working ~vm_t~ structure. The steps are:
++ Load the stack, heap and call stack into a ~vm_t~ structure
++ Load the ~prog_t~ into the ~vm_t~ (~vm_load_program~)
++ Execute via ~vm_execute~ or ~vm_execute_all~
+~vm_execute~ executes the next instruction then stops, while
+~vm_execute_all~ continues execution till the program halts. Either
+can be useful depending on requirements.
+I expect this method to be used for languages that are /interpreted/,
+such as Lisp or Python, where /code/ -> /execution/ rather than /code/
+-> /compile unit/ -> /execute unit/, while still providing the ability
+to compile code to a bytecode unit.
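A sketch of those steps, under the same caveat: only ~vm_t~,
~vm_load_program~, ~vm_execute~ and ~vm_execute_all~ are documented
above, so the member names and buffer sizes below are assumptions;
[[file:vm/main.c]] shows the canonical "from scratch" setup.
#+begin_src c
/* Sketch: execute a prog_t without touching the disk.  The vm_t
 * members (stack, call_stack, heap) and the byte type are
 * assumptions drawn from the prose above, not confirmed API. */
#include <vm/runtime.h>

void run_in_memory(prog_t *program)
{
  static byte stack_mem[1024], call_stack_mem[256];
  vm_t vm = {0};

  /* 1. Load the stack, heap and call stack by hand (assumed fields). */
  vm.stack.data      = stack_mem;
  vm.stack.max       = sizeof(stack_mem);
  vm.call_stack.data = call_stack_mem;
  vm.call_stack.max  = sizeof(call_stack_mem);
  vm.heap            = (heap_t){0};

  /* 2. Load the program (documented API). */
  vm_load_program(&vm, program);

  /* 3. Run until HALT (documented API). */
  vm_execute_all(&vm);
}
#+end_src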
 * Lines of code
 #+begin_src sh :results table :exports results
 wc -lwc $(find -regex ".*\.[ch]\(pp\)?")
 #+end_src
 #+RESULTS:
 | Files | Lines | Words | Bytes |
-|------------------------+-------+-------+--------|
+|----------------+-------+-------+-------|
 | ./lib/heap.h | 42 | 111 | 801 |
-| ./lib/inst.c | 516 | 1315 | 13982 |
+| ./lib/inst.c | 512 | 1303 | 13936 |
 | ./lib/darr.c | 77 | 225 | 1757 |
 | ./lib/base.c | 107 | 306 | 2002 |
 | ./lib/inst.h | 108 | 426 | 4067 |
 | ./lib/prog.h | 176 | 247 | 2616 |
 | ./lib/base.h | 148 | 626 | 3915 |
 | ./lib/darr.h | 88 | 465 | 2697 |
 | ./lib/heap.c | 101 | 270 | 1910 |
 | ./vm/runtime.h | 301 | 780 | 7965 |
 | ./vm/runtime.c | 1070 | 3097 | 30010 |
 | ./vm/main.c | 92 | 265 | 2243 |
-| ./asm/base.hpp | 21 | 68 | 472 |
-| ./asm/lexer.cpp | 565 | 1448 | 14067 |
-| ./asm/base.cpp | 33 | 89 | 705 |
-| ./asm/parser.hpp | 82 | 199 | 1656 |
-| ./asm/parser.cpp | 42 | 129 | 1294 |
-| ./asm/lexer.hpp | 106 | 204 | 1757 |
-| ./asm/preprocesser.cpp | 218 | 574 | 5800 |
-| ./asm/preprocesser.hpp | 62 | 147 | 1360 |
-| ./asm/main.cpp | 148 | 414 | 3791 |
-|------------------------+-------+-------+--------|
-| total | 4103 | 11405 | 104867 |
+|----------------+-------+-------+-------|
+| total | 2822 | 8121 | 73919 |

asm/base.cpp
View File

@@ -1,33 +0,0 @@
/* Copyright (C) 2024 Aryadev Chavali
* You may distribute and modify this code under the terms of the
* GPLv2 license. You should have received a copy of the GPLv2
* license with this file. If not, please write to:
* aryadev@aryadevchavali.com.
* Created: 2024-04-14
* Author: Aryadev Chavali
* Description:
*/
#include "./base.hpp"
#include <cstdio>
std::optional<std::string> read_file(const char *filename)
{
FILE *fp = fopen(filename, "rb");
if (fp)
{
std::string contents;
fseek(fp, 0, SEEK_END);
contents.resize(ftell(fp));
rewind(fp);
fread(&contents[0], 1, contents.size(), fp);
fclose(fp);
return contents;
}
else
return std::nullopt;
}

asm/base.hpp
View File

@@ -1,21 +0,0 @@
/* Copyright (C) 2024 Aryadev Chavali
* You may distribute and modify this code under the terms of the
* GPLv2 license. You should have received a copy of the GPLv2
* license with this file. If not, please write to:
* aryadev@aryadevchavali.com.
* Created: 2024-04-14
* Author: Aryadev Chavali
* Description: Base library
*/
#ifndef BASE_HPP
#define BASE_HPP
#include <optional>
#include <string>
std::optional<std::string> read_file(const char *);
#endif

asm/lexer.cpp
View File

@@ -1,565 +0,0 @@
/* Copyright (C) 2024 Aryadev Chavali
* You may distribute and modify this code under the terms of the
* GPLv2 license. You should have received a copy of the GPLv2
* license with this file. If not, please write to:
* aryadev@aryadevchavali.com.
* Created: 2024-04-14
* Author: Aryadev Chavali
* Description: Lexer for assembly language
*/
extern "C"
{
#include <lib/inst.h>
}
#include <algorithm>
#include <tuple>
#include "./lexer.hpp"
static_assert(NUMBER_OF_OPCODES == 98, "ERROR: Lexer is out of date");
using std::string, std::string_view, std::pair, std::make_pair;
const auto VALID_SYMBOL = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUV"
"WXYZ0123456789-_.:%#$",
VALID_DIGIT = "0123456789", VALID_HEX = "0123456789abcdefABCDEF";
bool is_char_in_s(char c, const char *s)
{
return string_view(s).find(c) != string::npos;
}
bool initial_match(string_view src, string_view match)
{
return (src.size() > match.size() && src.substr(0, match.size()) == match);
}
pair<token_t, lerr_t> tokenise_symbol(string_view &source, size_t &column,
size_t line)
{
auto end = source.find_first_not_of(VALID_SYMBOL);
if (end == string::npos)
end = source.size() - 1;
string sym{source.substr(0, end)};
source.remove_prefix(end);
std::transform(sym.begin(), sym.end(), sym.begin(), ::toupper);
token_t t{};
if (sym == "%CONST")
{
t.type = token_type_t::PP_CONST;
}
else if (sym == "%USE")
{
t.type = token_type_t::PP_USE;
}
else if (sym == "%END")
{
t.type = token_type_t::PP_END;
}
else if (sym[0] == '%')
{
return make_pair(
t, lerr_t(lerr_type_t::INVALID_PREPROCESSOR_DIRECTIVE, column, line));
}
else if (sym.size() > 1 && sym[0] == '$')
{
t = token_t(token_type_t::PP_REFERENCE, sym.substr(1));
}
else if (sym == "NOOP")
{
t.type = token_type_t::NOOP;
}
else if (sym == "HALT")
{
t.type = token_type_t::HALT;
}
else if (initial_match(sym, "PUSH.REG."))
{
t = token_t(token_type_t::PUSH_REG, sym.substr(9));
}
else if (initial_match(sym, "PUSH."))
{
t = token_t(token_type_t::PUSH, sym.substr(5));
}
else if (initial_match(sym, "POP."))
{
t = token_t(token_type_t::POP, sym.substr(4));
}
else if (initial_match(sym, "MOV."))
{
t = token_t(token_type_t::MOV, sym.substr(4));
}
else if (initial_match(sym, "DUP."))
{
t = token_t(token_type_t::DUP, sym.substr(4));
}
else if (initial_match(sym, "MALLOC.STACK."))
{
t = token_t(token_type_t::MALLOC_STACK, sym.substr(13));
}
else if (initial_match(sym, "MALLOC."))
{
t = token_t(token_type_t::MALLOC, sym.substr(7));
}
else if (initial_match(sym, "MSET.STACK."))
{
t = token_t(token_type_t::MSET_STACK, sym.substr(11));
}
else if (initial_match(sym, "MSET."))
{
t = token_t(token_type_t::MSET, sym.substr(5));
}
else if (initial_match(sym, "MGET.STACK."))
{
t = token_t(token_type_t::MGET_STACK, sym.substr(11));
}
else if (initial_match(sym, "MGET."))
{
t = token_t(token_type_t::MGET, sym.substr(5));
}
else if (sym == "MDELETE")
{
t.type = token_type_t::MDELETE;
}
else if (sym == "MSIZE")
{
t.type = token_type_t::MSIZE;
}
else if (initial_match(sym, "NOT."))
{
t = token_t(token_type_t::NOT, sym.substr(4));
}
else if (initial_match(sym, "OR."))
{
t = token_t(token_type_t::OR, sym.substr(3));
}
else if (initial_match(sym, "AND."))
{
t = token_t(token_type_t::AND, sym.substr(4));
}
else if (initial_match(sym, "XOR."))
{
t = token_t(token_type_t::XOR, sym.substr(4));
}
else if (initial_match(sym, "EQ."))
{
t = token_t(token_type_t::EQ, sym.substr(3));
}
else if (initial_match(sym, "LTE."))
{
t = token_t(token_type_t::LTE, sym.substr(4));
}
else if (initial_match(sym, "LT."))
{
t = token_t(token_type_t::LT, sym.substr(3));
}
else if (initial_match(sym, "GTE."))
{
t = token_t(token_type_t::GTE, sym.substr(4));
}
else if (initial_match(sym, "GT."))
{
t = token_t(token_type_t::GT, sym.substr(3));
}
else if (initial_match(sym, "SUB."))
{
t = token_t(token_type_t::SUB, sym.substr(4));
}
else if (initial_match(sym, "PLUS."))
{
t = token_t(token_type_t::PLUS, sym.substr(5));
}
else if (initial_match(sym, "MULT."))
{
t = token_t(token_type_t::MULT, sym.substr(5));
}
else if (initial_match(sym, "PRINT."))
{
t = token_t(token_type_t::PRINT, sym.substr(6));
}
else if (sym == "JUMP.ABS")
{
t.type = token_type_t::JUMP_ABS;
}
else if (sym == "JUMP.STACK")
{
t.type = token_type_t::JUMP_STACK;
}
else if (initial_match(sym, "JUMP.IF."))
{
t = token_t(token_type_t::JUMP_IF, sym.substr(8));
}
else if (sym == "CALL.STACK")
{
t.type = token_type_t::CALL_STACK;
}
else if (sym == "CALL")
{
t.type = token_type_t::CALL;
}
else if (sym == "RET")
{
t.type = token_type_t::RET;
}
else if (sym == "GLOBAL")
{
t.type = token_type_t::GLOBAL;
}
else
{
t.type = token_type_t::SYMBOL;
}
if (t.content == "")
t.content = sym;
t.column = column;
column += sym.size() - 1;
return make_pair(t, lerr_t());
}
token_t tokenise_literal_number(string_view &source, size_t &column)
{
bool is_negative = false;
if (source[0] == '-')
{
is_negative = true;
source.remove_prefix(1);
}
auto end = source.find_first_not_of(VALID_DIGIT);
if (end == string::npos)
end = source.size() - 1;
string digits{source.substr(0, end)};
source.remove_prefix(end);
token_t t{token_type_t::LITERAL_NUMBER, (is_negative ? "-" : "") + digits,
column};
column += digits.size() + (is_negative ? 1 : 0);
return t;
}
token_t tokenise_literal_hex(string_view &source, size_t &column)
{
// Remove x char from source
source.remove_prefix(1);
auto end = source.find_first_not_of(VALID_HEX);
if (end == string::npos)
end = source.size() - 1;
string digits{source.substr(0, end)};
source.remove_prefix(end);
token_t t = {token_type_t::LITERAL_NUMBER, "0x" + digits, column};
column += digits.size() + 1;
return t;
}
pair<token_t, lerr_t> tokenise_literal_char(string_view &source, size_t &column,
size_t &line)
{
token_t t{};
auto end = source.find('\'', 1);
if (source.size() < 3 || end == 1 || end > 3)
return make_pair(t,
lerr_t(lerr_type_t::INVALID_CHAR_LITERAL, column, line));
else if (source[1] == '\\')
{
// Escape sequence
char escape = '\0';
if (source.size() < 4 || source[3] != '\'')
return make_pair(t,
lerr_t(lerr_type_t::INVALID_CHAR_LITERAL_ESCAPE_SEQUENCE,
column, line));
switch (source[2])
{
case 'n':
escape = '\n';
break;
case 't':
escape = '\t';
break;
case 'r':
escape = '\r';
break;
case '\\':
escape = '\\';
break;
default:
column += 2;
return make_pair(t,
lerr_t(lerr_type_t::INVALID_CHAR_LITERAL_ESCAPE_SEQUENCE,
column, line));
break;
}
t = token_t{token_type_t::LITERAL_CHAR, std::to_string(escape), column};
column += 4;
source.remove_prefix(4);
}
else
{
t = token_t(token_type_t::LITERAL_CHAR, std::to_string(source[1]));
column += 3;
source.remove_prefix(3);
}
return make_pair(t, lerr_t());
}
token_t tokenise_literal_string(string_view &source, size_t &column, size_t end)
{
source.remove_prefix(1);
token_t token{token_type_t::LITERAL_STRING, string(source.substr(0, end - 1)),
column};
source.remove_prefix(end);
column += end + 1;
return token;
}
lerr_t tokenise_buffer(string_view source, std::vector<token_t *> &tokens)
{
size_t column = 0, line = 1;
while (source.size() > 0)
{
bool is_token = true;
char first = source[0];
token_t t{};
if (isspace(first) || first == '\0')
{
size_t i;
for (i = 0;
i < source.size() && (isspace(source[i]) || source[i] == '\0'); ++i)
{
++column;
if (source[i] == '\n')
{
column = 0;
++line;
}
}
++column;
source.remove_prefix(i);
is_token = false;
}
else if (first == ';')
{
size_t i;
for (i = 0; i < source.size() && source[i] != '\n'; ++i)
continue;
column = 0;
++line;
source.remove_prefix(i + 1);
is_token = false;
}
else if (first == '*')
{
t = token_t(token_type_t::STAR, "", column);
source.remove_prefix(1);
}
else if (first == '\"')
{
auto end = source.find('\"', 1);
if (end == string::npos)
return lerr_t(lerr_type_t::INVALID_STRING_LITERAL, column, line);
t = tokenise_literal_string(source, column, end);
}
else if (first == '\'')
{
lerr_t lerr;
std::tie(t, lerr) = tokenise_literal_char(source, column, line);
if (lerr.type != lerr_type_t::OK)
return lerr;
}
else if (isdigit(first) ||
(source.size() > 1 && first == '-' && isdigit(source[1])))
{
auto end = source.find_first_not_of(VALID_DIGIT, first == '-' ? 1 : 0);
if (end == string::npos)
end = source.size() - 1;
else if (end != string::npos && !(isspace(source[end])))
return lerr_t(lerr_type_t::INVALID_NUMBER_LITERAL, column, line);
t = tokenise_literal_number(source, column);
}
else if (first == '0' && source.size() > 2 && source[1] == 'x' &&
is_char_in_s(source[2], VALID_HEX))
{
auto end = source.find_first_not_of(VALID_HEX);
if (end == string::npos)
end = source.size() - 1;
else if (end != string::npos && !(isspace(source[end])))
return lerr_t(lerr_type_t::INVALID_NUMBER_LITERAL, column, line);
t = tokenise_literal_hex(source, column);
}
else if (is_char_in_s(first, VALID_SYMBOL))
{
lerr_t lerr;
std::tie(t, lerr) = tokenise_symbol(source, column, line);
if (lerr.type != lerr_type_t::OK)
return lerr;
}
else
{
++column;
return lerr_t{lerr_type_t::UNKNOWN_LEXEME, column, line};
}
if (is_token)
{
t.line = line;
token_t *acc = new token_t(t);
tokens.push_back(acc);
}
}
return lerr_t{};
}
std::ostream &operator<<(std::ostream &os, token_t &t)
{
return os << token_type_as_cstr(t.type) << "(`" << t.content << "`)@"
<< t.line << ", " << t.column;
}
token_t::token_t()
{}
token_t::token_t(token_type_t type, string content, size_t col, size_t line)
: type{type}, column{col}, line{line}, content{content}
{}
const char *token_type_as_cstr(token_type_t type)
{
switch (type)
{
case token_type_t::PP_USE:
return "PP_USE";
case token_type_t::PP_CONST:
return "PP_CONST";
case token_type_t::PP_END:
return "PP_END";
case token_type_t::PP_REFERENCE:
return "PP_REFERENCE";
case token_type_t::GLOBAL:
return "GLOBAL";
case token_type_t::STAR:
return "STAR";
case token_type_t::LITERAL_STRING:
return "LITERAL_STRING";
case token_type_t::LITERAL_NUMBER:
return "LITERAL_NUMBER";
case token_type_t::LITERAL_CHAR:
return "LITERAL_CHAR";
case token_type_t::NOOP:
return "NOOP";
case token_type_t::HALT:
return "HALT";
case token_type_t::PUSH:
return "PUSH";
case token_type_t::POP:
return "POP";
case token_type_t::PUSH_REG:
return "PUSH_REG";
case token_type_t::MOV:
return "MOV";
case token_type_t::DUP:
return "DUP";
case token_type_t::MALLOC:
return "MALLOC";
case token_type_t::MALLOC_STACK:
return "MALLOC_STACK";
case token_type_t::MSET:
return "MSET";
case token_type_t::MSET_STACK:
return "MSET_STACK";
case token_type_t::MGET:
return "MGET";
case token_type_t::MGET_STACK:
return "MGET_STACK";
case token_type_t::MDELETE:
return "MDELETE";
case token_type_t::MSIZE:
return "MSIZE";
case token_type_t::NOT:
return "NOT";
case token_type_t::OR:
return "OR";
case token_type_t::AND:
return "AND";
case token_type_t::XOR:
return "XOR";
case token_type_t::EQ:
return "EQ";
case token_type_t::LT:
return "LT";
case token_type_t::LTE:
return "LTE";
case token_type_t::GT:
return "GT";
case token_type_t::GTE:
return "GTE";
case token_type_t::PLUS:
return "PLUS";
case token_type_t::SUB:
return "SUB";
case token_type_t::MULT:
return "MULT";
case token_type_t::PRINT:
return "PRINT";
case token_type_t::JUMP_ABS:
return "JUMP_ABS";
case token_type_t::JUMP_STACK:
return "JUMP_STACK";
case token_type_t::JUMP_IF:
return "JUMP_IF";
case token_type_t::CALL:
return "CALL";
case token_type_t::CALL_STACK:
return "CALL_STACK";
case token_type_t::RET:
return "RET";
case token_type_t::SYMBOL:
return "SYMBOL";
}
return "";
}
std::ostream &operator<<(std::ostream &os, lerr_t &lerr)
{
os << lerr.line << ":" << lerr.col << ": ";
switch (lerr.type)
{
case lerr_type_t::OK:
os << "OK";
break;
case lerr_type_t::INVALID_CHAR_LITERAL:
os << "INVALID_CHAR_LITERAL";
break;
case lerr_type_t::INVALID_CHAR_LITERAL_ESCAPE_SEQUENCE:
os << "INVALID_CHAR_LITERAL_ESCAPE_SEQUENCE";
break;
case lerr_type_t::INVALID_STRING_LITERAL:
os << "INVALID_STRING_LITERAL";
break;
case lerr_type_t::INVALID_NUMBER_LITERAL:
os << "INVALID_NUMBER_LITERAL";
break;
case lerr_type_t::INVALID_PREPROCESSOR_DIRECTIVE:
os << "INVALID_PREPROCESSOR_DIRECTIVE";
break;
case lerr_type_t::UNKNOWN_LEXEME:
os << "UNKNOWN_LEXEME";
break;
default:
break;
}
return os;
}
lerr_t::lerr_t(lerr_type_t type, size_t col, size_t line)
: col{col}, line{line}, type{type}
{}

asm/lexer.hpp
View File

@@ -1,106 +0,0 @@
/* Copyright (C) 2024 Aryadev Chavali
* You may distribute and modify this code under the terms of the
* GPLv2 license. You should have received a copy of the GPLv2
* license with this file. If not, please write to:
* aryadev@aryadevchavali.com.
* Created: 2024-04-14
* Author: Aryadev Chavali
* Description: Lexer for assembly language
*/
#ifndef LEXER_HPP
#define LEXER_HPP
#include <ostream>
#include <string>
#include <tuple>
#include <vector>
enum class token_type_t
{
PP_CONST, // %const(<symbol>)...
PP_USE, // %use <string>
PP_END, // %end
PP_REFERENCE, // $<symbol>
GLOBAL,
STAR,
LITERAL_NUMBER,
LITERAL_CHAR,
LITERAL_STRING,
NOOP,
HALT,
PUSH,
POP,
PUSH_REG,
MOV,
DUP,
MALLOC,
MALLOC_STACK,
MSET,
MSET_STACK,
MGET,
MGET_STACK,
MDELETE,
MSIZE,
NOT,
OR,
AND,
XOR,
EQ,
LT,
LTE,
GT,
GTE,
PLUS,
SUB,
MULT,
PRINT,
JUMP_ABS,
JUMP_STACK,
JUMP_IF,
CALL,
CALL_STACK,
RET,
SYMBOL,
};
const char *token_type_as_cstr(token_type_t type);
struct token_t
{
token_type_t type;
size_t column, line;
std::string content;
token_t();
token_t(token_type_t, std::string, size_t col = 0, size_t line = 0);
};
std::ostream &operator<<(std::ostream &, token_t &);
enum class lerr_type_t
{
OK = 0,
INVALID_CHAR_LITERAL,
INVALID_CHAR_LITERAL_ESCAPE_SEQUENCE,
INVALID_STRING_LITERAL,
INVALID_NUMBER_LITERAL,
INVALID_PREPROCESSOR_DIRECTIVE,
UNKNOWN_LEXEME,
};
struct lerr_t
{
size_t col, line;
lerr_type_t type;
lerr_t(lerr_type_t type = lerr_type_t::OK, size_t col = 0, size_t line = 0);
};
std::ostream &operator<<(std::ostream &, lerr_t &);
lerr_t tokenise_buffer(std::string_view, std::vector<token_t *> &);
#endif

asm/main.cpp
View File

@@ -1,148 +0,0 @@
/* Copyright (C) 2024 Aryadev Chavali
* You may distribute and modify this code under the terms of the
* GPLv2 license. You should have received a copy of the GPLv2
* license with this file. If not, please write to:
* aryadev@aryadevchavali.com.
* Created: 2024-04-14
* Author: Aryadev Chavali
* Description: Entrypoint for assembly program
*/
#include <algorithm>
#include <cstdio>
#include <iostream>
#include <optional>
#include <string>
#include <tuple>
#include <vector>
extern "C"
{
#include <lib/inst.h>
}
#include "./base.hpp"
#include "./lexer.hpp"
#include "./preprocesser.hpp"
using std::cout, std::cerr, std::endl;
using std::pair, std::string, std::string_view, std::vector;
void usage(const char *program_name, FILE *fp)
{
fprintf(fp,
"Usage: %s FILE OUT-FILE\n"
"\tFILE: Source code to compile\n"
"\tOUT-FILE: Name of file to store bytecode\n",
program_name);
}
int main(int argc, const char *argv[])
{
if (argc == 1 || argc > 3)
{
usage(argv[0], stderr);
return -1;
}
int ret = 0;
const char *source_name = argv[1];
const char *out_name = argv[2];
(void)out_name;
#if VERBOSE >= 1
printf("[%sASSEMBLER%s]: Assembling `%s` to `%s`\n", TERM_YELLOW, TERM_RESET,
source_name, out_name);
#endif
auto file_source = read_file(source_name);
#if VERBOSE >= 1
printf("[%sASSEMBLER%s]: `%s` -> %lu bytes\n", TERM_YELLOW, TERM_RESET,
source_name, file_source.has_value() ? file_source.value().size() : 0);
#endif
string source_str;
string_view original;
string_view src;
vector<token_t *> tokens, preprocessed_tokens;
lerr_t lerr;
pp_err_t pp_err;
// Highest scoped variable cut off point
if (file_source.has_value())
source_str = file_source.value();
else
{
cerr << "ERROR: file `" << source_name << "` does not exist!" << endl;
ret = -1;
goto end;
}
original = string_view{source_str};
src = string_view{source_str};
lerr = tokenise_buffer(src, tokens);
if (lerr.type != lerr_type_t::OK)
{
cerr << source_name << ":" << lerr << endl;
ret = 255 - static_cast<int>(lerr.type);
goto end;
}
else
{
#if VERBOSE >= 1
printf("[%sLEXER%s]: %lu bytes -> %lu tokens\n", TERM_GREEN, TERM_RESET,
source_str.size(), tokens.size());
#endif
#if VERBOSE == 2
printf("[%sLEXER%s]: Tokens "
"parsed:\n----------------------------------------------------------"
"----------------------\n",
TERM_GREEN, TERM_RESET);
for (auto token : tokens)
cout << "\t" << *token << endl;
printf("-------------------------------------------------------------"
"-------------------\n");
#endif
}
// preprocessing
pp_err = preprocesser(tokens, preprocessed_tokens);
if (pp_err.type != pp_err_type_t::OK)
{
cerr << source_name << ":" << pp_err.reference->line << ":"
<< pp_err.reference->column << ": " << pp_err << endl;
ret = 255 - static_cast<int>(pp_err.type);
goto end;
}
else
{
#if VERBOSE >= 1
printf("[%sPREPROCESSOR%s]: %lu tokens -> %lu tokens\n", TERM_GREEN,
TERM_RESET, tokens.size(), preprocessed_tokens.size());
#endif
#if VERBOSE == 2
printf("[%sPREPROCESSOR%s]: Processed tokens: "
"\n-----------------------------------------------------------------"
"---------------\n",
TERM_GREEN, TERM_RESET);
for (auto token : preprocessed_tokens)
cout << "\t" << *token << endl;
printf("-------------------------------------------------------------"
"-------------------\n");
#endif
}
end:
for (auto token : tokens)
delete token;
for (auto token : preprocessed_tokens)
delete token;
return ret;
}

asm/preprocesser.cpp
View File

@@ -1,218 +0,0 @@
/* Copyright (C) 2024 Aryadev Chavali
* You may distribute and modify this code under the terms of the
* GPLv2 license. You should have received a copy of the GPLv2
* license with this file. If not, please write to:
* aryadev@aryadevchavali.com.
* Created: 2024-04-14
* Author: Aryadev Chavali
* Description: Preprocessor which occurs after lexing before parsing.
*/
#include "./preprocesser.hpp"
#include "./base.hpp"
#include <algorithm>
#include <unordered_map>
using std::pair, std::vector, std::make_pair, std::string, std::string_view;
#define VCLEAR(V) \
std::for_each((V).begin(), (V).end(), \
[](token_t *t) \
{ \
delete t; \
});
pp_err_t preprocess_use_blocks(const vector<token_t *> &tokens,
vector<token_t *> &vec_out)
{
for (size_t i = 0; i < tokens.size(); ++i)
{
token_t *t = tokens[i];
if (t->type == token_type_t::PP_USE)
{
if (i + 1 >= tokens.size() ||
tokens[i + 1]->type != token_type_t::LITERAL_STRING)
{
VCLEAR(vec_out);
vec_out.clear();
return pp_err_t(pp_err_type_t::EXPECTED_STRING, t);
}
token_t *name = tokens[i + 1];
auto source = read_file(name->content.c_str());
if (!source)
{
VCLEAR(vec_out);
vec_out.clear();
return pp_err_t(pp_err_type_t::FILE_NONEXISTENT, name);
}
std::vector<token_t *> ftokens;
lerr_t lerr = tokenise_buffer(source.value(), ftokens);
if (lerr.type != lerr_type_t::OK)
{
VCLEAR(vec_out);
vec_out.clear();
return pp_err_t(pp_err_type_t::FILE_PARSE_ERROR, name, lerr);
}
vec_out.insert(vec_out.end(), ftokens.begin(), ftokens.end());
++i;
}
else
vec_out.push_back(new token_t{*t});
}
return pp_err_t();
}
struct const_t
{
size_t start, end;
};
pp_err_t preprocess_const_blocks(const vector<token_t *> &tokens,
vector<token_t *> &vec_out)
{
std::unordered_map<string_view, const_t> blocks;
for (size_t i = 0; i < tokens.size(); ++i)
{
token_t *t = tokens[i];
if (t->type == token_type_t::PP_CONST)
{
string_view capture;
if (i + 1 >= tokens.size() || tokens[i + 1]->type != token_type_t::SYMBOL)
return pp_err_type_t::EXPECTED_NAME;
capture = tokens[++i]->content;
++i;
size_t block_start = i, block_end = 0;
for (; i < tokens.size() && tokens[i]->type != token_type_t::PP_END; ++i)
continue;
if (i == tokens.size())
return pp_err_t{pp_err_type_t::EXPECTED_END};
block_end = i;
blocks[capture] = const_t{block_start, block_end};
}
}
if (blocks.size() == 0)
{
// Just construct a new vector and carry on
for (token_t *token : tokens)
vec_out.push_back(new token_t{*token});
}
else
{
for (size_t i = 0; i < tokens.size(); ++i)
{
token_t *token = tokens[i];
// Skip the tokens that construct the const
if (token->type == token_type_t::PP_CONST)
for (; i < tokens.size() && tokens[i]->type != token_type_t::PP_END;
++i)
continue;
else if (token->type == token_type_t::PP_REFERENCE)
{
auto it = blocks.find(token->content);
if (it == blocks.end())
{
VCLEAR(vec_out);
vec_out.clear();
return pp_err_t(pp_err_type_t::UNKNOWN_NAME, token);
}
const_t block = it->second;
for (size_t i = block.start; i < block.end; ++i)
vec_out.push_back(new token_t{*tokens[i]});
}
else
vec_out.push_back(new token_t{*token});
}
}
return pp_err_t();
}
pp_err_t preprocesser(const vector<token_t *> &tokens,
vector<token_t *> &vec_out)
{
vector<token_t *> use_block_tokens;
pp_err_t pperr = preprocess_use_blocks(tokens, use_block_tokens);
if (pperr.type != pp_err_type_t::OK)
{
vec_out = tokens;
return pperr;
}
vector<token_t *> const_block_tokens;
pperr = preprocess_const_blocks(use_block_tokens, const_block_tokens);
if (pperr.type != pp_err_type_t::OK)
{
VCLEAR(tokens);
vec_out = use_block_tokens;
return pperr;
}
VCLEAR(use_block_tokens);
vec_out = const_block_tokens;
return pp_err_t{pp_err_type_t::OK};
}
// TODO: Implement this
pp_err_t preprocess_macro_blocks(const vector<token_t *> &,
vector<token_t *> &);
std::ostream &operator<<(std::ostream &os, pp_err_t &err)
{
os << "PREPROCESSING_";
switch (err.type)
{
case OK:
return os << "OK";
case EXPECTED_NAME:
return os << "EXPECTED_NAME";
case EXPECTED_STRING:
return os << "EXPECTED_STRING";
case EXPECTED_END:
return os << "EXPECTED_END";
case FILE_NONEXISTENT:
return os << "FILE_NONEXISTENT";
case FILE_PARSE_ERROR:
return os << "FILE_PARSE_ERROR -> \n\t[" << err.reference->content
<< "]:" << err.lerr;
case UNKNOWN_NAME:
return os << "UNKNOWN_NAME";
}
return os;
}
pp_err_t::pp_err_t() : reference{nullptr}, type{pp_err_type_t::OK}, lerr{}
{}
pp_err_t::pp_err_t(pp_err_type_t e) : reference{nullptr}, type{e}, lerr{}
{}
pp_err_t::pp_err_t(pp_err_type_t err, const token_t *ref)
: reference{ref}, type{err}
{}
pp_err_t::pp_err_t(pp_err_type_t err, const token_t *ref, lerr_t lerr)
: reference{ref}, type{err}, lerr{lerr}
{}
// pp_unit_t::pp_unit_t(const token_t *const token) : resolved{false},
// token{token}
// {}
// pp_unit_t::pp_unit_t(std::string_view name, std::vector<pp_unit_t> elements)
// : resolved{false}, token{nullptr}, container{name, elements}
// {}

asm/preprocesser.hpp
View File

@@ -1,62 +0,0 @@
/* Copyright (C) 2024 Aryadev Chavali
* You may distribute and modify this code under the terms of the GPLv2
* license. You should have received a copy of the GPLv2 license with
* this file. If not, please write to: aryadev@aryadevchavali.com.
* Created: 2024-04-14
* Author: Aryadev Chavali
* Description: Preprocessor which occurs after lexing before parsing.
*/
#ifndef PREPROCESSER_HPP
#define PREPROCESSER_HPP
#include <ostream>
#include <tuple>
#include "./lexer.hpp"
enum pp_err_type_t
{
OK = 0,
EXPECTED_NAME,
EXPECTED_STRING,
EXPECTED_END,
FILE_NONEXISTENT,
FILE_PARSE_ERROR,
UNKNOWN_NAME,
};
struct pp_err_t
{
const token_t *reference;
pp_err_type_t type;
lerr_t lerr;
pp_err_t();
pp_err_t(pp_err_type_t);
pp_err_t(pp_err_type_t, const token_t *);
pp_err_t(pp_err_type_t, const token_t *, lerr_t);
};
std::ostream &operator<<(std::ostream &, pp_err_t &);
struct pp_unit_t
{
const token_t *const token;
struct
{
std::string_view name;
std::vector<pp_unit_t> elements;
} container;
pp_unit_t(const token_t *const);
pp_unit_t(std::string_view, std::vector<pp_unit_t>);
};
std::vector<pp_unit_t> tokens_to_units(const std::vector<token_t *> &);
pp_err_t preprocess_use(std::vector<pp_unit_t> &);
pp_err_t preprocesser(const std::vector<token_t *> &, std::vector<token_t *> &);
#endif

examples/factorial.asm
View File

@@ -1,54 +0,0 @@
;;; factorial.asm: A program that generates the factorials of each
;;; number from 1 to 20. Using the registers to store `n` and `n!`.
;; Constants
;; Choice of 20 was not arbitrary; log(20!) ~= 61 while log(21!) ~=
;; 65 which means that past 20! results are truncated and therefore
;; the program produces inaccurate factorials.
%const limit 20 %end
;; Setup entrypoint
global main
main:
;; $I -> W[0] = 1, $J -> W[1] = 1
push.word 1
mov.word 0
push.word 1
mov.word 1
;; Print `$I: $J`
loopback:
push.byte '\t'
print.char
push.reg.word 0
print.word
push.byte ':'
print.char
push.byte ' '
print.char
push.reg.word 1
print.word
push.byte '\n'
print.char
;; $I += 1
push.reg.word 0
push.word 1
plus.word
mov.word 0
;; $J *= $I
push.reg.word 0
push.reg.word 1
mult.word
mov.word 1
;; IF $I >= $LIMIT ...
push.word $limit
push.reg.word 0
gte.word
;; THEN jump to `loopback`
jump.if.byte loopback
;; ELSE halt
halt

examples/fib.asm
View File

@@ -1,92 +0,0 @@
;;; fib.asm: A program that generates the fibonacci numbers up to a
;;; very large bound (~UINT64_MAX). Using the registers to store the
;;; pairs of fibonacci numbers, we ensure only a finite amount of
;;; memory is necessary for this program to function, unlike a pure
;;; stack version.
;; Constants
%const limit 93 %end
%const increment_i
push.reg.word 2
push.word 1
plus.word
mov.word 2
%end
%const print_i
push.reg.word 2
print.word
%end
%const print_reg_0
push.reg.word 0
print.word
%end
%const print_reg_1
push.reg.word 1
print.word
%end
;; Setup entrypoint
global main
main:
;; Setup iterator I
push.word 1
mov.word 2
;; Setup initial A -> W[0] = 1 and B -> W[1] = 1
push.word 1
mov.word 0
push.word 1
mov.word 1
;; Print "$I: $A" and "($I + 1): $B"
loopback:
call print_pair
;; $A += $B
push.reg.word 0
push.reg.word 1
plus.word
mov.word 0
;; $B += $A
push.reg.word 0
push.reg.word 1
plus.word
mov.word 1
;; IF $I < $LIMIT ...
push.reg.word 2
push.word $limit
lt.word
;; THEN jump to `loopback`
jump.if.byte loopback
;; ELSE halt
halt
print_pair:
push.byte '\t'
print.char
$print_i
push.byte ':'
print.char
push.byte ' '
print.char
$print_reg_0
push.byte '\n'
print.char
$increment_i
push.byte '\t'
print.char
$print_i
push.byte ':'
print.char
push.byte ' '
print.char
$print_reg_1
push.byte '\n'
print.char
$increment_i
ret

examples/instruction-test.asm
View File

@@ -1,95 +0,0 @@
;;; instruction-test.asm: A file that contains all possible opcodes in
;;; order, with proper calling convention. Used to test lexer and
;;; parser but isn't a semantically correct program, but may be run as
;;; first instruction is halt (so program will stop immediately).
;; setup entrypoint
global main
main:
halt
push.byte 1
push.hword 2
push.word 3
pop.byte
pop.hword
pop.word
push.reg.byte 1
push.reg.hword 2
push.reg.word 3
mov.byte 1
mov.hword 2
mov.word 3
dup.byte 1
dup.hword 2
dup.word 3
malloc.byte 1
malloc.hword 2
malloc.word 3
malloc.stack.byte
malloc.stack.hword
malloc.stack.word
mset.byte 1
mset.hword 2
mset.word 3
mset.stack.byte
mset.stack.hword
mset.stack.word
mget.byte 1
mget.hword 2
mget.word 3
mget.stack.byte
mget.stack.hword
mget.stack.word
not.byte
not.hword
not.word
or.byte
or.hword
or.word
and.byte
and.hword
and.word
xor.byte
xor.hword
xor.word
eq.byte
eq.hword
eq.word
plus.byte
plus.hword
plus.word
sub.byte
sub.hword
sub.word
print.char
print.byte
print.int
print.hword
print.long
print.word
jump.abs 1
jump.stack
jump.if.byte 1
jump.if.hword 2
jump.if.word 3
;; Testing if overflows work correctly
;; Format is:
;; -1 All bits are turned on
;; UINT_MAX All bits are turned on
;; INT_MAX All bits but the most significant are on
;; INT_MIN Only the most significant bit is on
push.byte -1
push.byte 255
push.byte 127
push.byte -128
push.hword -1
push.hword 4294967295
push.hword 2147483647
push.hword -2147483648
push.word -1
push.word 18446744073709551615
push.word 9223372036854775807
push.word -9223372036854775808

examples/memory-print.asm
View File

@@ -1,65 +0,0 @@
;;; memory-print: An example program that features a subroutine for
;;; printing a memory buffer, of any length, as characters.
;; Setup label for entrypoint
global main
main:
;; Allocate a buffer of 3 characters
malloc.byte 3
mov.word 0
;; Setup the buffer to be equivalent to "abc"
push.reg.word 0
push.byte 'a'
mset.byte 0
push.reg.word 0
push.byte 'b'
mset.byte 1
push.reg.word 0
push.byte 'c'
mset.byte 2
;; Save buffer to W[8] because the first 8 registers should be
;; reserved for library routines as it may be overwritten
push.reg.word 0
mov.word 8
;; Call the routine
call print_cptr
;; Delete allocated buffer
push.reg.word 8
mdelete
halt
;;; print_cptr: Prints pointer to a buffer of characters. Pointer
;;; should be on the stack as a word.
print_cptr:
;; iterator I -> W[1]
push.word 0
mov.word 1
;; (W[0])[W[1]] -> P[I]
loopback:
push.reg.word 0
push.reg.word 1
mget.stack.byte
print.char
;; I += 1
push.reg.word 1
push.word 1
plus.word
mov.word 1
;; if I != |P| ...
push.reg.word 1
push.reg.word 0
msize
eq.word
not.byte
;; then go to `loopback`
jump.if.byte loopback
;; else print a newline
push.byte '\n'
print.char
;; return back to the caller
ret

todo.org
View File

@@ -10,200 +10,13 @@
 **** DONE lib/darr.h
 **** TODO lib/heap.h
 **** TODO lib/inst.h
-*** TODO ASM [0%]
-**** TODO asm/lexer.h
-**** TODO asm/parser.h
 *** TODO VM [0%]
 **** TODO vm/runtime.h
 ** TODO Specification
* TODO Preprocessing directives :ASM:
Like in FASM or NASM where we can give certain helpful instructions to
the assembler. I'd use the ~%~ symbol to designate preprocessor
directives.
** TODO Macros
Essentially constants expressions which take literal parameters
(i.e. tokens) and can use them throughout the body. Something like
#+begin_src asm
%macro(name)(param1 param2 param3)
...
%end
#+end_src
Where each parameter is substituted in a call at preprocessing time.
A call should look something like this:
#+begin_src asm
$name 1 2 3
#+end_src
and those tokens will be substituted literally in the macro body.
* WIP Write assembler in a different language :ASM:
While the runtime and base library need to deal only with
binary, the assembler has to deal with string inputs and a larger
variety of bugs. As the base library is written in C, and is all that
is necessary to write a program that targets the virtual machine, we
could realistically use another language to write the assembler in via
FFI with minimal pain.
Languages in the competition:
+ C++
+ Rust
+ Python
2024-04-14: Chose C++ cos it will require the least effort to rewrite
the currently existing codebase while still leveraging some less
efficient but incredibly useful features.
* TODO Rewrite preprocesser to create a custom unit instead of token streams
** Problem
A problem that occurs in the preprocessor is token column and line
count. Say =a.asm= has ~%use "b.asm"~. The tokens from the =b.asm=
file are inserted into =a.asm='s token stream, but the line/column
count from there isn't properly set in =a.asm=.
A naive solution would be to just recount the lines and columns, but
this removes information about where those tokens came from. Say an
error occurs in some of =b.asm='s code: I would like to be able to
report them.
Therefore, we can no longer just generate new token streams from the
preprocesser and should instead look at making more complex
abstractions.
A problem this could also solve is nested errors and recursive
constants. Say I have some assembly like so
#+begin_src asm
%const limit 20 %end
%const print-limit
...
push.byte $limit
print.byte
...
%end
#+end_src
A call to ~print-limit~ under the current system would insert the
tokens for print-limit but completely forget about ~push.byte $limit~
which would cause a parsing error. (This could be fixed under the
current system by allowing reference resolution inside of const
blocks, with the caveat that it would be hard to stop infinite recursion.)
** Language model
The model I have in mind is that all constructs in this meta language
(the preprocessing language) are either singular tokens or collections
of tokens/constructs in a recursive sense. This naturally follows
from the fact that a single pass isn't enough to properly parse this
language: there must be some recursive nature which forces the
language to take multiple passes to completely generate a stream that
can be parsed.
This vague notion can be formalised like so. A preprocessing unit is
either a singular token or a named collection of units. The former
represents your standard symbols and literals while the later
represents ~%const~ and ~%use~ calls where there is a clear name
associated to a collection of one or more tokens (in the case of the
former it's the constant's name and the latter it's the filename).
We'll distinguish this as well.
#+begin_src text
Token = PP_USE | PP_CONST | String(Content) | Symbol(Content) | PUSH(Content) | ...
Type = File(String) | Constant(Symbol)
Unit = Token | Container(Type . Vector[Unit])
#+end_src
Through this model our initial stream of tokens can be considered
units. We can already see that this model may solve our original
problem: with named containers it doesn't matter that certain tokens
are from different parts of the file or different files as they are
distinctly typed from the general set of tokens, with a name which
states where they're from.
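One purely illustrative rendering of this grammar is a C tagged
union; none of the names below exist in the codebase, and ~token_t~
stands in for the lexer's token type.
#+begin_src c
/* Hypothetical tagged union mirroring: Unit = Token | Container. */
#include <stddef.h>

typedef struct token token_t; /* as produced by the lexer */

typedef enum
{
  CONTAINER_FILE,     /* File(String), from %use */
  CONTAINER_CONSTANT, /* Constant(Symbol), from %const */
} container_type_t;

typedef struct unit
{
  enum
  {
    UNIT_TOKEN,
    UNIT_CONTAINER,
  } kind;
  union
  {
    token_t *token; /* UNIT_TOKEN: a singular token */
    struct
    {
      container_type_t type;
      const char *name;      /* filename or constant name */
      struct unit *elements; /* recursive vector of units */
      size_t count;
    } container; /* UNIT_CONTAINER: a named collection */
  };
} unit_t;
#+end_src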
** Processing
We need this model to have a notion of "processing" though, otherwise
it's quite useless. A processing function is simply a function which
takes a unit and returns another unit. We currently have two
processing functions we can consider: ~process_const~ and
~process_use~.
~process_use~ takes a vector of tokens and, upon encountering PP_USE
accepts the next token (a string) and tokenises the file
with that name. Within our model we'd make the stream of tokens
created from opening the file a /container/.
~process_const~ takes a vector of tokens and does two things in an
iteration:
1) upon encountering PP_CONST accepts the next n tokens till PP_END is
encountered, with the first token being a symbol. This is
registered in a map of constants (~CONSTS~) where the symbol is the
key and the value associated is the n - 1 tokens accepted
2) upon encountering a PP_REFERENCE reads the content associated with
it (considered a symbol ~S~) and replaces it ~CONSTS[S]~ (if S is
in CONSTS).
One thing to note is that both of these definitions are easily
extensible to the general definition of units: if a unit is a
container of some kind we can recur through its vector of units to
resolve any further "calls". For ~process_const~ it's ~%const~ or
~$ref~ while for ~process_use~ it's ~%use~.
** History/versioning
One additional facet to this model I'd like to add is "history". Each
unit is actually a list (or a singly linked tree where each parent has
at most one child) of sub-units where the top of the list represents
the current version. Each descendant is a previous version of the
token.
Say I do some processing on an element of the unit list =a= (with
index =i=) such that it becomes a new "unit", call it =b=. Then we
update =V= by =V[i] = cons(b, a)=. Through this, the list acts as a
history of processing that has occurred on the unit. This provides an
ability to trace the path of preprocessing to an eventual conclusion.
Processing occurs on a unit until it cannot be done further i.e. when
there are no more "calls" in the tree to resolve. The history list
provides all the versions of a unit till its resolved form.
To see what a unit with history may look like (where symbols are
terminals i.e. completely resolved):
+ Container('limit' . [a Container("b" . d e f) c])
+ Container('limit' . [a '$b' c])
+ Token(PP_REF('$limit'))
This shows resolution of the unit reference ~$limit~, which in turn
leads to the resolution of ~$b~ which is a sub-unit.
There are two ways of indefinite resolution, one per method of
processing. For ~process_use~ it is two files calling ~%use~ on each
other and for ~process_const~ it is a ~%const~ calling itself. We can
just disallow it through analysis.
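Illustratively, with the hypothetical ~unit_t~ from the earlier
sketch, such a history is a singly linked chain of versions where
=V[i] = cons(b, a)= prepends a new head:
#+begin_src c
#include <stdlib.h>

/* Hypothetical history chain: the head is the current version, each
 * `prev` link an older one, NULL marking the original unit. */
typedef struct unit_version
{
  unit_t current;
  struct unit_version *prev;
} unit_version_t;

/* V[i] = cons(b, a): make `b` the new head, remembering `a`. */
unit_version_t *cons_version(unit_t b, unit_version_t *a)
{
  unit_version_t *head = malloc(sizeof(*head));
  head->current        = b;
  head->prev           = a;
  return head;
}
#+end_src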
** Pseudocode
#+begin_src text
process_use(V: Vector[Unit]) ->
[cons((if v is Token(PP_USE) and next(v) is Token(String(S))
-> Container(File(S) . tokenise(open(v')))
else if v is Container(name . units)
-> Container(name . process_use(units))
else
-> v),
v_x)
v = v_x[0]
for v_x in V]
CONSTS={}
process_const(V: Vector[Unit]) ->
[cons((if v is Token(PP_CONST) and next(v) is Token(Symbol(S))
do {
i := find(Token(PP_END), V[v:])
CONSTS[S] = V[next(v):prev(i)]
-> Container(Constant(S) . CONSTS[S])
}
else if v is Token(PP_REF(S))
-> CONSTS[S]
else if v is Container(name . units)
-> Container(name . process_const(units))
else
-> v)
v_x)
v = v_x[0]
for v_x in V]
#+end_src
 * TODO Introduce error handling in base library :LIB:
 There is a large variety of TODOs about errors. Let's fix them!
 8 TODOs currently present.
-* TODO Standard library :ASM:VM:
+* TODO Standard library :VM:
 I should start considering this and how a user may use it. Should it
 be an option in the VM and/or assembler binaries (i.e. a flag) or
 something the user has to specify in their source files?