diff --git a/.dir-locals.el b/.dir-locals.el index 1e8b181..2bbc275 100644 --- a/.dir-locals.el +++ b/.dir-locals.el @@ -7,8 +7,4 @@ (c-mode . ((flycheck-gcc-include-path . ("..")) (flycheck-clang-include-path . ("..")) (company-clang-arguments . ("-I..")) - (eval . (clang-format-mode t)))) - (c++-mode . ((flycheck-gcc-include-path . ("..")) - (flycheck-clang-include-path . ("..")) - (company-clang-arguments . ("-I..")) - (eval . (clang-format-mode t))))) + (eval . (clang-format-mode t))))) diff --git a/Makefile b/Makefile index de1dcd8..9469826 100644 --- a/Makefile +++ b/Makefile @@ -1,12 +1,10 @@ CC=gcc -CPP=g++ - VERBOSE=0 -GENERAL-FLAGS=-Wall -Wextra -Werror -Wswitch-enum -I$(shell pwd) +GENERAL-FLAGS:=-Wall -Wextra -Werror -Wswitch-enum -I$(shell pwd) DEBUG-FLAGS=-ggdb -fsanitize=address RELEASE-FLAGS=-O3 + CFLAGS:=$(GENERAL-FLAGS) -std=c11 $(DEBUG-FLAGS) -D VERBOSE=$(VERBOSE) -CPPFLAGS:=$(GENERAL-FLAGS) -std=c++17 $(DEBUG-FLAGS) -D VERBOSE=$(VERBOSE) LIBS=-lm DIST=build @@ -20,102 +18,50 @@ LIB_DIST=$(DIST)/lib LIB_SRC=lib LIB_CODE:=$(addprefix $(LIB_SRC)/, base.c darr.c heap.c inst.c) LIB_OBJECTS:=$(LIB_CODE:$(LIB_SRC)/%.c=$(LIB_DIST)/%.o) -LIB_CFLAGS=$(CFLAGS) ## VM setup VM_DIST=$(DIST)/vm VM_SRC=vm VM_CODE:=$(addprefix $(VM_SRC)/, runtime.c) VM_OBJECTS:=$(VM_CODE:$(VM_SRC)/%.c=$(VM_DIST)/%.o) -VM_CFLAGS:=$(CFLAGS) -VM_OUT=$(DIST)/ovm.out - -## ASSEMBLY setup -ASM_DIST=$(DIST)/asm -ASM_SRC=asm -ASM_CODE:=$(addprefix $(ASM_SRC)/, base.cpp lexer.cpp preprocesser.cpp) -ASM_OBJECTS:=$(ASM_CODE:$(ASM_SRC)/%.cpp=$(ASM_DIST)/%.o) -ASM_CFLAGS=$(CPPFLAGS) -ASM_OUT=$(DIST)/asm.out - -## EXAMPLES setup -EXAMPLES_DIST=$(DIST)/examples -EXAMPLES_SRC=examples -EXAMPLES=$(EXAMPLES_DIST)/instruction-test.out $(EXAMPLES_DIST)/fib.out $(EXAMPLES_DIST)/factorial.out $(EXAMPLES_DIST)/memory-print.out +VM_OUT=$(DIST)/avm.out ## Dependencies DEPDIR:=$(DIST)/dependencies DEPFLAGS = -MT $@ -MMD -MP -MF -DEPS:=$($(LIB_SRC):%.c=$(DEPDIR):%.o) $($(ASM_SRC):%.c=$(DEPDIR):%.o) 
$($(VM_SRC):%.c=$(DEPDIR):%.o) +DEPS:=$($(LIB_SRC):%.c=$(DEPDIR):%.o) $($(VM_SRC):%.c=$(DEPDIR):%.o) # Things you want to build on `make` -all: $(DIST) lib vm asm examples +all: $(DIST) lib vm lib: $(LIB_OBJECTS) vm: $(VM_OUT) -asm: $(ASM_OUT) -examples: $(EXAMPLES) # Recipes -## LIB Recipes - $(LIB_DIST)/%.o: $(LIB_SRC)/%.c | $(LIB_DIST) $(DEPDIR)/lib - @$(CC) $(LIB_CFLAGS) $(DEPFLAGS) $(DEPDIR)/lib/$*.d -c $< -o $@ $(LIBS) + @$(CC) $(CFLAGS) $(DEPFLAGS) $(DEPDIR)/lib/$*.d -c $< -o $@ $(LIBS) @echo "$(TERM_YELLOW)$@$(TERM_RESET): $<" -## VM Recipes $(VM_OUT): $(LIB_OBJECTS) $(VM_OBJECTS) $(VM_DIST)/main.o - @$(CC) $(VM_CFLAGS) $^ -o $@ $(LIBS) + @$(CC) $(CFLAGS) $^ -o $@ $(LIBS) @echo "$(TERM_GREEN)$@$(TERM_RESET): $^" $(VM_DIST)/%.o: $(VM_SRC)/%.c | $(VM_DIST) $(DEPDIR)/vm - @$(CC) $(VM_CFLAGS) $(DEPFLAGS) $(DEPDIR)/vm/$*.d -c $< -o $@ $(LIBS) + @$(CC) $(CFLAGS) $(DEPFLAGS) $(DEPDIR)/vm/$*.d -c $< -o $@ $(LIBS) @echo "$(TERM_YELLOW)$@$(TERM_RESET): $<" -## ASSEMBLY Recipes -$(ASM_OUT): $(LIB_OBJECTS) $(ASM_OBJECTS) $(ASM_DIST)/main.o - @$(CPP) $(ASM_CFLAGS) $^ -o $@ $(LIBS) - @echo "$(TERM_GREEN)$@$(TERM_RESET): $^" - -$(ASM_DIST)/%.o: $(ASM_SRC)/%.cpp | $(ASM_DIST) $(DEPDIR)/asm - @$(CPP) $(ASM_CFLAGS) $(DEPFLAGS) $(DEPDIR)/asm/$*.d -c $< -o $@ $(LIBS) - @echo "$(TERM_YELLOW)$@$(TERM_RESET): $<" - -## EXAMPLES recipes -$(EXAMPLES_DIST)/%.out: $(EXAMPLES_SRC)/%.asm $(ASM_OUT) | $(EXAMPLES_DIST) - @$(ASM_OUT) $< $@ - @echo "$(TERM_GREEN)$@$(TERM_RESET): $<" - -.PHONY: run-examples -run-examples: $(EXAMPLES) - @$(foreach example,$(EXAMPLES), echo "$(TERM_YELLOW)$(example)$(TERM_RESET)"; $(MAKE) -s interpret BYTECODE=$(example);) - -OUT= -ARGS= - .PHONY: run -run: $(DIST)/$(OUT) +run: $(VM_OUT) ./$^ $(ARGS) .PHONY: clean clean: rm -rfv $(DIST)/* -SOURCE= -BYTECODE= -.PHONY: assemble -assemble: $(ASM_OUT) - @$(ASM_OUT) $(SOURCE) $(BYTECODE) - .PHONY: interpret interpret: $(VM_OUT) @$(VM_OUT) $(BYTECODE) -.PHONY: exec -exec: $(ASM_OUT) $(VM_OUT) - @$(ASM_OUT) 
$(SOURCE) $(BYTECODE) - @$(VM_OUT) $(BYTECODE) - # Directories $(DIST): mkdir -p $@ @@ -126,18 +72,9 @@ $(LIB_DIST): $(VM_DIST): mkdir -p $@ -$(ASM_DIST): - mkdir -p $@ - -$(EXAMPLES_DIST): - mkdir -p $@ - $(DEPDIR)/lib: mkdir -p $@ -$(DEPDIR)/asm: - mkdir -p $@ - $(DEPDIR)/vm: mkdir -p $@ diff --git a/README.org b/README.org index 68642f7..1f6b053 100644 --- a/README.org +++ b/README.org @@ -1,4 +1,4 @@ -#+title: Oreo's Virtual Machine (OVM) +#+title: Aryadev's Virtual Machine (AVM) #+author: Aryadev Chavali #+date: 2023-10-15 @@ -6,18 +6,14 @@ A stack based virtual machine in C11, with a dynamic register setup which acts as variable space. Deals primarily in bytes, doesn't make assertions about typing and is very simple to target. -2024-04-16: Project will now be split into two components -1) The runtime + base library -2) The assembler +This repository contains both a library ([[file:lib/][lib folder]]) to +(de)serialize bytecode and a program ([[file:vm/][vm folder]]) to +execute bytecode. -This will focus each repository on separate issues and make it easier -to organize. They will both derive from the same repositories -i.e. I'm not making fresh repositories and just sticking the folders -in but rather branching this repository into two different versions. +Along with this is an +[[https://github.com/aryadev-software/aal][assembler]] program which +can compile an assembly-like language to bytecode. -The two versions will be hosted at: -1) [[https://github.com/aryadev-software/avm]] -1) [[https://github.com/aryadev-software/aal]] * How to build Requires =GNU make= and a compliant C11 compiler. Code base has been tested against =gcc= and =clang=, but given how the project has been @@ -26,85 +22,70 @@ issue to compile using something like =tcc= or another compiler (look at [[file:Makefile::CC=gcc][here]] to change the compiler). To build everything simply run ~make~. 
This will build: -+ [[file:lib/inst.c][instruction bytecode system]] which provides - object files to target the VM -+ [[file:vm/main.c][VM executable]] which executes bytecode -+ [[file:asm/main.c][Assembler executable]] which assembles compliant - assembly code to VM bytecode -+ [[file:examples/][Assembly examples]] which provide some source code - examples on common programs one may write. Use this to figure out - how to write compliant assembly. Also a good test of both the VM - and assembler. ++ [[file:lib/][instruction bytecode system]] which provides object + files to target the VM ++ [[file:vm/][VM executable]] which executes bytecode You may also build each component individually through the corresponding recipe: + ~make lib~ + ~make vm~ -+ ~make asm~ -+ ~make examples~ -* Instructions to target the virtual machine -You need to link with the object files for -[[file:lib/base.c][base.c]], [[file:lib/darr.c][darr.c]] and -[[file:lib/inst.c][inst.c]] to be able to properly target the OVM. -The basic idea is to create some instructions via ~inst_t~, -instantiating a ~prog_t~ structure which wraps those instructions -(includes a header and other useful things for the runtime), then -using ~prog_write_file~ to serialise and write bytecode to a file -pointer. +* How to target the virtual machine +Link with the object files for [[file:lib/base.c][base.c]] and +[[file:lib/inst.c][inst.c]] to be able to properly target the virtual +machine. The general idea is to convert parse units into instances of +~inst_t~. Once a collection of ~inst_t~'s have been made, they must +be wrapped in a ~prog_t~ structure which is a flexibly allocated +structure with two components: +1) A program header ~prog_header_t~ with some essential properties of + the program (start address, count, etc) +2) A buffer of type ~inst_t~ which should contain the ordered + collection constructed -To execute directly compiled bytecode use the ~ovm.out~ executable on -the bytecode file. 
+There are two ways to utilise this program structure: +compilation or in memory execution. +** Compilation +The ~prog_t~ structure can be fed to ~prog_write_file~ with a file +pointer to write well formed =AVM= bytecode into a file. To execute +this bytecode, simply use the ~avm.out~ executable with the bytecode +file name. -For clarity, one may build ~lib~ (~make lib~) then use the resulting -object files to link and create bytecode for the virtual machine. +This is the classical way I expect languages to target the virtual +machine. ** In memory virtual machine -Instead of serialising and writing bytecode to a file, one may instead -serialise bytecode in memory using ~prog_write_bytecode~ which writes -bytecode to a dynamic byte buffer, so called *in memory compilation*. -To execute this bytecode, deserialise the bytecode into a program then -load it into a complete ~vm_t~ structure (linking with -[[file:vm/runtime.c][runtime.c]]). +This method requires linking with [[file:vm/runtime.c]] to be able to +construct a working ~vm_t~ structure. The steps are: ++ Load the stack, heap and call stack into a ~vm_t~ structure ++ Load the ~prog_t~ into the ~vm_t~ (~vm_load_program~) ++ Execute via ~vm_execute~ or ~vm_execute_all~ -In fact, you may skip the process of serialising entirely. You can -emit a ~prog_t~ structure corresponding to source code, load it -directly into the ~vm_t~ structure, then execute. To do so is a bit -involved, so I recommend looking at [[file:vm/main.c]]. In rough -steps: -+ Create a virtual machine "from scratch" (load the necessary - components (the stack, heap and call stack) by hand) -+ Load program into VM (~vm_load_program~) -+ Run ~vm_execute_all~ +~vm_execute~ executes the next instruction and stops, while +~vm_execute_all~ continues execution till the program halts. Either +can be useful depending on requirements. 
-This is recommended if writing an interpreted language such as a Lisp, -where on demand execution of code is more suitable. +I expect this method to be used for languages that are /interpreted/ +such as Lisp or Python where /code/ -> /execution/ rather than /code/ +-> /compile unit/ -> /execute unit/, while still providing the ability +to compile code to a byte code unit. * Lines of code #+begin_src sh :results table :exports results wc -lwc $(find -regex ".*\.[ch]\(pp\)?") #+end_src #+RESULTS: -| Files | Lines | Words | Bytes | -|------------------------+-------+-------+--------| -| ./lib/heap.h | 42 | 111 | 801 | -| ./lib/inst.c | 516 | 1315 | 13982 | -| ./lib/darr.c | 77 | 225 | 1757 | -| ./lib/base.c | 107 | 306 | 2002 | -| ./lib/inst.h | 108 | 426 | 4067 | -| ./lib/prog.h | 176 | 247 | 2616 | -| ./lib/base.h | 148 | 626 | 3915 | -| ./lib/darr.h | 88 | 465 | 2697 | -| ./lib/heap.c | 101 | 270 | 1910 | -| ./vm/runtime.h | 301 | 780 | 7965 | -| ./vm/runtime.c | 1070 | 3097 | 30010 | -| ./vm/main.c | 92 | 265 | 2243 | -| ./asm/base.hpp | 21 | 68 | 472 | -| ./asm/lexer.cpp | 565 | 1448 | 14067 | -| ./asm/base.cpp | 33 | 89 | 705 | -| ./asm/parser.hpp | 82 | 199 | 1656 | -| ./asm/parser.cpp | 42 | 129 | 1294 | -| ./asm/lexer.hpp | 106 | 204 | 1757 | -| ./asm/preprocesser.cpp | 218 | 574 | 5800 | -| ./asm/preprocesser.hpp | 62 | 147 | 1360 | -| ./asm/main.cpp | 148 | 414 | 3791 | -|------------------------+-------+-------+--------| -| total | 4103 | 11405 | 104867 | +| Files | Lines | Words | Bytes | +|----------------+-------+-------+-------| +| ./lib/heap.h | 42 | 111 | 801 | +| ./lib/inst.c | 512 | 1303 | 13936 | +| ./lib/darr.c | 77 | 225 | 1757 | +| ./lib/base.c | 107 | 306 | 2002 | +| ./lib/inst.h | 108 | 426 | 4067 | +| ./lib/prog.h | 176 | 247 | 2616 | +| ./lib/base.h | 148 | 626 | 3915 | +| ./lib/darr.h | 88 | 465 | 2697 | +| ./lib/heap.c | 101 | 270 | 1910 | +| ./vm/runtime.h | 301 | 780 | 7965 | +| ./vm/runtime.c | 1070 | 3097 | 30010 | +| ./vm/main.c | 
92 | 265 | 2243 | +|----------------+-------+-------+-------| +| total | 2822 | 8121 | 73919 | diff --git a/asm/base.cpp b/asm/base.cpp deleted file mode 100644 index 10cfd3d..0000000 --- a/asm/base.cpp +++ /dev/null @@ -1,33 +0,0 @@ -/* Copyright (C) 2024 Aryadev Chavali - - * You may distribute and modify this code under the terms of the - * GPLv2 license. You should have received a copy of the GPLv2 - * license with this file. If not, please write to: - * aryadev@aryadevchavali.com. - - * Created: 2024-04-14 - * Author: Aryadev Chavali - * Description: - */ - -#include "./base.hpp" - -#include - -std::optional read_file(const char *filename) -{ - FILE *fp = fopen(filename, "rb"); - if (fp) - { - std::string contents; - fseek(fp, 0, SEEK_END); - contents.resize(ftell(fp)); - rewind(fp); - fread(&contents[0], 1, contents.size(), fp); - fclose(fp); - - return contents; - } - else - return std::nullopt; -} diff --git a/asm/base.hpp b/asm/base.hpp deleted file mode 100644 index f55e163..0000000 --- a/asm/base.hpp +++ /dev/null @@ -1,21 +0,0 @@ -/* Copyright (C) 2024 Aryadev Chavali - - * You may distribute and modify this code under the terms of the - * GPLv2 license. You should have received a copy of the GPLv2 - * license with this file. If not, please write to: - * aryadev@aryadevchavali.com. - - * Created: 2024-04-14 - * Author: Aryadev Chavali - * Description: Base library - */ - -#ifndef BASE_HPP -#define BASE_HPP - -#include -#include - -std::optional read_file(const char *); - -#endif diff --git a/asm/lexer.cpp b/asm/lexer.cpp deleted file mode 100644 index 2cb03ad..0000000 --- a/asm/lexer.cpp +++ /dev/null @@ -1,565 +0,0 @@ -/* Copyright (C) 2024 Aryadev Chavali - - * You may distribute and modify this code under the terms of the - * GPLv2 license. You should have received a copy of the GPLv2 - * license with this file. If not, please write to: - * aryadev@aryadevchavali.com. 
- - * Created: 2024-04-14 - * Author: Aryadev Chavali - * Description: Lexer for assembly language - */ - -extern "C" -{ -#include -} - -#include -#include - -#include "./lexer.hpp" - -static_assert(NUMBER_OF_OPCODES == 98, "ERROR: Lexer is out of date"); - -using std::string, std::string_view, std::pair, std::make_pair; - -const auto VALID_SYMBOL = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUV" - "WXYZ0123456789-_.:%#$", - VALID_DIGIT = "0123456789", VALID_HEX = "0123456789abcdefABCDEF"; - -bool is_char_in_s(char c, const char *s) -{ - return string_view(s).find(c) != string::npos; -} - -bool initial_match(string_view src, string_view match) -{ - return (src.size() > match.size() && src.substr(0, match.size()) == match); -} - -pair tokenise_symbol(string_view &source, size_t &column, - size_t line) -{ - auto end = source.find_first_not_of(VALID_SYMBOL); - if (end == string::npos) - end = source.size() - 1; - string sym{source.substr(0, end)}; - source.remove_prefix(end); - std::transform(sym.begin(), sym.end(), sym.begin(), ::toupper); - - token_t t{}; - - if (sym == "%CONST") - { - t.type = token_type_t::PP_CONST; - } - else if (sym == "%USE") - { - t.type = token_type_t::PP_USE; - } - else if (sym == "%END") - { - t.type = token_type_t::PP_END; - } - else if (sym[0] == '%') - { - return make_pair( - t, lerr_t(lerr_type_t::INVALID_PREPROCESSOR_DIRECTIVE, column, line)); - } - else if (sym.size() > 1 && sym[0] == '$') - { - t = token_t(token_type_t::PP_REFERENCE, sym.substr(1)); - } - else if (sym == "NOOP") - { - t.type = token_type_t::NOOP; - } - else if (sym == "HALT") - { - t.type = token_type_t::HALT; - } - else if (initial_match(sym, "PUSH.REG.")) - { - t = token_t(token_type_t::PUSH_REG, sym.substr(9)); - } - else if (initial_match(sym, "PUSH.")) - { - t = token_t(token_type_t::PUSH, sym.substr(5)); - } - else if (initial_match(sym, "POP.")) - { - t = token_t(token_type_t::POP, sym.substr(4)); - } - else if (initial_match(sym, "MOV.")) - { - t = 
token_t(token_type_t::MOV, sym.substr(4)); - } - else if (initial_match(sym, "DUP.")) - { - t = token_t(token_type_t::DUP, sym.substr(4)); - } - else if (initial_match(sym, "MALLOC.STACK.")) - { - t = token_t(token_type_t::MALLOC_STACK, sym.substr(13)); - } - else if (initial_match(sym, "MALLOC.")) - { - t = token_t(token_type_t::MALLOC, sym.substr(7)); - } - else if (initial_match(sym, "MSET.STACK.")) - { - t = token_t(token_type_t::MSET_STACK, sym.substr(11)); - } - else if (initial_match(sym, "MSET.")) - { - t = token_t(token_type_t::MSET, sym.substr(5)); - } - else if (initial_match(sym, "MGET.STACK.")) - { - t = token_t(token_type_t::MGET_STACK, sym.substr(11)); - } - else if (initial_match(sym, "MGET.")) - { - t = token_t(token_type_t::MGET, sym.substr(5)); - } - else if (sym == "MDELETE") - { - t.type = token_type_t::MDELETE; - } - else if (sym == "MSIZE") - { - t.type = token_type_t::MSIZE; - } - else if (initial_match(sym, "NOT.")) - { - t = token_t(token_type_t::NOT, sym.substr(4)); - } - else if (initial_match(sym, "OR.")) - { - t = token_t(token_type_t::OR, sym.substr(3)); - } - else if (initial_match(sym, "AND.")) - { - t = token_t(token_type_t::AND, sym.substr(4)); - } - else if (initial_match(sym, "XOR.")) - { - t = token_t(token_type_t::XOR, sym.substr(4)); - } - else if (initial_match(sym, "EQ.")) - { - t = token_t(token_type_t::EQ, sym.substr(3)); - } - else if (initial_match(sym, "LTE.")) - { - t = token_t(token_type_t::LTE, sym.substr(4)); - } - else if (initial_match(sym, "LT.")) - { - t = token_t(token_type_t::LT, sym.substr(3)); - } - else if (initial_match(sym, "GTE.")) - { - t = token_t(token_type_t::GTE, sym.substr(4)); - } - else if (initial_match(sym, "GT.")) - { - t = token_t(token_type_t::GT, sym.substr(3)); - } - else if (initial_match(sym, "SUB.")) - { - t = token_t(token_type_t::SUB, sym.substr(4)); - } - else if (initial_match(sym, "PLUS.")) - { - t = token_t(token_type_t::PLUS, sym.substr(5)); - } - else if (initial_match(sym, 
"MULT.")) - { - t = token_t(token_type_t::MULT, sym.substr(5)); - } - else if (initial_match(sym, "PRINT.")) - { - t = token_t(token_type_t::PRINT, sym.substr(6)); - } - else if (sym == "JUMP.ABS") - { - t.type = token_type_t::JUMP_ABS; - } - else if (sym == "JUMP.STACK") - { - t.type = token_type_t::JUMP_STACK; - } - else if (initial_match(sym, "JUMP.IF.")) - { - t = token_t(token_type_t::JUMP_IF, sym.substr(8)); - } - else if (sym == "CALL.STACK") - { - t.type = token_type_t::CALL_STACK; - } - else if (sym == "CALL") - { - t.type = token_type_t::CALL; - } - else if (sym == "RET") - { - t.type = token_type_t::RET; - } - else if (sym == "GLOBAL") - { - t.type = token_type_t::GLOBAL; - } - else - { - t.type = token_type_t::SYMBOL; - } - - if (t.content == "") - t.content = sym; - t.column = column; - column += sym.size() - 1; - return make_pair(t, lerr_t()); -} - -token_t tokenise_literal_number(string_view &source, size_t &column) -{ - bool is_negative = false; - if (source[0] == '-') - { - is_negative = true; - source.remove_prefix(1); - } - - auto end = source.find_first_not_of(VALID_DIGIT); - if (end == string::npos) - end = source.size() - 1; - string digits{source.substr(0, end)}; - source.remove_prefix(end); - - token_t t{token_type_t::LITERAL_NUMBER, (is_negative ? "-" : "") + digits, - column}; - - column += digits.size() + (is_negative ? 
1 : 0); - - return t; -} - -token_t tokenise_literal_hex(string_view &source, size_t &column) -{ - // Remove x char from source - source.remove_prefix(1); - auto end = source.find_first_not_of(VALID_HEX); - if (end == string::npos) - end = source.size() - 1; - string digits{source.substr(0, end)}; - source.remove_prefix(end); - - token_t t = {token_type_t::LITERAL_NUMBER, "0x" + digits, column}; - - column += digits.size() + 1; - return t; -} - -pair tokenise_literal_char(string_view &source, size_t &column, - size_t &line) -{ - token_t t{}; - auto end = source.find('\'', 1); - if (source.size() < 3 || end == 1 || end > 3) - return make_pair(t, - lerr_t(lerr_type_t::INVALID_CHAR_LITERAL, column, line)); - else if (source[1] == '\\') - { - // Escape sequence - char escape = '\0'; - if (source.size() < 4 || source[3] != '\'') - return make_pair(t, - lerr_t(lerr_type_t::INVALID_CHAR_LITERAL_ESCAPE_SEQUENCE, - column, line)); - switch (source[2]) - { - case 'n': - escape = '\n'; - break; - case 't': - escape = '\t'; - break; - case 'r': - escape = '\r'; - break; - case '\\': - escape = '\\'; - break; - default: - column += 2; - return make_pair(t, - lerr_t(lerr_type_t::INVALID_CHAR_LITERAL_ESCAPE_SEQUENCE, - column, line)); - break; - } - t = token_t{token_type_t::LITERAL_CHAR, std::to_string(escape), column}; - column += 4; - source.remove_prefix(4); - } - else - { - t = token_t(token_type_t::LITERAL_CHAR, std::to_string(source[1])); - column += 3; - source.remove_prefix(3); - } - return make_pair(t, lerr_t()); -} - -token_t tokenise_literal_string(string_view &source, size_t &column, size_t end) -{ - source.remove_prefix(1); - token_t token{token_type_t::LITERAL_STRING, string(source.substr(0, end - 1)), - column}; - source.remove_prefix(end); - column += end + 1; - return token; -} - -lerr_t tokenise_buffer(string_view source, std::vector &tokens) -{ - size_t column = 0, line = 1; - while (source.size() > 0) - { - bool is_token = true; - char first = source[0]; - 
token_t t{}; - if (isspace(first) || first == '\0') - { - size_t i; - for (i = 0; - i < source.size() && (isspace(source[i]) || source[i] == '\0'); ++i) - { - ++column; - if (source[i] == '\n') - { - column = 0; - ++line; - } - } - ++column; - source.remove_prefix(i); - is_token = false; - } - else if (first == ';') - { - size_t i; - for (i = 0; i < source.size() && source[i] != '\n'; ++i) - continue; - column = 0; - ++line; - source.remove_prefix(i + 1); - is_token = false; - } - else if (first == '*') - { - t = token_t(token_type_t::STAR, "", column); - source.remove_prefix(1); - } - else if (first == '\"') - { - auto end = source.find('\"', 1); - if (end == string::npos) - return lerr_t(lerr_type_t::INVALID_STRING_LITERAL, column, line); - t = tokenise_literal_string(source, column, end); - } - else if (first == '\'') - { - lerr_t lerr; - std::tie(t, lerr) = tokenise_literal_char(source, column, line); - if (lerr.type != lerr_type_t::OK) - return lerr; - } - else if (isdigit(first) || - (source.size() > 1 && first == '-' && isdigit(source[1]))) - { - auto end = source.find_first_not_of(VALID_DIGIT, first == '-' ? 
1 : 0); - if (end == string::npos) - end = source.size() - 1; - else if (end != string::npos && !(isspace(source[end]))) - return lerr_t(lerr_type_t::INVALID_NUMBER_LITERAL, column, line); - t = tokenise_literal_number(source, column); - } - else if (first == '0' && source.size() > 2 && source[1] == 'x' && - is_char_in_s(source[2], VALID_HEX)) - { - auto end = source.find_first_not_of(VALID_HEX); - if (end == string::npos) - end = source.size() - 1; - else if (end != string::npos && !(isspace(source[end]))) - return lerr_t(lerr_type_t::INVALID_NUMBER_LITERAL, column, line); - t = tokenise_literal_hex(source, column); - } - else if (is_char_in_s(first, VALID_SYMBOL)) - { - lerr_t lerr; - std::tie(t, lerr) = tokenise_symbol(source, column, line); - if (lerr.type != lerr_type_t::OK) - return lerr; - } - else - { - ++column; - return lerr_t{lerr_type_t::UNKNOWN_LEXEME, column, line}; - } - - if (is_token) - { - t.line = line; - token_t *acc = new token_t(t); - tokens.push_back(acc); - } - } - return lerr_t{}; -} - -std::ostream &operator<<(std::ostream &os, token_t &t) -{ - return os << token_type_as_cstr(t.type) << "(`" << t.content << "`)@" - << t.line << ", " << t.column; -} - -token_t::token_t() -{} - -token_t::token_t(token_type_t type, string content, size_t col, size_t line) - : type{type}, column{col}, line{line}, content{content} -{} - -const char *token_type_as_cstr(token_type_t type) -{ - switch (type) - { - case token_type_t::PP_USE: - return "PP_USE"; - case token_type_t::PP_CONST: - return "PP_CONST"; - case token_type_t::PP_END: - return "PP_END"; - case token_type_t::PP_REFERENCE: - return "PP_REFERENCE"; - case token_type_t::GLOBAL: - return "GLOBAL"; - case token_type_t::STAR: - return "STAR"; - case token_type_t::LITERAL_STRING: - return "LITERAL_STRING"; - case token_type_t::LITERAL_NUMBER: - return "LITERAL_NUMBER"; - case token_type_t::LITERAL_CHAR: - return "LITERAL_CHAR"; - case token_type_t::NOOP: - return "NOOP"; - case token_type_t::HALT: - 
return "HALT"; - case token_type_t::PUSH: - return "PUSH"; - case token_type_t::POP: - return "POP"; - case token_type_t::PUSH_REG: - return "PUSH_REG"; - case token_type_t::MOV: - return "MOV"; - case token_type_t::DUP: - return "DUP"; - case token_type_t::MALLOC: - return "MALLOC"; - case token_type_t::MALLOC_STACK: - return "MALLOC_STACK"; - case token_type_t::MSET: - return "MSET"; - case token_type_t::MSET_STACK: - return "MSET_STACK"; - case token_type_t::MGET: - return "MGET"; - case token_type_t::MGET_STACK: - return "MGET_STACK"; - case token_type_t::MDELETE: - return "MDELETE"; - case token_type_t::MSIZE: - return "MSIZE"; - case token_type_t::NOT: - return "NOT"; - case token_type_t::OR: - return "OR"; - case token_type_t::AND: - return "AND"; - case token_type_t::XOR: - return "XOR"; - case token_type_t::EQ: - return "EQ"; - case token_type_t::LT: - return "LT"; - case token_type_t::LTE: - return "LTE"; - case token_type_t::GT: - return "GT"; - case token_type_t::GTE: - return "GTE"; - case token_type_t::PLUS: - return "PLUS"; - case token_type_t::SUB: - return "SUB"; - case token_type_t::MULT: - return "MULT"; - case token_type_t::PRINT: - return "PRINT"; - case token_type_t::JUMP_ABS: - return "JUMP_ABS"; - case token_type_t::JUMP_STACK: - return "JUMP_STACK"; - case token_type_t::JUMP_IF: - return "JUMP_IF"; - case token_type_t::CALL: - return "CALL"; - case token_type_t::CALL_STACK: - return "CALL_STACK"; - case token_type_t::RET: - return "RET"; - case token_type_t::SYMBOL: - return "SYMBOL"; - } - return ""; -} - -std::ostream &operator<<(std::ostream &os, lerr_t &lerr) -{ - os << lerr.line << ":" << lerr.col << ": "; - switch (lerr.type) - { - case lerr_type_t::OK: - os << "OK"; - break; - case lerr_type_t::INVALID_CHAR_LITERAL: - os << "INVALID_CHAR_LITERAL"; - break; - case lerr_type_t::INVALID_CHAR_LITERAL_ESCAPE_SEQUENCE: - os << "INVALID_CHAR_LITERAL_ESCAPE_SEQUENCE"; - break; - case lerr_type_t::INVALID_STRING_LITERAL: - os << 
"INVALID_STRING_LITERAL"; - break; - case lerr_type_t::INVALID_NUMBER_LITERAL: - os << "INVALID_NUMBER_LITERAL"; - break; - case lerr_type_t::INVALID_PREPROCESSOR_DIRECTIVE: - os << "INVALID_PREPROCESSOR_DIRECTIVE"; - break; - case lerr_type_t::UNKNOWN_LEXEME: - os << "UNKNOWN_LEXEME"; - break; - default: - break; - } - return os; -} - -lerr_t::lerr_t(lerr_type_t type, size_t col, size_t line) - : col{col}, line{line}, type{type} -{} diff --git a/asm/lexer.hpp b/asm/lexer.hpp deleted file mode 100644 index 4c4889c..0000000 --- a/asm/lexer.hpp +++ /dev/null @@ -1,106 +0,0 @@ -/* Copyright (C) 2024 Aryadev Chavali - - * You may distribute and modify this code under the terms of the - * GPLv2 license. You should have received a copy of the GPLv2 - * license with this file. If not, please write to: - * aryadev@aryadevchavali.com. - - * Created: 2024-04-14 - * Author: Aryadev Chavali - * Description: Lexer for assembly language - */ - -#ifndef LEXER_HPP -#define LEXER_HPP - -#include -#include -#include -#include - -enum class token_type_t -{ - PP_CONST, // %const()... 
- PP_USE, // %use - PP_END, // %end - PP_REFERENCE, // $ - GLOBAL, - STAR, - LITERAL_NUMBER, - LITERAL_CHAR, - LITERAL_STRING, - NOOP, - HALT, - PUSH, - POP, - PUSH_REG, - MOV, - DUP, - MALLOC, - MALLOC_STACK, - MSET, - MSET_STACK, - MGET, - MGET_STACK, - MDELETE, - MSIZE, - NOT, - OR, - AND, - XOR, - EQ, - LT, - LTE, - GT, - GTE, - PLUS, - SUB, - MULT, - PRINT, - JUMP_ABS, - JUMP_STACK, - JUMP_IF, - CALL, - CALL_STACK, - RET, - SYMBOL, -}; - -const char *token_type_as_cstr(token_type_t type); - -struct token_t -{ - token_type_t type; - size_t column, line; - std::string content; - - token_t(); - token_t(token_type_t, std::string, size_t col = 0, size_t line = 0); -}; - -std::ostream &operator<<(std::ostream &, token_t &); - -enum class lerr_type_t -{ - OK = 0, - INVALID_CHAR_LITERAL, - INVALID_CHAR_LITERAL_ESCAPE_SEQUENCE, - INVALID_STRING_LITERAL, - INVALID_NUMBER_LITERAL, - INVALID_PREPROCESSOR_DIRECTIVE, - UNKNOWN_LEXEME, -}; - -struct lerr_t -{ - size_t col, line; - lerr_type_t type; - - lerr_t(lerr_type_t type = lerr_type_t::OK, size_t col = 0, size_t line = 0); -}; - -std::ostream &operator<<(std::ostream &, lerr_t &); - -lerr_t tokenise_buffer(std::string_view, std::vector &); - -#endif diff --git a/asm/main.cpp b/asm/main.cpp deleted file mode 100644 index 309bcb6..0000000 --- a/asm/main.cpp +++ /dev/null @@ -1,148 +0,0 @@ -/* Copyright (C) 2024 Aryadev Chavali - - * You may distribute and modify this code under the terms of the - * GPLv2 license. You should have received a copy of the GPLv2 - * license with this file. If not, please write to: - * aryadev@aryadevchavali.com. 
- - * Created: 2024-04-14 - * Author: Aryadev Chavali - * Description: Entrypoint for assembly program - */ - -#include -#include -#include -#include -#include -#include -#include - -extern "C" -{ -#include -} - -#include "./base.hpp" -#include "./lexer.hpp" -#include "./preprocesser.hpp" - -using std::cout, std::cerr, std::endl; -using std::pair, std::string, std::string_view, std::vector; - -void usage(const char *program_name, FILE *fp) -{ - fprintf(fp, - "Usage: %s FILE OUT-FILE\n" - "\tFILE: Source code to compile\n" - "\tOUT-FILE: Name of file to store bytecode\n", - program_name); -} - -int main(int argc, const char *argv[]) -{ - if (argc == 1 || argc > 3) - { - usage(argv[0], stderr); - return -1; - } - int ret = 0; - const char *source_name = argv[1]; - const char *out_name = argv[2]; - (void)out_name; - -#if VERBOSE >= 1 - printf("[%sASSEMBLER%s]: Assembling `%s` to `%s`\n", TERM_YELLOW, TERM_RESET, - source_name, out_name); -#endif - - auto file_source = read_file(source_name); - -#if VERBOSE >= 1 - printf("[%sASSEMBLER%s]: `%s` -> %lu bytes\n", TERM_YELLOW, TERM_RESET, - source_name, file_source.has_value() ? file_source.value().size() : 0); -#endif - - string source_str; - string_view original; - string_view src; - vector tokens, preprocessed_tokens; - lerr_t lerr; - pp_err_t pp_err; - - // Highest scoped variable cut off point - - if (file_source.has_value()) - source_str = file_source.value(); - else - { - cerr << "ERROR: file `" << source_name << "` does not exist!" 
<< endl; - ret = -1; - goto end; - } - original = string_view{source_str}; - src = string_view{source_str}; - lerr = tokenise_buffer(src, tokens); - - if (lerr.type != lerr_type_t::OK) - { - cerr << source_name << ":" << lerr << endl; - ret = 255 - static_cast(lerr.type); - goto end; - } - else - { - -#if VERBOSE >= 1 - printf("[%sLEXER%s]: %lu bytes -> %lu tokens\n", TERM_GREEN, TERM_RESET, - source_str.size(), tokens.size()); -#endif - -#if VERBOSE == 2 - printf("[%sLEXER%s]: Tokens " - "parsed:\n----------------------------------------------------------" - "----------------------\n", - TERM_GREEN, TERM_RESET); - for (auto token : tokens) - cout << "\t" << *token << endl; - printf("-------------------------------------------------------------" - "-------------------\n"); -#endif - } - - // preprocessing - pp_err = preprocesser(tokens, preprocessed_tokens); - if (pp_err.type != pp_err_type_t::OK) - { - cerr << source_name << ":" << pp_err.reference->line << ":" - << pp_err.reference->column << ": " << pp_err << endl; - ret = 255 - static_cast(pp_err.type); - goto end; - } - else - { - -#if VERBOSE >= 1 - printf("[%sPREPROCESSOR%s]: %lu tokens -> %lu tokens\n", TERM_GREEN, - TERM_RESET, tokens.size(), preprocessed_tokens.size()); -#endif -#if VERBOSE == 2 - printf("[%sPREPROCESSOR%s]: Processed tokens: " - "\n-----------------------------------------------------------------" - "---------------\n", - TERM_GREEN, TERM_RESET); - for (auto token : preprocessed_tokens) - cout << "\t" << *token << endl; - printf("-------------------------------------------------------------" - "-------------------\n"); -#endif - } - -end: - for (auto token : tokens) - delete token; - for (auto token : preprocessed_tokens) - delete token; - - return ret; -} diff --git a/asm/preprocesser.cpp b/asm/preprocesser.cpp deleted file mode 100644 index 7f52e40..0000000 --- a/asm/preprocesser.cpp +++ /dev/null @@ -1,218 +0,0 @@ -/* Copyright (C) 2024 Aryadev Chavali - - * You may distribute and 
modify this code under the terms of the - * GPLv2 license. You should have received a copy of the GPLv2 - * license with this file. If not, please write to: - * aryadev@aryadevchavali.com. - - * Created: 2024-04-14 - * Author: Aryadev Chavali - * Description: Preprocessor which occurs after lexing before parsing. - */ - -#include "./preprocesser.hpp" -#include "./base.hpp" - -#include -#include - -using std::pair, std::vector, std::make_pair, std::string, std::string_view; - -#define VCLEAR(V) \ - std::for_each((V).begin(), (V).end(), \ - [](token_t *t) \ - { \ - delete t; \ - }); - -pp_err_t preprocess_use_blocks(const vector &tokens, - vector &vec_out) -{ - for (size_t i = 0; i < tokens.size(); ++i) - { - token_t *t = tokens[i]; - if (t->type == token_type_t::PP_USE) - { - if (i + 1 >= tokens.size() || - tokens[i + 1]->type != token_type_t::LITERAL_STRING) - { - VCLEAR(vec_out); - vec_out.clear(); - return pp_err_t(pp_err_type_t::EXPECTED_STRING, t); - } - - token_t *name = tokens[i + 1]; - auto source = read_file(name->content.c_str()); - if (!source) - { - VCLEAR(vec_out); - vec_out.clear(); - return pp_err_t(pp_err_type_t::FILE_NONEXISTENT, name); - } - - std::vector ftokens; - lerr_t lerr = tokenise_buffer(source.value(), ftokens); - if (lerr.type != lerr_type_t::OK) - { - VCLEAR(vec_out); - vec_out.clear(); - return pp_err_t(pp_err_type_t::FILE_PARSE_ERROR, name, lerr); - } - - vec_out.insert(vec_out.end(), ftokens.begin(), ftokens.end()); - - ++i; - } - else - vec_out.push_back(new token_t{*t}); - } - return pp_err_t(); -} - -struct const_t -{ - size_t start, end; -}; - -pp_err_t preprocess_const_blocks(const vector &tokens, - vector &vec_out) -{ - std::unordered_map blocks; - for (size_t i = 0; i < tokens.size(); ++i) - { - token_t *t = tokens[i]; - if (t->type == token_type_t::PP_CONST) - { - string_view capture; - if (i + 1 >= tokens.size() || tokens[i + 1]->type != token_type_t::SYMBOL) - return pp_err_type_t::EXPECTED_NAME; - - capture = 
tokens[++i]->content; - - ++i; - size_t block_start = i, block_end = 0; - for (; i < tokens.size() && tokens[i]->type != token_type_t::PP_END; ++i) - continue; - - if (i == tokens.size()) - return pp_err_t{pp_err_type_t::EXPECTED_END}; - - block_end = i; - - blocks[capture] = const_t{block_start, block_end}; - } - } - - if (blocks.size() == 0) - { - // Just construct a new vector and carry on - for (token_t *token : tokens) - vec_out.push_back(new token_t{*token}); - } - else - { - for (size_t i = 0; i < tokens.size(); ++i) - { - token_t *token = tokens[i]; - // Skip the tokens that construct the const - if (token->type == token_type_t::PP_CONST) - for (; i < tokens.size() && tokens[i]->type != token_type_t::PP_END; - ++i) - continue; - else if (token->type == token_type_t::PP_REFERENCE) - { - auto it = blocks.find(token->content); - if (it == blocks.end()) - { - VCLEAR(vec_out); - vec_out.clear(); - return pp_err_t(pp_err_type_t::UNKNOWN_NAME, token); - } - - const_t block = it->second; - for (size_t i = block.start; i < block.end; ++i) - vec_out.push_back(new token_t{*tokens[i]}); - } - else - vec_out.push_back(new token_t{*token}); - } - } - - return pp_err_t(); -} - -pp_err_t preprocesser(const vector &tokens, - vector &vec_out) -{ - vector use_block_tokens; - pp_err_t pperr = preprocess_use_blocks(tokens, use_block_tokens); - if (pperr.type != pp_err_type_t::OK) - { - vec_out = tokens; - return pperr; - } - - vector const_block_tokens; - pperr = preprocess_const_blocks(use_block_tokens, const_block_tokens); - if (pperr.type != pp_err_type_t::OK) - { - VCLEAR(tokens); - vec_out = use_block_tokens; - return pperr; - } - - VCLEAR(use_block_tokens); - vec_out = const_block_tokens; - - return pp_err_t{pp_err_type_t::OK}; -} - -// TODO: Implement this -pp_err_t preprocess_macro_blocks(const vector &, - vector &); - -std::ostream &operator<<(std::ostream &os, pp_err_t &err) -{ - os << "PREPROCESSING_"; - switch (err.type) - { - case OK: - return os << "OK"; - case 
EXPECTED_NAME: - return os << "EXPECTED_NAME"; - case EXPECTED_STRING: - return os << "EXPECTED_STRING"; - case EXPECTED_END: - return os << "EXPECTED_END"; - case FILE_NONEXISTENT: - return os << "FILE_NONEXISTENT"; - case FILE_PARSE_ERROR: - return os << "FILE_PARSE_ERROR -> \n\t[" << err.reference->content - << "]:" << err.lerr; - case UNKNOWN_NAME: - return os << "UNKNOWN_NAME"; - } - return os; -} - -pp_err_t::pp_err_t() : reference{nullptr}, type{pp_err_type_t::OK}, lerr{} -{} - -pp_err_t::pp_err_t(pp_err_type_t e) : reference{nullptr}, type{e}, lerr{} -{} - -pp_err_t::pp_err_t(pp_err_type_t err, const token_t *ref) - : reference{ref}, type{err} -{} - -pp_err_t::pp_err_t(pp_err_type_t err, const token_t *ref, lerr_t lerr) - : reference{ref}, type{err}, lerr{lerr} -{} - -// pp_unit_t::pp_unit_t(const token_t *const token) : resolved{false}, -// token{token} -// {} - -// pp_unit_t::pp_unit_t(std::string_view name, std::vector elements) -// : resolved{false}, token{nullptr}, container{name, elements} -// {} diff --git a/asm/preprocesser.hpp b/asm/preprocesser.hpp deleted file mode 100644 index 4938d4e..0000000 --- a/asm/preprocesser.hpp +++ /dev/null @@ -1,62 +0,0 @@ -/* Copyright (C) 2024 Aryadev Chavali - - * You may distribute and modify this code under the terms of the GPLv2 - * license. You should have received a copy of the GPLv2 license with - * this file. If not, please write to: aryadev@aryadevchavali.com. - - * Created: 2024-04-14 - * Author: Aryadev Chavali - * Description: Preprocessor which occurs after lexing before parsing. 
- */ - -#ifndef PREPROCESSER_HPP -#define PREPROCESSER_HPP - -#include -#include - -#include "./lexer.hpp" - -enum pp_err_type_t -{ - OK = 0, - EXPECTED_NAME, - EXPECTED_STRING, - EXPECTED_END, - FILE_NONEXISTENT, - FILE_PARSE_ERROR, - UNKNOWN_NAME, -}; - -struct pp_err_t -{ - const token_t *reference; - pp_err_type_t type; - lerr_t lerr; - - pp_err_t(); - pp_err_t(pp_err_type_t); - pp_err_t(pp_err_type_t, const token_t *); - pp_err_t(pp_err_type_t, const token_t *, lerr_t); -}; - -std::ostream &operator<<(std::ostream &, pp_err_t &); - -struct pp_unit_t -{ - const token_t *const token; - struct - { - std::string_view name; - std::vector elements; - } container; - - pp_unit_t(const token_t *const); - pp_unit_t(std::string_view, std::vector); -}; - -std::vector tokens_to_units(const std::vector &); -pp_err_t preprocess_use(std::vector &); -pp_err_t preprocesser(const std::vector &, std::vector &); - -#endif diff --git a/examples/factorial.asm b/examples/factorial.asm deleted file mode 100644 index c4257e4..0000000 --- a/examples/factorial.asm +++ /dev/null @@ -1,54 +0,0 @@ -;;; factorial.asm: A program that generates the factorials of each -;;; number from 1 to 20. Using the registers to store `n` and `n!`. - - ;; Constants - ;; Choice of 20 was not arbitrary; log(20!) ~= 61 while log(21!) ~= - ;; 65 which means that past 20! results are truncated and therefore - ;; the program produces inaccurate factorials. - - %const limit 20 %end - - ;; Setup entrypoint - global main -main: - ;; $I -> W[0] = 1, $J -> W[1] = 1 - push.word 1 - mov.word 0 - push.word 1 - mov.word 1 - - ;; Print `$I: $J` -loopback: - push.byte '\t' - print.char - push.reg.word 0 - print.word - push.byte ':' - print.char - push.byte ' ' - print.char - push.reg.word 1 - print.word - push.byte '\n' - print.char - - ;; $I += 1 - push.reg.word 0 - push.word 1 - plus.word - mov.word 0 - - ;; $J *= $I - push.reg.word 0 - push.reg.word 1 - mult.word - mov.word 1 - - ;; IF $I >= $LIMIT ... 
- push.word $limit - push.reg.word 0 - gte.word - ;; THEN jump to `loopback` - jump.if.byte loopback - ;; ELSE halt - halt diff --git a/examples/fib.asm b/examples/fib.asm deleted file mode 100644 index 7f4c360..0000000 --- a/examples/fib.asm +++ /dev/null @@ -1,92 +0,0 @@ -;;; fib.asm: A program that generates the fibonacci numbers up to a -;;; very large bound (~UINT64_MAX). Using the registers to store the -;;; pairs of fibonacci numbers, we ensure only a finite amount of -;;; memory is necessary for this program to function, unlike a pure -;;; stack version. - - ;; Constants - %const limit 93 %end - - %const increment_i - push.reg.word 2 - push.word 1 - plus.word - mov.word 2 - %end - - %const print_i - push.reg.word 2 - print.word - %end - - %const print_reg_0 - push.reg.word 0 - print.word - %end - - %const print_reg_1 - push.reg.word 1 - print.word - %end - - ;; Setup entrypoint - global main -main: - ;; Setup iterator I - push.word 1 - mov.word 2 - ;; Setup initial A -> W[0] = 1 and B -> W[1] = 1 - push.word 1 - mov.word 0 - push.word 1 - mov.word 1 - - ;; Print "$I: $A" and "($I + 1): $B" -loopback: - call print_pair - - ;; $A += $B - push.reg.word 0 - push.reg.word 1 - plus.word - mov.word 0 - - ;; $B += $A - push.reg.word 0 - push.reg.word 1 - plus.word - mov.word 1 - - ;; IF $I < $LIMIT ... 
- push.reg.word 2 - push.word $limit - lt.word - ;; THEN jump to `loopback` - jump.if.byte loopback - ;; ELSE halt - halt - -print_pair: - push.byte '\t' - print.char - $print_i - push.byte ':' - print.char - push.byte ' ' - print.char - $print_reg_0 - push.byte '\n' - print.char - $increment_i - push.byte '\t' - print.char - $print_i - push.byte ':' - print.char - push.byte ' ' - print.char - $print_reg_1 - push.byte '\n' - print.char - $increment_i - ret diff --git a/examples/instruction-test.asm b/examples/instruction-test.asm deleted file mode 100644 index bd31c68..0000000 --- a/examples/instruction-test.asm +++ /dev/null @@ -1,95 +0,0 @@ -;;; instruction-test.asm: A file that contains all possible opcodes in -;;; order, with proper calling convention. Used to test lexer and -;;; parser but isn't a semantically correct program, but may be run as -;;; first instruction is halt (so program will stop immediately). - - ;; setup entrypoint - global main -main: - halt - push.byte 1 - push.hword 2 - push.word 3 - pop.byte - pop.hword - pop.word - push.reg.byte 1 - push.reg.hword 2 - push.reg.word 3 - mov.byte 1 - mov.hword 2 - mov.word 3 - dup.byte 1 - dup.hword 2 - dup.word 3 - malloc.byte 1 - malloc.hword 2 - malloc.word 3 - malloc.stack.byte - malloc.stack.hword - malloc.stack.word - mset.byte 1 - mset.hword 2 - mset.word 3 - mset.stack.byte - mset.stack.hword - mset.stack.word - mget.byte 1 - mget.hword 2 - mget.word 3 - mget.stack.byte - mget.stack.hword - mget.stack.word - not.byte - not.hword - not.word - or.byte - or.hword - or.word - and.byte - and.hword - and.word - xor.byte - xor.hword - xor.word - eq.byte - eq.hword - eq.word - plus.byte - plus.hword - plus.word - sub.byte - sub.hword - sub.word - print.char - print.byte - print.int - print.hword - print.long - print.word - jump.abs 1 - jump.stack - jump.if.byte 1 - jump.if.hword 2 - jump.if.word 3 - - ;; Testing if overflows work correctly - ;; Format is: - ;; -1 All bits are turned on - ;; UINT_MAX All 
bits are turned on - ;; INT_MAX All bits but the most significant are on - ;; INT_MIN Only the most significant bit is on - push.byte -1 - push.byte 255 - push.byte 127 - push.byte -128 - - push.hword -1 - push.hword 4294967295 - push.hword 2147483647 - push.hword -2147483648 - - push.word -1 - push.word 18446744073709551615 - push.word 9223372036854775807 - push.word -9223372036854775808 diff --git a/examples/memory-print.asm b/examples/memory-print.asm deleted file mode 100644 index cebd6e7..0000000 --- a/examples/memory-print.asm +++ /dev/null @@ -1,65 +0,0 @@ -;;; memory-print: An example program that features a subroutine for -;;; printing a memory buffer, of any length, as characters. - - ;; Setup label for entrypoint - global main -main: - ;; Allocate a buffer of 3 characters - malloc.byte 3 - mov.word 0 - ;; Setup the buffer to be equivalent to "abc" - push.reg.word 0 - push.byte 'a' - mset.byte 0 - push.reg.word 0 - push.byte 'b' - mset.byte 1 - push.reg.word 0 - push.byte 'c' - mset.byte 2 - - ;; Save buffer to W[8] because the first 8 registers should be - ;; reserved for library routines as it may be overwritten - push.reg.word 0 - mov.word 8 - ;; Call the routine - call print_cptr - - ;; Delete allocated buffer - push.reg.word 8 - mdelete - - halt - -;;; print_cptr: Prints pointer to a buffer of characters. Pointer -;;; should be on the stack as a word. -print_cptr: - ;; iterator I -> W[1] - push.word 0 - mov.word 1 - ;; (W[0])[W[1]] -> P[I] -loopback: - push.reg.word 0 - push.reg.word 1 - mget.stack.byte - print.char - - ;; I += 1 - push.reg.word 1 - push.word 1 - plus.word - mov.word 1 - - ;; if I != |P| ... 
- push.reg.word 1 - push.reg.word 0 - msize - eq.word - not.byte - ;; then go to `loopback` - jump.if.byte loopback - ;; else print a newline - push.byte '\n' - print.char - ;; return back to the caller - ret diff --git a/todo.org b/todo.org index 4dc64e2..7c564b2 100644 --- a/todo.org +++ b/todo.org @@ -10,200 +10,13 @@ **** DONE lib/darr.h **** TODO lib/heap.h **** TODO lib/inst.h -*** TODO ASM [0%] -**** TODO asm/lexer.h -**** TODO asm/parser.h *** TODO VM [0%] **** TODO vm/runtime.h ** TODO Specification -* TODO Preprocessing directives :ASM: -Like in FASM or NASM where we can give certain helpful instructions to -the assembler. I'd use the ~%~ symbol to designate preprocessor -directives. -** TODO Macros -Essentially constants expressions which take literal parameters -(i.e. tokens) and can use them throughout the body. Something like -#+begin_src asm -%macro(name)(param1 param2 param3) -... -%end -#+end_src -Where each parameter is substituted in a call at preprocessing time. -A call should look something like this: -#+begin_src asm - $name 1 2 3 -#+end_src -and those tokens will be substituted literally in the macro body. -* WIP Write assembler in a different language :ASM: -While the runtime and base library needs to deal with only -binary, the assembler has to deal with string inputs and a larger -variety of bugs. As the base library is written in C, and is all that -is necessary to write a program that targets the virtual machine, we -could realistically use another language to write the assembler in via -FFI with minimal pain. - -Languages in the competition: -+ C++ -+ Rust -+ Python - -2024-04-14: Chose C++ cos it will require the least effort to rewrite -the currently existing codebase while still leveraging some less -efficient but incredibly useful features. -* TODO Rewrite preprocesser to create a custom unit instead of token streams -** Problem -A problem that occurs in the preprocessor is token column and line -count. 
Say =a.asm= has ~%use "b.asm"~. The tokens from the =b.asm= -file are inserted into =a.asm='s token stream, but the line/column -count from there isn't properly set in =a.asm=. - -A naive solution would be to just recount the lines and columns, but -this removes information about where those tokens came from. Say an -error occurs in some of =b.asm='s code: I would like to be able to -report them. - -Therefore, we can no longer just generate new token streams from the -preprocesser and should instead look at making more complex -abstractions. - -A problem this could also solve is nested errors and recursive -constants. Say I have some assembly like so -#+begin_src asm - %const limit 20 %end - %const print-limit - ... - push.byte $limit - print.byte - ... - %end -#+end_src - -A call to ~print-limit~ under the current system would insert the -tokens for print-limit but completely forget about ~push.byte $limit~ -which would cause a parsing error. (This could be fixed under the -current system by allowing reference resolution inside of const -blocks, with the conceit that it would be hard to stop infinite recursion) -** Language model -The model I have in mind is that all constructs in this meta language -(the preprocessing language) are either singular tokens or collections -of tokens/constructs in a recursive sense. This naturally follows -from the fact that a single pass isn't enough to properly parse this -language: there must be some recursive nature which forces the -language to take multiple passes to completely generate a stream that -can be parsed. - -This vague notion can be formalised like so. A preprocessing unit is -either a singular token or a named collection of units. The former -represents your standard symbols and literals while the later -represents ~%const~ and ~%use~ calls where there is a clear name -associated to a collection of one or more tokens (in the case of the -former it's the constant's name and the latter it's the filename). 
-We'll distinguish this as well. - -#+begin_src text -Token = PP_USE | PP_CONST | String(Content) | Symbol(Content) | PUSH(Content) | ... -Type = File(String) | Constant(Symbol) -Unit = Token | Container(Type . Vector[Unit]) -#+end_src - -Through this model our initial stream of tokens can be considered -units. We can already see that this model may solve our original -problem: with named containers it doesn't matter that certain tokens -are from different parts of the file or different files as they are -distinctly typed from the general set of tokens, with a name which -states where they're from. -** Processing -We need this model to have a notion of "processing" though, otherwise -it's quite useless. A processing function is simply a function which -takes a unit and returns another unit. We currently have two -processing functions we can consider: ~process_const~ and -~process_use~. - -~process_use~ takes a vector of tokens and, upon encountering PP_USE -accepts the next token (a string) and tokenises the file -with that name. Within our model we'd make the stream of tokens -created from opening the file a /container/. - -~process_const~ takes a vector of tokens and does two things in an -iteration: -1) upon encountering PP_CONST accepts the next n tokens till PP_END is - encountered, with the first token being a symbol. This is - registered in a map of constants (~CONSTS~) where the symbol is the - key and the value associated is the n - 1 tokens accepted -2) upon encountering a PP_REFERENCE reads the content associated with - it (considered a symbol ~S~) and replaces it ~CONSTS[S]~ (if S is - in CONSTS). - -One thing to note is that both of these definitions are easily -extensible to the general definition of units: if a unit is a -container of some kind we can recur through its vector of units to -resolve any further "calls". For ~process_const~ it's ~%const~ or -~$ref~ while for ~process_use~ it's ~%use~. 
-** History/versioning -One additional facet to this model I'd like to add is "history". Each -unit is actually a list (or a singly linked tree where each parent has -at most one child) of sub-units where the top of the list represents -the current version. Each descendant is a previous version of the -token. - -Say I do some processing on an element of the unit list =a= (with -index =i=) such that it becomes a new "unit", call it =b=. Then we -update V by =V[i] = cons(b, a)=. Through this, the lists acts as a -history of processing that has occurred on the unit. This provides an -ability to trace the path of preprocessing to an eventual conclusion. - -Processing occurs on a unit until it cannot be done further i.e. when -there are no more "calls" in the tree to resolve. The history list -provides all the versions of a unit till its resolved form. - -To see what a unit with history may look like (where symbols are -terminals i.e. completely resolved): -+ Container('limit' . [a Container("b" . d e f) c]) - + Container('limit' . [a '$b' c]) - + Token(PP_REF('$limit')) - -This shows resolution of the unit reference ~$limit~, which in turn -leads to the resolution of ~$b~ which is a sub-unit. - -There are two ways of indefinite resolution, one per method of -processing. For ~process_use~ it is two files calling ~%use~ on each -other and for ~process_const~ it is a ~%const~ calling itself. We can -just disallow it through analysis. -** Pseudocode -#+begin_src text -process_use(V: Vector[Unit]) -> - [cons((if v is Token(PP_USE) and next(v) is Token(String(S)) - -> Container(File(S) . tokenise(open(v'))) - else if v is Container(name . units) - -> Container(name . process_use(units)) - else - -> v), - v_x) - v = v_x[0] - for v_x in V] - -CONSTS={} -process_const(V: Vector[Unit]) -> - [cons((if v is Token(PP_CONST) and next(v) is Token(Symbol(S)) - do { - i := find(Token(PP_END), V[v:]) - CONSTS[S] = V[next(v):prev(i)] - -> Container(Constant(S) . 
CONSTS[S]) - } - else if v is Token(PP_REF(S)) - -> CONSTS[S] - else if v is Container(name . units) - -> Container(name . process_const(units)) - else - -> v) - v_x) - v = v_x[0] - for v_x in V] -#+end_src * TODO Introduce error handling in base library :LIB: There is a large variety of TODOs about errors. Let's fix them! 8 TODOs currently present. -* TODO Standard library :ASM:VM: +* TODO Standard library :VM: I should start considering this and how a user may use it. Should it be an option in the VM and/or assembler binaries (i.e. a flag) or something the user has to specify in their source files?