Compare commits
10 commits: ba3525d533...2a1d006a88
| SHA1 |
|---|
| 2a1d006a88 |
| 8f75241bcb |
| d5c43b1c3f |
| 715facf015 |
| 4ecd184759 |
| 27d6a47320 |
| 3fc1f08134 |
| 4b3e9b3567 |
| 05136fdd25 |
| 1e7f1bdee9 |
README.org: 56 changed lines
@@ -5,6 +5,19 @@
 A stack based virtual machine in C11, with a dynamic register setup
 which acts as variable space. Deals primarily in bytes, doesn't make
 assertions about typing and is very simple to target.
+
+2024-04-16: Project will now be split into two components
+1) The runtime + base library
+2) The assembler
+
+This will focus each repository on separate issues and make it easier
+to organize. They will both derive from the same repository
+i.e. I'm not making fresh repositories and just sticking the folders
+in but rather branching this repository into two different versions.
+
+The two versions will be hosted at:
+1) [[https://github.com/aryadev-software/avm]]
+2) [[https://github.com/aryadev-software/aal]]
 * How to build
 Requires =GNU make= and a compliant C11 compiler. Code base has been
 tested against =gcc= and =clang=, but given how the project has been
@@ -66,23 +79,32 @@ This is recommended if writing an interpreted language such as a Lisp,
 where on demand execution of code is more suitable.
 * Lines of code
 #+begin_src sh :results table :exports results
-find -name '*.[ch]' -exec wc -l '{}' ';'
+wc -lwc $(find -regex ".*\.[ch]\(pp\)?")
 #+end_src

 #+RESULTS:
-| 301 | ./vm/runtime.h |
-| 92 | ./vm/main.c |
-| 1059 | ./vm/runtime.c |
-| 500 | ./lib/inst.c |
-| 39 | ./lib/darr.h |
-| 265 | ./lib/inst.h |
-| 42 | ./lib/heap.h |
-| 90 | ./lib/base.h |
-| 101 | ./lib/heap.c |
-| 39 | ./lib/base.c |
-| 77 | ./lib/darr.c |
-| 654 | ./asm/parser.c |
-| 142 | ./asm/main.c |
-| 83 | ./asm/lexer.h |
-| 65 | ./asm/parser.h |
-| 549 | ./asm/lexer.c |
+| Files                  | Lines | Words | Bytes  |
+|------------------------+-------+-------+--------|
+| ./lib/heap.h           | 42    | 111   | 801    |
+| ./lib/inst.c           | 516   | 1315  | 13982  |
+| ./lib/darr.c           | 77    | 225   | 1757   |
+| ./lib/base.c           | 107   | 306   | 2002   |
+| ./lib/inst.h           | 108   | 426   | 4067   |
+| ./lib/prog.h           | 176   | 247   | 2616   |
+| ./lib/base.h           | 148   | 626   | 3915   |
+| ./lib/darr.h           | 88    | 465   | 2697   |
+| ./lib/heap.c           | 101   | 270   | 1910   |
+| ./vm/runtime.h         | 301   | 780   | 7965   |
+| ./vm/runtime.c         | 1070  | 3097  | 30010  |
+| ./vm/main.c            | 92    | 265   | 2243   |
+| ./asm/base.hpp         | 21    | 68    | 472    |
+| ./asm/lexer.cpp        | 565   | 1448  | 14067  |
+| ./asm/base.cpp         | 33    | 89    | 705    |
+| ./asm/parser.hpp       | 82    | 199   | 1656   |
+| ./asm/parser.cpp       | 42    | 129   | 1294   |
+| ./asm/lexer.hpp        | 106   | 204   | 1757   |
+| ./asm/preprocesser.cpp | 218   | 574   | 5800   |
+| ./asm/preprocesser.hpp | 62    | 147   | 1360   |
+| ./asm/main.cpp         | 148   | 414   | 3791   |
+|------------------------+-------+-------+--------|
+| total                  | 4103  | 11405 | 104867 |

asm/lexer.cpp

@@ -25,7 +25,7 @@ static_assert(NUMBER_OF_OPCODES == 98, "ERROR: Lexer is out of date");
 using std::string, std::string_view, std::pair, std::make_pair;

 const auto VALID_SYMBOL = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUV"
-                          "WXYZ0123456789-_.:()%#$",
+                          "WXYZ0123456789-_.:%#$",
           VALID_DIGIT = "0123456789", VALID_HEX = "0123456789abcdefABCDEF";

 bool is_char_in_s(char c, const char *s)
@@ -50,9 +50,9 @@ pair<token_t, lerr_t> tokenise_symbol(string_view &source, size_t &column,

   token_t t{};

-  if (initial_match(sym, "%CONST"))
+  if (sym == "%CONST")
   {
-    t = token_t(token_type_t::PP_CONST, sym.substr(6));
+    t.type = token_type_t::PP_CONST;
   }
   else if (sym == "%USE")
   {
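
The pair of changes in this hunk tracks the new ~%const~ syntax seen in the example programs below: previously ~%CONST~ was matched only as a prefix and the parenthesised constant name travelled inside the token's content (hence the ~sym.substr(6)~ capture, and ~(~/~)~ being valid symbol characters above); now the symbol must equal ~%CONST~ exactly and the constant's name arrives as the next SYMBOL token, which ~preprocess_const_blocks~ below consumes.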

@@ -406,7 +406,7 @@ lerr_t tokenise_buffer(string_view source, std::vector<token_t *> &tokens)
   else
   {
     ++column;
-    return lerr_t{lerr_type_t::UNKNOWN_CHAR, column, line};
+    return lerr_t{lerr_type_t::UNKNOWN_LEXEME, column, line};
   }

   if (is_token)

@@ -551,8 +551,8 @@ std::ostream &operator<<(std::ostream &os, lerr_t &lerr)
   case lerr_type_t::INVALID_PREPROCESSOR_DIRECTIVE:
     os << "INVALID_PREPROCESSOR_DIRECTIVE";
     break;
-  case lerr_type_t::UNKNOWN_CHAR:
-    os << "UNKNOWN_CHAR";
+  case lerr_type_t::UNKNOWN_LEXEME:
+    os << "UNKNOWN_LEXEME";
     break;
   default:
     break;

asm/lexer.hpp

@@ -88,7 +88,7 @@ enum class lerr_type_t
   INVALID_STRING_LITERAL,
   INVALID_NUMBER_LITERAL,
   INVALID_PREPROCESSOR_DIRECTIVE,
-  UNKNOWN_CHAR,
+  UNKNOWN_LEXEME,
 };

 struct lerr_t

asm/main.cpp

@@ -115,7 +115,7 @@ int main(int argc, const char *argv[])
   if (pp_err.type != pp_err_type_t::OK)
   {
     cerr << source_name << ":" << pp_err.reference->line << ":"
-         << pp_err.reference->column << ":" << pp_err << endl;
+         << pp_err.reference->column << ": " << pp_err << endl;
     ret = 255 - static_cast<int>(pp_err.type);
     goto end;
   }

asm/preprocesser.cpp

@@ -37,6 +37,7 @@ pp_err_t preprocess_use_blocks(const vector<token_t *> &tokens,
       tokens[i + 1]->type != token_type_t::LITERAL_STRING)
   {
     VCLEAR(vec_out);
+    vec_out.clear();
     return pp_err_t(pp_err_type_t::EXPECTED_STRING, t);
   }

@@ -45,6 +46,7 @@ pp_err_t preprocess_use_blocks(const vector<token_t *> &tokens,
   if (!source)
   {
     VCLEAR(vec_out);
+    vec_out.clear();
     return pp_err_t(pp_err_type_t::FILE_NONEXISTENT, name);
   }

@@ -53,6 +55,7 @@ pp_err_t preprocess_use_blocks(const vector<token_t *> &tokens,
   if (lerr.type != lerr_type_t::OK)
   {
     VCLEAR(vec_out);
+    vec_out.clear();
     return pp_err_t(pp_err_type_t::FILE_PARSE_ERROR, name, lerr);
   }

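In each of these early-exit paths the new ~vec_out.clear()~ runs after ~VCLEAR~. The macro's definition is not part of this diff; a plausible reading, assuming ~VCLEAR~ only frees the heap-allocated tokens, is that the vector itself still held the dangling pointers afterwards:

#+begin_src c++
// Hypothetical definition, for illustration only (not in this diff):
// free every heap-allocated token held in the vector.
#define VCLEAR(V)        \
  for (auto *elem : (V)) \
    delete elem;

// After VCLEAR the vector still contains the (now dangling) pointers;
// the added vec_out.clear() drops them before the early return, so a
// caller inspecting vec_out on error never sees freed memory.
#+end_src
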
@@ -81,19 +84,10 @@ pp_err_t preprocess_const_blocks(const vector<token_t *> &tokens,
     if (t->type == token_type_t::PP_CONST)
     {
       string_view capture;
-      if (t->content == "" && (i == tokens.size() - 1 ||
-                               tokens[i + 1]->type != token_type_t::SYMBOL))
-        return ERR(pp_err_t{pp_err_type_t::EXPECTED_NAME});
-      else if (t->content != "")
-        capture = t->content;
-      else
-        capture = tokens[++i]->content;
+      if (i + 1 >= tokens.size() || tokens[i + 1]->type != token_type_t::SYMBOL)
+        return pp_err_type_t::EXPECTED_NAME;

-      // Check for brackets
-      auto start = capture.find('(');
-      auto end = capture.find(')');
-      if (start == string::npos || end == string::npos)
-        return ERR(pp_err_t{pp_err_type_t::EXPECTED_NAME});
+      capture = tokens[++i]->content;

       ++i;
       size_t block_start = i, block_end = 0;

@@ -105,8 +99,7 @@ pp_err_t preprocess_const_blocks(const vector<token_t *> &tokens,

       block_end = i;

-      blocks[capture.substr(start + 1, end - 1)] =
-          const_t{block_start, block_end};
+      blocks[capture] = const_t{block_start, block_end};
     }
   }

@@ -132,6 +125,7 @@ pp_err_t preprocess_const_blocks(const vector<token_t *> &tokens,
   if (it == blocks.end())
   {
     VCLEAR(vec_out);
+    vec_out.clear();
     return pp_err_t(pp_err_type_t::UNKNOWN_NAME, token);
   }

@@ -214,3 +208,11 @@ pp_err_t::pp_err_t(pp_err_type_t err, const token_t *ref)
 pp_err_t::pp_err_t(pp_err_type_t err, const token_t *ref, lerr_t lerr)
     : reference{ref}, type{err}, lerr{lerr}
 {}
+
+// pp_unit_t::pp_unit_t(const token_t *const token) : resolved{false},
+// token{token}
+// {}
+
+// pp_unit_t::pp_unit_t(std::string_view name, std::vector<pp_unit_t> elements)
+//     : resolved{false}, token{nullptr}, container{name, elements}
+// {}

asm/preprocesser.hpp

@@ -42,6 +42,21 @@ struct pp_err_t

 std::ostream &operator<<(std::ostream &, pp_err_t &);

+struct pp_unit_t
+{
+  const token_t *const token;
+  struct
+  {
+    std::string_view name;
+    std::vector<pp_unit_t> elements;
+  } container;
+
+  pp_unit_t(const token_t *const);
+  pp_unit_t(std::string_view, std::vector<pp_unit_t>);
+};
+
+std::vector<pp_unit_t> tokens_to_units(const std::vector<token_t *> &);
+pp_err_t preprocess_use(std::vector<pp_unit_t> &);
 pp_err_t preprocesser(const std::vector<token_t *> &, std::vector<token_t *> &);

 #endif
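
The new ~pp_unit_t~ declared here is the first concrete step towards the unit model designed under "Rewrite preprocesser" in todo.org below: a unit is either a single token or a named container of sub-units (the unnamed ~container~ member pairing a name with child units). Its constructor definitions are still commented out in preprocesser.cpp above, so the type is declared but not yet constructible.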

Example assembly program (factorial):

@@ -6,7 +6,7 @@
 ;; 65 which means that past 20! results are truncated and therefore
 ;; the program produces inaccurate factorials.

-%const(limit) 20 %end
+%const limit 20 %end

 ;; Setup entrypoint
 global main

Example assembly program (stack version):

@@ -5,26 +5,26 @@
 ;;; stack version.

 ;; Constants
-%const(limit) 93 %end
+%const limit 93 %end

-%const(increment_i)
+%const increment_i
   push.reg.word 2
   push.word 1
   plus.word
   mov.word 2
 %end

-%const(print_i)
+%const print_i
   push.reg.word 2
   print.word
 %end

-%const(print_reg_0)
+%const print_reg_0
   push.reg.word 0
   print.word
 %end

-%const(print_reg_1)
+%const print_reg_1
   push.reg.word 1
   print.word
 %end

todo.org: 152 changed lines

@@ -1,6 +1,7 @@
 #+title: TODOs
 #+author: Aryadev Chavali
 #+date: 2023-11-02
+#+startup: noindent

 * TODO Better documentation [0%] :DOC:
 ** TODO Comment coverage [0%]
@@ -49,9 +50,158 @@ Languages in the competition:
 2024-04-14: Chose C++ cos it will require the least effort to rewrite
 the currently existing codebase while still leveraging some less
 efficient but incredibly useful features.
+* TODO Rewrite preprocesser to create a custom unit instead of token streams
+** Problem
+A problem that occurs in the preprocessor is token column and line
+count. Say =a.asm= has ~%use "b.asm"~. The tokens from the =b.asm=
+file are inserted into =a.asm='s token stream, but the line/column
+count from there isn't properly set in =a.asm=.
+
+A naive solution would be to just recount the lines and columns, but
+this removes information about where those tokens came from. Say an
+error occurs in some of =b.asm='s code: I would like to be able to
+report it.
+
+Therefore, we can no longer just generate new token streams from the
+preprocesser and should instead look at making more complex
+abstractions.
+
+A problem this could also solve is nested errors and recursive
+constants. Say I have some assembly like so:
+#+begin_src asm
+%const limit 20 %end
+%const print-limit
+  ...
+  push.byte $limit
+  print.byte
+  ...
+%end
+#+end_src
+
+A call to ~print-limit~ under the current system would insert the
+tokens for print-limit but completely forget about ~push.byte $limit~,
+which would cause a parsing error. (This could be fixed under the
+current system by allowing reference resolution inside of const
+blocks, with the caveat that it would be hard to stop infinite
+recursion.)
+** Language model
+The model I have in mind is that all constructs in this meta language
+(the preprocessing language) are either singular tokens or collections
+of tokens/constructs in a recursive sense. This naturally follows
+from the fact that a single pass isn't enough to properly parse this
+language: there must be some recursive nature which forces the
+language to take multiple passes to completely generate a stream that
+can be parsed.
+
+This vague notion can be formalised like so. A preprocessing unit is
+either a singular token or a named collection of units. The former
+represents your standard symbols and literals while the latter
+represents ~%const~ and ~%use~ calls, where there is a clear name
+associated to a collection of one or more tokens (in the case of
+~%const~ it's the constant's name and in the case of ~%use~ the
+filename). We'll distinguish this as well.
+
+#+begin_src text
+Token = PP_USE | PP_CONST | String(Content) | Symbol(Content) | PUSH(Content) | ...
+Type  = File(String) | Constant(Symbol)
+Unit  = Token | Container(Type . Vector[Unit])
+#+end_src
+
+Through this model our initial stream of tokens can be considered
+units. We can already see that this model may solve our original
+problem: with named containers it doesn't matter that certain tokens
+are from different parts of the file or different files, as they are
+distinctly typed from the general set of tokens, with a name which
+states where they're from.
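
A minimal self-contained C++ sketch of this model (illustrative names only, not the repository's actual types) might look like the following; note that the ~pp_unit_t~ declared in asm/preprocesser.hpp above has the same token-or-named-container shape:

#+begin_src c++
#include <string>
#include <vector>

// Illustrative sketch: a unit is either a single token (leaf) or a
// named container of sub-units, mirroring Unit = Token | Container.
struct unit_t
{
  std::string token; // leaf payload; empty when this unit is a container

  struct
  {
    std::string name;             // constant name or filename it came from
    std::vector<unit_t> elements; // nested tokens and/or containers
  } container;

  bool is_leaf() const { return container.elements.empty(); }
};

// The expansion of %use "b.asm" would become one container unit whose
// name is "b.asm" and whose elements are b.asm's tokenised units.
#+end_src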
+** Processing
+We need this model to have a notion of "processing" though, otherwise
+it's quite useless. A processing function is simply a function which
+takes a unit and returns another unit. We currently have two
+processing functions we can consider: ~process_const~ and
+~process_use~.
+
+~process_use~ takes a vector of tokens and, upon encountering PP_USE,
+accepts the next token (a string) and tokenises the file with that
+name. Within our model we'd make the stream of tokens created from
+opening the file a /container/.
+
+~process_const~ takes a vector of tokens and does two things in an
+iteration:
+1) upon encountering PP_CONST, accepts the next n tokens till PP_END is
+   encountered, with the first token being a symbol. This is
+   registered in a map of constants (~CONSTS~) where the symbol is the
+   key and the value associated is the n - 1 tokens accepted
+2) upon encountering a PP_REFERENCE, reads the content associated with
+   it (considered a symbol ~S~) and replaces it with ~CONSTS[S]~ (if S
+   is in CONSTS)
+
+One thing to note is that both of these definitions are easily
+extensible to the general definition of units: if a unit is a
+container of some kind we can recur through its vector of units to
+resolve any further "calls". For ~process_const~ it's ~%const~ or
+~$ref~ while for ~process_use~ it's ~%use~.
+** History/versioning
+One additional facet to this model I'd like to add is "history". Each
+unit is actually a list (or a singly linked tree where each parent has
+at most one child) of sub-units where the top of the list represents
+the current version. Each descendant is a previous version of the
+token.
+
+Say I do some processing on an element =a= of the unit vector =V=
+(with index =i=) such that it becomes a new "unit", call it =b=. Then
+we update V by =V[i] = cons(b, a)=. Through this, the list acts as a
+history of processing that has occurred on the unit. This provides an
+ability to trace the path of preprocessing to an eventual conclusion.
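
A minimal sketch of the =V[i] = cons(b, a)= update, assuming each slot of the unit vector holds a newest-first history of versions (illustrative types, not the repository's):

#+begin_src c++
#include <vector>

// Each slot is a newest-first list of versions of one unit; slot.front()
// is the current version and every later element is an older one.
template <typename Unit>
using history_t = std::vector<Unit>;

template <typename Unit, typename Step>
void process_slot(history_t<Unit> &slot, Step step)
{
  // V[i] = cons(b, a): derive b from the current version a and push it
  // on top, keeping a (and all older versions) behind it. The slot is
  // assumed non-empty: it always holds at least the original unit.
  Unit b = step(slot.front());
  slot.insert(slot.begin(), b);
}
#+end_src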
+
+Processing occurs on a unit until it cannot be done further, i.e. when
+there are no more "calls" in the tree to resolve. The history list
+provides all the versions of a unit till its resolved form.
+
+To see what a unit with history may look like, newest version first
+(where symbols are terminals, i.e. completely resolved):
++ Container('limit' . [a Container("b" . d e f) c])
++ Container('limit' . [a '$b' c])
++ Token(PP_REF('$limit'))
+
+This shows resolution of the unit reference ~$limit~, which in turn
+leads to the resolution of ~$b~, which is a sub-unit.
+
+There are two ways of indefinite resolution, one per method of
+processing. For ~process_use~ it is two files calling ~%use~ on each
+other, and for ~process_const~ it is a ~%const~ referencing itself. We
+can just disallow both through analysis.
+** Pseudocode
+#+begin_src text
+process_use(V: Vector[Unit]) ->
+  [cons((if v is Token(PP_USE) and next(v) is Token(String(S))
+           -> Container(File(S) . tokenise(open(S)))
+         else if v is Container(name . units)
+           -> Container(name . process_use(units))
+         else
+           -> v),
+        v_x)
+   where v = v_x[0]
+   for v_x in V]
+
+CONSTS = {}
+process_const(V: Vector[Unit]) ->
+  [cons((if v is Token(PP_CONST) and next(v) is Token(Symbol(S))
+           do {
+             i := find(Token(PP_END), V[v:])
+             CONSTS[S] = V[next(v):prev(i)]
+             -> Container(Constant(S) . CONSTS[S])
+           }
+         else if v is Token(PP_REF(S))
+           -> CONSTS[S]
+         else if v is Container(name . units)
+           -> Container(name . process_const(units))
+         else
+           -> v),
+        v_x)
+   where v = v_x[0]
+   for v_x in V]
+#+end_src
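
A compilable C++ rendering of ~process_const~ in terms of the illustrative ~unit_t~ sketched earlier (repeated here for self-containment; helper names and the string-keyed dispatch are assumptions, and both the cycle analysis and the history tracking described above are deliberately omitted):

#+begin_src c++
#include <cstddef>
#include <map>
#include <string>
#include <vector>

// Same illustrative unit_t as in the earlier sketch.
struct unit_t
{
  std::string token;
  struct
  {
    std::string name;
    std::vector<unit_t> elements;
  } container;
  bool is_leaf() const { return container.elements.empty(); }
};

std::map<std::string, std::vector<unit_t>> CONSTS;

std::vector<unit_t> process_const(const std::vector<unit_t> &V)
{
  std::vector<unit_t> out;
  for (std::size_t i = 0; i < V.size(); ++i)
  {
    const unit_t &v = V[i];
    if (v.token == "%const" && i + 1 < V.size())
    {
      // Accept tokens up to %end; the first is the constant's name.
      std::string name = V[++i].token;
      std::vector<unit_t> body;
      while (++i < V.size() && V[i].token != "%end")
        body.push_back(V[i]);
      CONSTS[name] = body;
      out.push_back(unit_t{"", {name, body}}); // Container(Constant(S) . body)
    }
    else if (v.token.size() > 1 && v.token[0] == '$') // a $ref
    {
      auto it = CONSTS.find(v.token.substr(1));
      if (it != CONSTS.end()) // splice in CONSTS[S]
        out.insert(out.end(), it->second.begin(), it->second.end());
      else
        out.push_back(v); // unknown name: left for a later pass
    }
    else if (!v.is_leaf()) // recur through nested containers
      out.push_back(
          unit_t{"", {v.container.name, process_const(v.container.elements)}});
    else
      out.push_back(v);
  }
  return out;
}
#+end_src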
 * TODO Introduce error handling in base library :LIB:
 There is a large variety of TODOs about errors. Let's fix them!

 8 TODOs currently present.
 * TODO Standard library :ASM:VM:
 I should start considering this and how a user may use it. Should it