aboutsummaryrefslogtreecommitdiff
path: root/asm/lexer.c
diff options
context:
space:
mode:
author Aryadev Chavali <aryadev@aryadevchavali.com> 2024-04-14 02:45:48 +0630
committer Aryadev Chavali <aryadev@aryadevchavali.com> 2024-04-14 02:45:48 +0630
commit 0ebbf3ca751e638a90cf886625992bb028f9b587 (patch)
tree 626e1cbddb50f0ec7d5fb8e93c4a3adfa9943349 /asm/lexer.c
parent b7a40f4ab0fc5c0f6b68f24437f479a29e72c9af (diff)
download ovm-0ebbf3ca751e638a90cf886625992bb028f9b587.tar.gz
ovm-0ebbf3ca751e638a90cf886625992bb028f9b587.tar.bz2
ovm-0ebbf3ca751e638a90cf886625992bb028f9b587.zip
Start writing assembler in C++
Best language to use as it's already compatible with the headers I'm using and can pretty neatly enter the build system while also using the functions I've built for converting to and from bytecode!
Diffstat (limited to 'asm/lexer.c')
-rw-r--r-- asm/lexer.c 623
1 files changed, 0 insertions, 623 deletions
diff --git a/asm/lexer.c b/asm/lexer.c
deleted file mode 100644
index a4905fb..0000000
--- a/asm/lexer.c
+++ /dev/null
@@ -1,623 +0,0 @@
-/* Copyright (C) 2023 Aryadev Chavali
-
- * You may distribute and modify this code under the terms of the
- * GPLv2 license. You should have received a copy of the GPLv2
- * license with this file. If not, please write to:
- * aryadev@aryadevchavali.com.
-
- * Created: 2023-10-24
- * Author: Aryadev Chavali
- * Description: Lexer for assembly language
- */
-
-#include <assert.h>
-#include <ctype.h>
-#include <stdbool.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include <lib/inst.h>
-
-#include "./lexer.h"
-
-const char *token_type_as_cstr(token_type_t type)
-{
- switch (type)
- {
- case TOKEN_PP_USE:
- return "PP_USE";
- case TOKEN_PP_CONST:
- return "PP_CONST";
- case TOKEN_PP_END:
- return "PP_END";
- case TOKEN_PP_REFERENCE:
- return "PP_REFERENCE";
- case TOKEN_GLOBAL:
- return "GLOBAL";
- case TOKEN_STAR:
- return "STAR";
- case TOKEN_LITERAL_STRING:
- return "LITERAL_STRING";
- case TOKEN_LITERAL_NUMBER:
- return "LITERAL_NUMBER";
- case TOKEN_LITERAL_CHAR:
- return "LITERAL_CHAR";
- case TOKEN_NOOP:
- return "NOOP";
- case TOKEN_HALT:
- return "HALT";
- case TOKEN_PUSH:
- return "PUSH";
- case TOKEN_POP:
- return "POP";
- case TOKEN_PUSH_REG:
- return "PUSH_REG";
- case TOKEN_MOV:
- return "MOV";
- case TOKEN_DUP:
- return "DUP";
- case TOKEN_MALLOC:
- return "MALLOC";
- case TOKEN_MALLOC_STACK:
- return "MALLOC_STACK";
- case TOKEN_MSET:
- return "MSET";
- case TOKEN_MSET_STACK:
- return "MSET_STACK";
- case TOKEN_MGET:
- return "MGET";
- case TOKEN_MGET_STACK:
- return "MGET_STACK";
- case TOKEN_MDELETE:
- return "MDELETE";
- case TOKEN_MSIZE:
- return "MSIZE";
- case TOKEN_NOT:
- return "NOT";
- case TOKEN_OR:
- return "OR";
- case TOKEN_AND:
- return "AND";
- case TOKEN_XOR:
- return "XOR";
- case TOKEN_EQ:
- return "EQ";
- case TOKEN_LT:
- return "LT";
- case TOKEN_LTE:
- return "LTE";
- case TOKEN_GT:
- return "GT";
- case TOKEN_GTE:
- return "GTE";
- case TOKEN_PLUS:
- return "PLUS";
- case TOKEN_SUB:
- return "SUB";
- case TOKEN_MULT:
- return "MULT";
- case TOKEN_PRINT:
- return "PRINT";
- case TOKEN_JUMP_ABS:
- return "JUMP_ABS";
- case TOKEN_JUMP_STACK:
- return "JUMP_STACK";
- case TOKEN_JUMP_IF:
- return "JUMP_IF";
- case TOKEN_CALL:
- return "CALL";
- case TOKEN_CALL_STACK:
- return "CALL_STACK";
- case TOKEN_RET:
- return "RET";
- case TOKEN_SYMBOL:
- return "SYMBOL";
- }
- return "";
-}
-
-const char *lerr_as_cstr(lerr_t lerr)
-{
- switch (lerr)
- {
- case LERR_OK:
- return "OK";
- case LERR_INVALID_CHAR_LITERAL:
- return "INVALID_CHAR_LITERAL";
- case LERR_INVALID_PREPROCESSOR_DIRECTIVE:
- return "INVALID_PREPROCESSOR_DIRECTIVE";
- }
- return "";
-}
-
-/* Make an independent copy of a token.
- *
- * Every field of `t` is copied by value except `str`, which is duplicated
- * into a fresh heap buffer of `t.str_size + 1` bytes and NUL terminated.
- * The caller owns the returned `str` and must free() it.
- *
- * If the allocation fails the returned token has `str == NULL` and
- * `str_size == 0`.  If `t.str` is NULL the copy's text is all zero bytes.
- */
-token_t token_copy(token_t t)
-{
-  token_t copy = t;
-
-  // calloc so the buffer is fully defined even when there is nothing to
-  // copy from.
-  copy.str = calloc(t.str_size + 1, 1);
-  if (copy.str == NULL)
-  {
-    copy.str_size = 0;
-    return copy;
-  }
-
-  // memcpy from a NULL source is undefined, even for a zero length.
-  if (t.str != NULL && t.str_size > 0)
-    memcpy(copy.str, t.str, t.str_size);
-  copy.str[t.str_size] = '\0';
-  return copy;
-}
-
-/* Number of bytes that may still be read from `buffer`.
- *
- * The count is `available - 1 - used`, i.e. the final byte of the buffer
- * is never reported as readable.  Returns 0 once the cursor has reached
- * or passed the end, so a cursor that overshot `available` cannot make
- * the unsigned subtraction wrap around to a huge value.
- */
-size_t space_left(buffer_t *buffer)
-{
-  if (buffer->used >= buffer->available)
-    return 0;
-  return buffer->available - 1 - buffer->used;
-}
-
-/* Convert a lowercase letter in 'a'..'z' to its uppercase counterpart.
- * Every other character is returned unchanged.  No locale is consulted.
- */
-char uppercase(char c)
-{
-  const bool is_lower = 'a' <= c && c <= 'z';
-  return is_lower ? (char)('A' + (c - 'a')) : c;
-}
-
-/* Report whether `c` may appear inside a symbol.
- *
- * Accepted characters are letters, digits and any of  - _ . : ( ) % $
- */
-bool is_symbol(char c)
-{
-  // <ctype.h> functions are only defined for values representable as
-  // unsigned char (or EOF); a plain char may be negative.
-  if (isalnum((unsigned char)c))
-    return true;
-
-  switch (c)
-  {
-  case '-':
-  case '_':
-  case '.':
-  case ':':
-  case '(':
-  case ')':
-  case '%':
-  case '$':
-    return true;
-  default:
-    return false;
-  }
-}
-
-/* Report whether `c` is a hexadecimal digit: 0-9, a-f or A-F. */
-bool is_valid_hex_char(char c)
-{
-  if ('0' <= c && c <= '9')
-    return true;
-  if ('a' <= c && c <= 'f')
-    return true;
-  return 'A' <= c && c <= 'F';
-}
-
-/* Lex one symbol-like lexeme starting at the buffer cursor.
- *
- * The lexeme is the longest run of is_symbol() characters within the
- * readable part of the buffer.  It is upper-cased IN PLACE inside the
- * buffer before being classified:
- *
- *  - "%..." (longer than one char) must be a known preprocessor directive,
- *    otherwise LERR_INVALID_PREPROCESSOR_DIRECTIVE is returned and neither
- *    the cursor, `*column` nor `*token` is updated;
- *  - "$..." (longer than one char) is a TOKEN_PP_REFERENCE;
- *  - a lexeme matching an entry of the opcode table gets that entry's type;
- *  - anything else is a TOKEN_SYMBOL.
- *
- * The token's `str` is a freshly allocated, NUL terminated copy of the
- * lexeme with the matched keyword text stripped from the front (the whole
- * lexeme for TOKEN_SYMBOL); `str_size` is its length.  On success the
- * cursor advances past the lexeme and `*column` grows by its length
- * minus one.
- */
-lerr_t tokenise_symbol(buffer_t *buffer, size_t *column, token_t *token)
-{
-  static_assert(NUMBER_OF_OPCODES == 98, "tokenise_buffer: Out of date!");
-
-  // How a lexeme's length must relate to a keyword's length to match.
-  enum length_rule
-  {
-    LEN_EXACT,   // lexeme is exactly the keyword
-    LEN_LONGER,  // keyword must be followed by at least one more character
-    LEN_AT_LEAST // keyword alone, or keyword followed by more characters
-  };
-  struct keyword
-  {
-    const char *text;
-    size_t size;
-    enum length_rule rule;
-    token_type_t type;
-  };
-
-  static const struct keyword directives[] = {
-    {"%CONST", 6, LEN_LONGER, TOKEN_PP_CONST},
-    {"%USE", 4, LEN_EXACT, TOKEN_PP_USE},
-    {"%END", 4, LEN_EXACT, TOKEN_PP_END},
-  };
-
-  // Order matters: the first matching entry wins, so a longer keyword
-  // must come before any keyword that is a prefix of it.
-  static const struct keyword opcodes[] = {
-    {"NOOP", 4, LEN_EXACT, TOKEN_NOOP},
-    {"HALT", 4, LEN_EXACT, TOKEN_HALT},
-    {"PUSH.REG.", 9, LEN_LONGER, TOKEN_PUSH_REG},
-    {"PUSH.", 5, LEN_LONGER, TOKEN_PUSH},
-    {"POP.", 4, LEN_LONGER, TOKEN_POP},
-    {"MOV.", 4, LEN_LONGER, TOKEN_MOV},
-    {"DUP.", 4, LEN_LONGER, TOKEN_DUP},
-    {"MALLOC.STACK.", 13, LEN_LONGER, TOKEN_MALLOC_STACK},
-    {"MALLOC.", 7, LEN_LONGER, TOKEN_MALLOC},
-    {"MSET.STACK.", 11, LEN_LONGER, TOKEN_MSET_STACK},
-    {"MSET.", 5, LEN_LONGER, TOKEN_MSET},
-    {"MGET.STACK.", 11, LEN_LONGER, TOKEN_MGET_STACK},
-    {"MGET.", 5, LEN_LONGER, TOKEN_MGET},
-    {"MDELETE", 7, LEN_EXACT, TOKEN_MDELETE},
-    {"MSIZE", 5, LEN_EXACT, TOKEN_MSIZE},
-    {"NOT.", 4, LEN_LONGER, TOKEN_NOT},
-    {"OR.", 3, LEN_LONGER, TOKEN_OR},
-    {"AND.", 4, LEN_LONGER, TOKEN_AND},
-    {"XOR.", 4, LEN_LONGER, TOKEN_XOR},
-    {"EQ.", 3, LEN_AT_LEAST, TOKEN_EQ},
-    {"LTE.", 4, LEN_LONGER, TOKEN_LTE},
-    {"LT.", 3, LEN_LONGER, TOKEN_LT},
-    {"GTE.", 4, LEN_LONGER, TOKEN_GTE},
-    {"GT.", 3, LEN_LONGER, TOKEN_GT},
-    {"SUB.", 4, LEN_LONGER, TOKEN_SUB},
-    {"PLUS.", 5, LEN_LONGER, TOKEN_PLUS},
-    {"MULT.", 5, LEN_LONGER, TOKEN_MULT},
-    {"PRINT.", 6, LEN_LONGER, TOKEN_PRINT},
-    {"JUMP.ABS", 8, LEN_EXACT, TOKEN_JUMP_ABS},
-    {"JUMP.STACK", 10, LEN_EXACT, TOKEN_JUMP_STACK},
-    {"JUMP.IF.", 8, LEN_LONGER, TOKEN_JUMP_IF},
-    {"CALL.STACK", 10, LEN_EXACT, TOKEN_CALL_STACK},
-    {"CALL", 4, LEN_EXACT, TOKEN_CALL},
-    {"RET", 3, LEN_EXACT, TOKEN_RET},
-    {"GLOBAL", 6, LEN_EXACT, TOKEN_GLOBAL},
-  };
-
-  // Measure the lexeme, upper-casing it inside the buffer as we go.
-  size_t length = 0;
-  while (length < space_left(buffer) &&
-         is_symbol(buffer->data[buffer->used + length]))
-  {
-    buffer->data[buffer->used + length] =
-        uppercase(buffer->data[buffer->used + length]);
-    ++length;
-  }
-
-  const char *lexeme = (char *)buffer->data + buffer->used;
-
-  // Until something matches, the lexeme is a plain symbol kept whole.
-  token_type_t type = TOKEN_SYMBOL;
-  size_t skip       = 0;
-
-  if (length > 1 && lexeme[0] == '$')
-  {
-    // A reference to a preprocessing constant
-    type = TOKEN_PP_REFERENCE;
-    skip = 1;
-  }
-  else
-  {
-    const bool is_directive = length > 1 && lexeme[0] == '%';
-    const struct keyword *table = is_directive ? directives : opcodes;
-    const size_t table_size =
-        is_directive ? sizeof(directives) / sizeof(directives[0])
-                     : sizeof(opcodes) / sizeof(opcodes[0]);
-
-    const struct keyword *found = NULL;
-    for (size_t i = 0; i < table_size && found == NULL; ++i)
-    {
-      const struct keyword *kw = &table[i];
-      bool length_ok           = false;
-      switch (kw->rule)
-      {
-      case LEN_EXACT:
-        length_ok = length == kw->size;
-        break;
-      case LEN_LONGER:
-        length_ok = length > kw->size;
-        break;
-      case LEN_AT_LEAST:
-        length_ok = length >= kw->size;
-        break;
-      }
-      // Length first: it guarantees the lexeme has kw->size characters.
-      if (length_ok && strncmp(lexeme, kw->text, kw->size) == 0)
-        found = kw;
-    }
-
-    if (found != NULL)
-    {
-      type = found->type;
-      skip = found->size;
-    }
-    else if (is_directive)
-      return LERR_INVALID_PREPROCESSOR_DIRECTIVE;
-  }
-
-  // The token text is whatever follows the matched keyword.
-  const size_t rest = length - skip;
-  token_t result    = {0};
-  result.type       = type;
-  result.column     = *column;
-  result.str_size   = rest;
-  result.str        = calloc(rest + 1, 1);
-  memcpy(result.str, lexeme + skip, rest);
-  result.str[rest] = '\0';
-
-  *column += length - 1;
-  buffer->used += length;
-  *token = result;
-  return LERR_OK;
-}
-
-/* Lex a decimal number literal starting at the buffer cursor.
- *
- * The literal is an optional leading '-' followed by the run of decimal
- * digits that fits in the readable part of the buffer.  The token's `str`
- * is a freshly allocated, NUL terminated copy of that text and `column`
- * is the value of `*column` on entry.  The cursor and `*column` both
- * advance by the literal's length.
- */
-token_t tokenise_number_literal(buffer_t *buffer, size_t *column)
-{
-  // A leading minus sign counts towards the literal's length.
-  size_t length = (buffer->data[buffer->used] == '-') ? 1 : 0;
-  while (length < space_left(buffer) &&
-         isdigit(buffer->data[buffer->used + length]))
-    ++length;
-
-  char *text = calloc(length + 1, 1);
-  memcpy(text, buffer->data + buffer->used, length);
-  text[length] = '\0';
-
-  const token_t token = {.type     = TOKEN_LITERAL_NUMBER,
-                         .str      = text,
-                         .str_size = length,
-                         .column   = *column};
-
-  buffer->used += length;
-  *column += length;
-  return token;
-}
-
-/* Lex a hex literal written as 'x' followed by hex digits.
- *
- * The cursor must be on the 'x'.  The token's `str` is a freshly
- * allocated C-style literal, "0x" followed by the digits, and `str_size`
- * includes those two leading characters.  The cursor moves past the 'x'
- * and the digits; `*column` grows by the number of digits only.
- */
-token_t tokenise_hex_literal(buffer_t *buffer, size_t *column)
-{
-  const size_t start_column = *column;
-
-  // Step over the 'x' that introduces the literal.
-  ++buffer->used;
-
-  size_t digits = 0;
-  while (digits < space_left(buffer) &&
-         is_valid_hex_char(buffer->data[buffer->used + digits]))
-    ++digits;
-
-  // Two extra bytes for the "0x" prefix, one for the terminator.
-  char *text = calloc(digits + 3, 1);
-  text[0]    = '0';
-  text[1]    = 'x';
-  memcpy(text + 2, buffer->data + buffer->used, digits);
-  text[digits + 2] = '\0';
-
-  buffer->used += digits;
-  *column += digits;
-
-  return (token_t){.type     = TOKEN_LITERAL_NUMBER,
-                   .str      = text,
-                   .str_size = digits + 2,
-                   .column   = start_column};
-}
-
-/* Lex an unescaped character literal such as 'a'.
- *
- * The cursor must be on the opening quote.  The byte after it becomes the
- * token's one-character, NUL terminated `str`.  The cursor and `*column`
- * advance by three.  The third byte is not inspected here, so the caller
- * is responsible for any check on the closing quote.
- */
-token_t tokenise_char_literal(buffer_t *buffer, size_t *column)
-{
-  char *text = calloc(2, 1);
-  text[0]    = buffer->data[buffer->used + 1];
-  text[1]    = '\0';
-
-  const token_t token = {.type     = TOKEN_LITERAL_CHAR,
-                         .str      = text,
-                         .str_size = 1,
-                         .column   = *column};
-
-  buffer->used += 3;
-  *column += 3;
-  return token;
-}
-
-/* Lex a double-quoted string literal.
- *
- * The cursor must be on the opening quote.  The token's `str` is a
- * freshly allocated, NUL terminated copy of everything up to (not
- * including) the next '"'; no escape sequences are interpreted.  `column`
- * is the value of `*column` on entry.
- *
- * If the buffer ends before a closing quote is found, the token holds the
- * rest of the buffer and the cursor stops exactly at `available`; it is
- * never moved past the end of the buffer.  If the allocation fails the
- * token has `str == NULL` and `str_size == 0`, but the input is still
- * consumed.
- */
-token_t tokenise_string_literal(buffer_t *buffer, size_t *column)
-{
-  // Step over the opening quote.
-  ++buffer->used;
-
-  const size_t start = buffer->used;
-  size_t length      = 0;
-  while (start + length < buffer->available &&
-         buffer->data[start + length] != '\"')
-    ++length;
-
-  // Only a string that actually found its closing quote has one to skip.
-  const bool terminated = start + length < buffer->available;
-  const size_t consumed = length + (terminated ? 1 : 0);
-
-  token_t t = {.type     = TOKEN_LITERAL_STRING,
-               .column   = *column,
-               .str      = calloc(length + 1, 1),
-               .str_size = length};
-  if (t.str != NULL)
-  {
-    memcpy(t.str, buffer->data + start, length);
-    t.str[length] = '\0';
-  }
-  else
-    t.str_size = 0;
-
-  *column += consumed;
-  buffer->used += consumed;
-  return t;
-}
-
-/* Free every token string appended to `tokens` so far, then the token
- * storage itself.  Used when lexing is abandoned part-way through, so
- * that already lexed tokens are not leaked.
- */
-static void tokenise_buffer_discard(token_stream_t *tokens)
-{
-  // While lexing, `used` counts bytes of token_t appended to `data`.
-  token_t *lexed     = (token_t *)tokens->data;
-  const size_t count = tokens->used / sizeof(token_t);
-  for (size_t i = 0; i < count; ++i)
-    free(lexed[i].str);
-  free(tokens->data);
-  tokens->data = NULL;
-  tokens->used = 0;
-}
-
-/* Lex the readable remainder of `buffer` into a stream of tokens.
- *
- * Whitespace and NUL bytes separate tokens; ';' starts a comment running
- * to the end of the line.  Recognised tokens are '*', double-quoted
- * strings, decimal numbers (optionally negative), hex numbers written as
- * 'x' followed by hex digits, symbols/keywords, and character literals
- * with the escapes \n \t \r \\.
- *
- * On success returns LERR_OK and stores the stream in `*tokens_ptr` with
- * `available` set to the number of tokens and `used` reset to 0.  Every
- * token owns its `str`.
- *
- * On failure returns the error code, frees everything lexed so far and
- * leaves `*tokens_ptr` untouched.  Failures are: a malformed character
- * literal, an unknown preprocessor directive, or a byte that starts no
- * token.
- */
-lerr_t tokenise_buffer(buffer_t *buffer, token_stream_t *tokens_ptr)
-{
-  size_t column = 0, line = 1;
-  token_stream_t tokens = {0};
-  darr_init(&tokens, sizeof(token_t));
-  while (space_left(buffer) != 0)
-  {
-    bool is_token = true;
-    token_t t     = {0};
-    char c        = buffer->data[buffer->used];
-    // <ctype.h> functions need a value representable as unsigned char.
-    if (isspace((unsigned char)c) || c == '\0')
-    {
-      // Clean whitespace
-      for (; space_left(buffer) > 0 &&
-             (isspace((unsigned char)c) || c == '\0');
-           ++buffer->used, c = buffer->data[buffer->used])
-      {
-        ++column;
-        if (c == '\n')
-        {
-          column = 0;
-          ++line;
-        }
-      }
-      ++column;
-      is_token = false;
-    }
-    else if (c == ';')
-    {
-      // Start lexing at next line
-      for (; space_left(buffer) > 0 && c != '\n';
-           ++buffer->used, c = buffer->data[buffer->used])
-        continue;
-      column = 0;
-      ++line;
-      ++buffer->used;
-      is_token = false;
-    }
-    else if (c == '*')
-    {
-      t = (token_t){.type     = TOKEN_STAR,
-                    .column   = column,
-                    .str      = malloc(1),
-                    .str_size = 1};
-      t.str[0] = '\0';
-      ++buffer->used;
-    }
-    else if (c == '\"')
-      t = tokenise_string_literal(buffer, &column);
-    else if (isdigit((unsigned char)c) ||
-             (space_left(buffer) > 1 && c == '-' &&
-              isdigit((unsigned char)buffer->data[buffer->used + 1])))
-      t = tokenise_number_literal(buffer, &column);
-    else if (c == 'x' && space_left(buffer) > 1 &&
-             is_valid_hex_char(buffer->data[buffer->used + 1]))
-      t = tokenise_hex_literal(buffer, &column);
-    else if (is_symbol(c))
-    {
-      lerr_t lerr = tokenise_symbol(buffer, &column, &t);
-      if (lerr)
-      {
-        tokenise_buffer_discard(&tokens);
-        return lerr;
-      }
-    }
-    else if (c == '\'')
-    {
-      if (space_left(buffer) < 2)
-      {
-        tokenise_buffer_discard(&tokens);
-        return LERR_INVALID_CHAR_LITERAL;
-      }
-      else if (buffer->data[buffer->used + 1] == '\\')
-      {
-        char escape = '\0';
-        if (space_left(buffer) < 3 || buffer->data[buffer->used + 3] != '\'')
-        {
-          tokenise_buffer_discard(&tokens);
-          return LERR_INVALID_CHAR_LITERAL;
-        }
-        switch (buffer->data[buffer->used + 2])
-        {
-        case 'n':
-          escape = '\n';
-          break;
-        case 't':
-          escape = '\t';
-          break;
-        case 'r':
-          escape = '\r';
-          break;
-        case '\\':
-          escape = '\\';
-          break;
-        default:
-          tokenise_buffer_discard(&tokens);
-          return LERR_INVALID_CHAR_LITERAL;
-        }
-
-        t = (token_t){.type     = TOKEN_LITERAL_CHAR,
-                      .str      = malloc(2),
-                      .str_size = 1,
-                      .column   = column};
-        column += 2;
-        buffer->used += 4;
-        t.str[0] = escape;
-        t.str[1] = '\0';
-      }
-      else if (buffer->data[buffer->used + 2] != '\'')
-      {
-        // Same closing-quote requirement as the escaped form above.
-        // space_left() >= 2 guarantees this index is inside the buffer.
-        tokenise_buffer_discard(&tokens);
-        return LERR_INVALID_CHAR_LITERAL;
-      }
-      else
-        t = tokenise_char_literal(buffer, &column);
-    }
-    else
-    {
-      // No rule starts with this byte.  Falling through would append an
-      // empty token without moving the cursor, looping forever, so reject
-      // the input instead.
-      // NOTE(review): lerr_t has no "unexpected character" code visible
-      // here; this reuses the closest one until a dedicated code exists.
-      tokenise_buffer_discard(&tokens);
-      return LERR_INVALID_CHAR_LITERAL;
-    }
-
-    if (is_token)
-    {
-      t.line = line;
-      darr_append_bytes(&tokens, (byte *)&t, sizeof(t));
-    }
-  }
-  // Switch the stream from "bytes written" to "tokens available to read".
-  tokens.available = tokens.used / sizeof(token_t);
-  tokens.used      = 0;
-  *tokens_ptr      = tokens;
-  return LERR_OK;
-}