diff options
Diffstat (limited to 'asm/lexer.c')
-rw-r--r-- | asm/lexer.c | 623 |
1 files changed, 0 insertions, 623 deletions
diff --git a/asm/lexer.c b/asm/lexer.c deleted file mode 100644 index a4905fb..0000000 --- a/asm/lexer.c +++ /dev/null @@ -1,623 +0,0 @@ -/* Copyright (C) 2023 Aryadev Chavali - - * You may distribute and modify this code under the terms of the - * GPLv2 license. You should have received a copy of the GPLv2 - * license with this file. If not, please write to: - * aryadev@aryadevchavali.com. - - * Created: 2023-10-24 - * Author: Aryadev Chavali - * Description: Lexer for assembly language - */ - -#include <assert.h> -#include <ctype.h> -#include <stdbool.h> -#include <stdio.h> -#include <string.h> - -#include <lib/inst.h> - -#include "./lexer.h" - -const char *token_type_as_cstr(token_type_t type) -{ - switch (type) - { - case TOKEN_PP_USE: - return "PP_USE"; - case TOKEN_PP_CONST: - return "PP_CONST"; - case TOKEN_PP_END: - return "PP_END"; - case TOKEN_PP_REFERENCE: - return "PP_REFERENCE"; - case TOKEN_GLOBAL: - return "GLOBAL"; - case TOKEN_STAR: - return "STAR"; - case TOKEN_LITERAL_STRING: - return "LITERAL_STRING"; - case TOKEN_LITERAL_NUMBER: - return "LITERAL_NUMBER"; - case TOKEN_LITERAL_CHAR: - return "LITERAL_CHAR"; - case TOKEN_NOOP: - return "NOOP"; - case TOKEN_HALT: - return "HALT"; - case TOKEN_PUSH: - return "PUSH"; - case TOKEN_POP: - return "POP"; - case TOKEN_PUSH_REG: - return "PUSH_REG"; - case TOKEN_MOV: - return "MOV"; - case TOKEN_DUP: - return "DUP"; - case TOKEN_MALLOC: - return "MALLOC"; - case TOKEN_MALLOC_STACK: - return "MALLOC_STACK"; - case TOKEN_MSET: - return "MSET"; - case TOKEN_MSET_STACK: - return "MSET_STACK"; - case TOKEN_MGET: - return "MGET"; - case TOKEN_MGET_STACK: - return "MGET_STACK"; - case TOKEN_MDELETE: - return "MDELETE"; - case TOKEN_MSIZE: - return "MSIZE"; - case TOKEN_NOT: - return "NOT"; - case TOKEN_OR: - return "OR"; - case TOKEN_AND: - return "AND"; - case TOKEN_XOR: - return "XOR"; - case TOKEN_EQ: - return "EQ"; - case TOKEN_LT: - return "LT"; - case TOKEN_LTE: - return "LTE"; - case TOKEN_GT: - return "GT"; - case TOKEN_GTE: - return "GTE"; - case TOKEN_PLUS: - return "PLUS"; - case TOKEN_SUB: - return "SUB"; - case TOKEN_MULT: - return "MULT"; - case TOKEN_PRINT: - return "PRINT"; - case TOKEN_JUMP_ABS: - return "JUMP_ABS"; - case TOKEN_JUMP_STACK: - return "JUMP_STACK"; - case TOKEN_JUMP_IF: - return "JUMP_IF"; - case TOKEN_CALL: - return "CALL"; - case TOKEN_CALL_STACK: - return "CALL_STACK"; - case TOKEN_RET: - return "RET"; - case TOKEN_SYMBOL: - return "SYMBOL"; - } - return ""; -} - -const char *lerr_as_cstr(lerr_t lerr) -{ - switch (lerr) - { - case LERR_OK: - return "OK"; - case LERR_INVALID_CHAR_LITERAL: - return "INVALID_CHAR_LITERAL"; - case LERR_INVALID_PREPROCESSOR_DIRECTIVE: - return "INVALID_PREPROCESSOR_DIRECTIVE"; - } - return ""; -} - -token_t token_copy(token_t t) -{ - token_t new = t; - new.str = malloc(t.str_size + 1); - memcpy(new.str, t.str, t.str_size); - new.str[t.str_size] = '\0'; - return new; -} - -size_t space_left(buffer_t *buffer) -{ - if (buffer->available == buffer->used) - return 0; - return buffer->available - 1 - buffer->used; -} - -char uppercase(char c) -{ - if (c >= 'a' && c <= 'z') - return (c - 'a') + 'A'; - return c; -} - -bool is_symbol(char c) -{ - return isalpha(c) || isdigit(c) || c == '-' || c == '_' || c == '.' || - c == ':' || c == '(' || c == ')' || c == '%' || c == '$'; -} - -bool is_valid_hex_char(char c) -{ - return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || - (c >= 'A' && c <= 'F'); -} - -lerr_t tokenise_symbol(buffer_t *buffer, size_t *column, token_t *token) -{ - static_assert(NUMBER_OF_OPCODES == 98, "tokenise_buffer: Out of date!"); - - size_t sym_size = 0; - for (; sym_size < space_left(buffer) && - is_symbol(buffer->data[buffer->used + sym_size]); - ++sym_size) - buffer->data[buffer->used + sym_size] = - uppercase(buffer->data[buffer->used + sym_size]); - - token_t ret = {0}; - char *opcode = (char *)buffer->data + buffer->used; - - bool is_opcode = true; - token_type_t type = 0; - size_t offset = 0; - - if (sym_size > 1 && strncmp(opcode, "%", 1) == 0) - { - // Some preprocessing directive - if (sym_size > 6 && strncmp(opcode + 1, "CONST", 5) == 0) - { - type = TOKEN_PP_CONST; - offset = 6; - } - else if (sym_size == 4 && strncmp(opcode + 1, "USE", 3) == 0) - { - type = TOKEN_PP_USE; - offset = 4; - } - else if (sym_size == 4 && strncmp(opcode + 1, "END", 3) == 0) - { - type = TOKEN_PP_END; - offset = 4; - } - else - return LERR_INVALID_PREPROCESSOR_DIRECTIVE; - } - else if (sym_size > 1 && strncmp(opcode, "$", 1) == 0) - { - // A reference to a preprocessing constant - offset = 1; - type = TOKEN_PP_REFERENCE; - } - else if (sym_size == 4 && strncmp(opcode, "NOOP", 4) == 0) - { - offset = 4; - type = TOKEN_NOOP; - } - else if (sym_size == 4 && strncmp(opcode, "HALT", 4) == 0) - { - offset = 4; - type = TOKEN_HALT; - } - else if (sym_size > 9 && strncmp(opcode, "PUSH.REG.", 9) == 0) - { - offset = 9; - type = TOKEN_PUSH_REG; - } - else if (sym_size > 5 && strncmp(opcode, "PUSH.", 5) == 0) - { - offset = 5; - type = TOKEN_PUSH; - } - else if (sym_size > 4 && strncmp(opcode, "POP.", 4) == 0) - { - offset = 4; - type = TOKEN_POP; - } - else if (sym_size > 4 && strncmp(opcode, "MOV.", 4) == 0) - { - offset = 4; - type = TOKEN_MOV; - } - else if (sym_size > 4 && strncmp(opcode, "DUP.", 4) == 0) - { - offset = 4; - type = TOKEN_DUP; - } - else if (sym_size > 13 && strncmp(opcode, "MALLOC.STACK.", 13) == 0) - { - offset = 13; - type = TOKEN_MALLOC_STACK; - } - else if (sym_size > 7 && strncmp(opcode, "MALLOC.", 7) == 0) - { - offset = 7; - type = TOKEN_MALLOC; - } - else if (sym_size > 11 && strncmp(opcode, "MSET.STACK.", 11) == 0) - { - offset = 11; - type = TOKEN_MSET_STACK; - } - else if (sym_size > 5 && strncmp(opcode, "MSET.", 5) == 0) - { - offset = 5; - type = TOKEN_MSET; - } - else if (sym_size > 11 && strncmp(opcode, "MGET.STACK.", 11) == 0) - { - offset = 11; - type = TOKEN_MGET_STACK; - } - else if (sym_size > 5 && strncmp(opcode, "MGET.", 5) == 0) - { - offset = 5; - type = TOKEN_MGET; - } - else if (sym_size == 7 && strncmp(opcode, "MDELETE", 7) == 0) - { - offset = 7; - type = TOKEN_MDELETE; - } - else if (sym_size == 5 && strncmp(opcode, "MSIZE", 5) == 0) - { - offset = 5; - type = TOKEN_MSIZE; - } - else if (sym_size > 4 && strncmp(opcode, "NOT.", 4) == 0) - { - offset = 4; - type = TOKEN_NOT; - } - else if (sym_size > 3 && strncmp(opcode, "OR.", 3) == 0) - { - offset = 3; - type = TOKEN_OR; - } - else if (sym_size > 4 && strncmp(opcode, "AND.", 4) == 0) - { - offset = 4; - type = TOKEN_AND; - } - else if (sym_size > 4 && strncmp(opcode, "XOR.", 4) == 0) - { - offset = 4; - type = TOKEN_XOR; - } - else if (sym_size >= 3 && strncmp(opcode, "EQ.", 3) == 0) - { - offset = 3; - type = TOKEN_EQ; - } - else if (sym_size > 4 && strncmp(opcode, "LTE.", 4) == 0) - { - offset = 4; - type = TOKEN_LTE; - } - else if (sym_size > 3 && strncmp(opcode, "LT.", 3) == 0) - { - offset = 3; - type = TOKEN_LT; - } - else if (sym_size > 4 && strncmp(opcode, "GTE.", 4) == 0) - { - offset = 4; - type = TOKEN_GTE; - } - else if (sym_size > 3 && strncmp(opcode, "GT.", 3) == 0) - { - offset = 3; - type = TOKEN_GT; - } - else if (sym_size > 4 && strncmp(opcode, "SUB.", 4) == 0) - { - offset = 4; - type = TOKEN_SUB; - } - else if (sym_size > 5 && strncmp(opcode, "PLUS.", 5) == 0) - { - offset = 5; - type = TOKEN_PLUS; - } - else if (sym_size > 5 && strncmp(opcode, "MULT.", 5) == 0) - { - offset = 5; - type = TOKEN_MULT; - } - else if (sym_size > 6 && strncmp(opcode, "PRINT.", 6) == 0) - { - offset = 6; - type = TOKEN_PRINT; - } - else if (sym_size == 8 && strncmp(opcode, "JUMP.ABS", 8) == 0) - { - offset = 8; - type = TOKEN_JUMP_ABS; - } - else if (sym_size == 10 && strncmp(opcode, "JUMP.STACK", 10) == 0) - { - offset = 10; - type = TOKEN_JUMP_STACK; - } - else if (sym_size > 8 && strncmp(opcode, "JUMP.IF.", 8) == 0) - { - offset = 8; - type = TOKEN_JUMP_IF; - } - else if (sym_size == 10 && strncmp(opcode, "CALL.STACK", 10) == 0) - { - offset = 10; - type = TOKEN_CALL_STACK; - } - else if (sym_size == 4 && strncmp(opcode, "CALL", 4) == 0) - { - offset = 4; - type = TOKEN_CALL; - } - else if (sym_size == 3 && strncmp(opcode, "RET", 3) == 0) - { - offset = 3; - type = TOKEN_RET; - } - else if (sym_size == 6 && strncmp(opcode, "GLOBAL", 6) == 0) - { - offset = 6; - type = TOKEN_GLOBAL; - } - else - is_opcode = false; - - if (!is_opcode) - { - // Just a symbol, so no further manipulation - char *sym = malloc(sym_size + 1); - memcpy(sym, opcode, sym_size); - sym[sym_size] = '\0'; - ret = (token_t){.type = TOKEN_SYMBOL, - .str = sym, - .column = *column, - .str_size = sym_size}; - } - else - { - ret.type = type; - ret.column = *column; - if (offset == sym_size) - { - // There's no more to the string - ret.str = malloc(1); - ret.str[0] = '\0'; - } - else - { - // t.str is the remaining part of the string after the - // opcode - ret.str = calloc(sym_size - offset + 1, 1); - memcpy(ret.str, opcode + offset, sym_size - offset); - ret.str[sym_size - offset] = '\0'; - } - ret.str_size = sym_size - offset; - } - *column += sym_size - 1; - buffer->used += sym_size; - *token = ret; - return LERR_OK; -} - -token_t tokenise_number_literal(buffer_t *buffer, size_t *column) -{ - token_t token = { - .type = TOKEN_LITERAL_NUMBER, .str_size = 0, .column = *column}; - if (buffer->data[buffer->used] == '-') - ++token.str_size; - for (; token.str_size < space_left(buffer) && - isdigit(buffer->data[buffer->used + token.str_size]); - ++token.str_size) - continue; - token.str = calloc(token.str_size + 1, 1); - memcpy(token.str, buffer->data + buffer->used, token.str_size); - token.str[token.str_size] = '\0'; - buffer->used += token.str_size; - *column += token.str_size; - return token; -} - -token_t tokenise_hex_literal(buffer_t *buffer, size_t *column) -{ - // For the x part of the literal - ++buffer->used; - token_t token = { - .type = TOKEN_LITERAL_NUMBER, .str_size = 0, .column = *column}; - for (; token.str_size < space_left(buffer) && - is_valid_hex_char(buffer->data[buffer->used + token.str_size]); - ++token.str_size) - continue; - // Setup a proper C hex literal - token.str = calloc(token.str_size + 3, 1); - token.str[0] = '0'; - token.str[1] = 'x'; - memcpy(token.str + 2, buffer->data + buffer->used, token.str_size); - token.str[token.str_size + 2] = '\0'; - buffer->used += token.str_size; - *column += token.str_size; - - // Setup the first two characters - token.str_size += 2; - return token; -} - -token_t tokenise_char_literal(buffer_t *buffer, size_t *column) -{ - token_t token = { - .type = TOKEN_LITERAL_CHAR, .str_size = 1, .column = *column}; - token.str = calloc(2, 1); - token.str[0] = buffer->data[buffer->used + 1]; - token.str[1] = '\0'; - buffer->used += 3; - *column += 3; - return token; -} - -token_t tokenise_string_literal(buffer_t *buffer, size_t *column) -{ - ++buffer->used; - size_t string_size; - for (string_size = 0; string_size + buffer->used < buffer->available && - buffer->data[buffer->used + string_size] != '\"'; - ++string_size) - continue; - token_t t = {.type = TOKEN_LITERAL_STRING, - .column = *column, - .str = malloc(string_size + 1), - .str_size = string_size}; - memcpy(t.str, buffer->data + buffer->used, string_size); - t.str[string_size] = '\0'; - *column += string_size + 1; - buffer->used += string_size + 1; - return t; -} - -lerr_t tokenise_buffer(buffer_t *buffer, token_stream_t *tokens_ptr) -{ - size_t column = 0, line = 1; - token_stream_t tokens = {0}; - darr_init(&tokens, sizeof(token_t)); - while (space_left(buffer) != 0) - { - bool is_token = true; - token_t t = {0}; - char c = buffer->data[buffer->used]; - if (isspace(c) || c == '\0') - { - // Clean whitespace - for (; space_left(buffer) > 0 && (isspace(c) || c == '\0'); - ++buffer->used, c = buffer->data[buffer->used]) - { - ++column; - if (c == '\n') - { - column = 0; - ++line; - } - } - ++column; - is_token = false; - } - else if (c == ';') - { - // Start lexing at next line - for (; space_left(buffer) > 0 && c != '\n'; - ++buffer->used, c = buffer->data[buffer->used]) - continue; - column = 0; - ++line; - ++buffer->used; - is_token = false; - } - else if (c == '*') - { - t = (token_t){.type = TOKEN_STAR, - .column = column, - .str = malloc(1), - .str_size = 1}; - t.str[0] = '\0'; - ++buffer->used; - } - else if (c == '\"') - t = tokenise_string_literal(buffer, &column); - else if (isdigit(c) || (space_left(buffer) > 1 && c == '-' && - isdigit(buffer->data[buffer->used + 1]))) - t = tokenise_number_literal(buffer, &column); - else if (c == 'x' && space_left(buffer) > 1 && - is_valid_hex_char(buffer->data[buffer->used + 1])) - t = tokenise_hex_literal(buffer, &column); - else if (is_symbol(c)) - { - lerr_t lerr = tokenise_symbol(buffer, &column, &t); - if (lerr) - { - free(tokens.data); - return lerr; - } - } - else if (c == '\'') - { - if (space_left(buffer) < 2) - { - free(tokens.data); - return LERR_INVALID_CHAR_LITERAL; - } - else if (buffer->data[buffer->used + 1] == '\\') - { - char escape = '\0'; - if (space_left(buffer) < 3 || buffer->data[buffer->used + 3] != '\'') - { - free(tokens.data); - return LERR_INVALID_CHAR_LITERAL; - } - switch (buffer->data[buffer->used + 2]) - { - case 'n': - escape = '\n'; - break; - case 't': - escape = '\t'; - break; - case 'r': - escape = '\r'; - break; - case '\\': - escape = '\\'; - break; - default: - column += 2; - free(tokens.data); - return LERR_INVALID_CHAR_LITERAL; - break; - } - - t = (token_t){.type = TOKEN_LITERAL_CHAR, - .str = malloc(2), - .str_size = 1, - .column = column}; - column += 2; - buffer->used += 4; - t.str[0] = escape; - t.str[1] = '\0'; - } - else - t = tokenise_char_literal(buffer, &column); - } - - if (is_token) - { - t.line = line; - darr_append_bytes(&tokens, (byte *)&t, sizeof(t)); - } - } - tokens.available = tokens.used / sizeof(token_t); - tokens.used = 0; - *tokens_ptr = tokens; - return LERR_OK; -} |