diff options
-rw-r--r-- | asm/lexer.c | 378 | ||||
-rw-r--r-- | asm/lexer.h | 20 |
2 files changed, 371 insertions, 27 deletions
diff --git a/asm/lexer.c b/asm/lexer.c index a02c8e1..a4d9f5a 100644 --- a/asm/lexer.c +++ b/asm/lexer.c @@ -10,11 +10,14 @@ * Description: Lexer for assembly language */ +#include <assert.h> #include <ctype.h> #include <stdbool.h> #include <stdio.h> #include <string.h> +#include <lib/inst.h> + #include "./lexer.h" const char *token_type_as_cstr(token_type_t type) @@ -25,6 +28,46 @@ const char *token_type_as_cstr(token_type_t type) return "LITERAL_NUMBER"; case TOKEN_LITERAL_CHAR: return "LITERAL_CHAR"; + case TOKEN_NOOP: + return "NOOP"; + case TOKEN_HALT: + return "HALT"; + case TOKEN_PUSH: + return "PUSH"; + case TOKEN_POP: + return "POP"; + case TOKEN_PUSH_REG: + return "PUSH_REG"; + case TOKEN_MOV: + return "MOV"; + case TOKEN_DUP: + return "DUP"; + case TOKEN_NOT: + return "NOT"; + case TOKEN_OR: + return "OR"; + case TOKEN_AND: + return "AND"; + case TOKEN_XOR: + return "XOR"; + case TOKEN_EQ: + return "EQ"; + case TOKEN_LT: + return "LT"; + case TOKEN_LTE: + return "LTE"; + case TOKEN_GT: + return "GT"; + case TOKEN_GTE: + return "GTE"; + case TOKEN_PLUS: + return "PLUS"; + case TOKEN_PRINT: + return "PRINT"; + case TOKEN_JUMP: + return "JUMP"; + case TOKEN_JUMP_IF: + return "JUMP_IF"; case TOKEN_SYMBOL: return "SYMBOL"; } @@ -52,32 +95,179 @@ size_t space_left(buffer_t *buffer) return buffer->available - 1 - buffer->used; } +char uppercase(char c) +{ + if (c >= 'a' && c <= 'z') + return (c - 'a') + 'A'; + return c; +} + bool is_symbol(char c) { return isalpha(c) || c == '-' || c == '_' || c == '.'; } -char uppercase(char c) +bool is_valid_hex_char(char c) { - if (c >= 'a' && c <= 'z') - return (c - 'a') + 'A'; - return c; + return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || + (c >= 'A' && c <= 'F'); } token_t tokenise_symbol(buffer_t *buffer, size_t *column) { - token_t token = {.type = TOKEN_SYMBOL, .str_size = 0, .column = *column}; - for (; token.str_size < space_left(buffer) && - is_symbol(buffer->data[buffer->used + token.str_size]); - ++token.str_size) - buffer->data[buffer->used + token.str_size] = - uppercase(buffer->data[buffer->used + token.str_size]); - token.str = calloc(token.str_size + 1, 1); - memcpy(token.str, buffer->data + buffer->used, token.str_size); - token.str[token.str_size] = '\0'; - buffer->used += token.str_size; - *column += token.str_size; - return token; + static_assert(NUMBER_OF_OPCODES == 70, "tokenise_buffer: Out of date!"); + + size_t sym_size = 0; + for (; sym_size < space_left(buffer) && + is_symbol(buffer->data[buffer->used + sym_size]); + ++sym_size) + buffer->data[buffer->used + sym_size] = + uppercase(buffer->data[buffer->used + sym_size]); + + token_t ret = {0}; + char *opcode = (char *)buffer->data + buffer->used; + + bool is_opcode = true; + token_type_t type = 0; + size_t offset = 0; + + if (sym_size == 4 && strncmp(opcode, "NOOP", 4) == 0) + { + offset = 4; + type = TOKEN_NOOP; + } + else if (sym_size == 4 && strncmp(opcode, "HALT", 4) == 0) + { + offset = 4; + type = TOKEN_HALT; + } + else if (sym_size >= 8 && strncmp(opcode, "PUSH.REG", 8) == 0) + { + offset = 8; + type = TOKEN_PUSH_REG; + } + else if (sym_size >= 4 && strncmp(opcode, "PUSH", 4) == 0) + { + offset = 4; + type = TOKEN_PUSH; + } + else if (sym_size >= 3 && strncmp(opcode, "POP", 3) == 0) + { + offset = 3; + type = TOKEN_POP; + } + else if (sym_size >= 3 && strncmp(opcode, "MOV", 3) == 0) + { + offset = 3; + type = TOKEN_MOV; + } + else if (sym_size >= 3 && strncmp(opcode, "DUP", 3) == 0) + { + offset = 3; + type = TOKEN_DUP; + } + else if (sym_size >= 3 && strncmp(opcode, "NOT", 3) == 0) + { + offset = 3; + type = TOKEN_NOT; + } + else if (sym_size >= 2 && strncmp(opcode, "OR", 2) == 0) + { + offset = 2; + type = TOKEN_OR; + } + else if (sym_size >= 3 && strncmp(opcode, "AND", 3) == 0) + { + offset = 3; + type = TOKEN_AND; + } + else if (sym_size >= 3 && strncmp(opcode, "XOR", 3) == 0) + { + offset = 3; + type = TOKEN_XOR; + } + else if (sym_size >= 2 && strncmp(opcode, "EQ", 2) == 0) + { + offset = 2; + type = TOKEN_EQ; + } + else if (sym_size >= 3 && strncmp(opcode, "LTE", 3) == 0) + { + offset = 3; + type = TOKEN_LTE; + } + else if (sym_size >= 2 && strncmp(opcode, "LT", 2) == 0) + { + offset = 2; + type = TOKEN_LT; + } + else if (sym_size >= 3 && strncmp(opcode, "GTE", 3) == 0) + { + offset = 3; + type = TOKEN_GTE; + } + else if (sym_size >= 2 && strncmp(opcode, "GT", 2) == 0) + { + offset = 2; + type = TOKEN_GT; + } + else if (sym_size >= 4 && strncmp(opcode, "PLUS", 4) == 0) + { + offset = 4; + type = TOKEN_PLUS; + } + else if (sym_size >= 5 && strncmp(opcode, "PRINT", 5) == 0) + { + offset = 5; + type = TOKEN_PRINT; + } + else if (sym_size >= 7 && strncmp(opcode, "JUMP.IF", 7) == 0) + { + offset = 7; + type = TOKEN_JUMP_IF; + } + else if (sym_size >= 4 && strncmp(opcode, "JUMP", 4) == 0) + { + offset = 4; + type = TOKEN_JUMP; + } + else + is_opcode = false; + + if (!is_opcode) + { + // Just a symbol, so no further manipulation + char *sym = malloc(sym_size + 1); + memcpy(sym, opcode, sym_size); + sym[sym_size] = '\0'; + ret = (token_t){.type = TOKEN_SYMBOL, + .str = sym, + .column = *column, + .str_size = sym_size}; + } + else + { + ret.type = type; + ret.column = *column; + if (offset == sym_size) + { + // There's no more to the string + ret.str = malloc(1); + ret.str[0] = '\0'; + } + else + { + // t.str is the remaining part of the string after the + // opcode + ret.str = calloc(sym_size - offset + 1, 1); + memcpy(ret.str, opcode + offset, sym_size - offset); + ret.str[sym_size - offset] = '\0'; + } + ret.str_size = sym_size - offset; + } + *column += sym_size; + buffer->used += sym_size; + return ret; } token_t tokenise_number_literal(buffer_t *buffer, size_t *column) @@ -98,12 +288,6 @@ token_t tokenise_number_literal(buffer_t *buffer, size_t *column) return token; } -bool is_valid_hex_char(char c) -{ - return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || - (c >= 'A' && c <= 'F'); -} - token_t tokenise_hex_literal(buffer_t *buffer, size_t *column) { // For the x part of the literal @@ -125,7 +309,6 @@ token_t tokenise_hex_literal(buffer_t *buffer, size_t *column) // Setup the first two characters token.str_size += 2; - printf("hex_literal: %s, %lu\n", token.str, token.str_size); return token; } @@ -133,8 +316,9 @@ token_t tokenise_char_literal(buffer_t *buffer, size_t *column) { token_t token = { .type = TOKEN_LITERAL_CHAR, .str_size = 1, .column = *column}; - token.str = calloc(1, 1); + token.str = calloc(2, 1); token.str[0] = buffer->data[buffer->used + 1]; + token.str[1] = '\0'; buffer->used += 3; *column += 3; return token; @@ -184,7 +368,146 @@ lerr_t tokenise_buffer(buffer_t *buffer, token_stream_t *tokens_ptr) is_valid_hex_char(buffer->data[buffer->used + 1])) t = tokenise_hex_literal(buffer, &column); else if (is_symbol(c)) - t = tokenise_symbol(buffer, &column); + { + static_assert(NUMBER_OF_OPCODES == 70, "tokenise_buffer: Out of date!"); + token_t token = tokenise_symbol(buffer, &column); + char *opcode = token.str; + + bool is_opcode = true; + token_type_t type = 0; + size_t offset = 0; + + if (token.str_size == 4 && strncmp(opcode, "NOOP", 4) == 0) + { + offset = 4; + type = TOKEN_NOOP; + } + else if (token.str_size == 4 && strncmp(opcode, "HALT", 4) == 0) + { + offset = 4; + type = TOKEN_HALT; + } + else if (token.str_size >= 8 && strncmp(opcode, "PUSH.REG", 8) == 0) + { + offset = 8; + type = TOKEN_PUSH; + } + else if (token.str_size >= 4 && strncmp(opcode, "PUSH", 4) == 0) + { + offset = 4; + type = TOKEN_PUSH; + } + else if (token.str_size >= 3 && strncmp(opcode, "POP", 3) == 0) + { + offset = 3; + type = TOKEN_POP; + } + else if (token.str_size >= 3 && strncmp(opcode, "MOV", 3) == 0) + { + offset = 3; + type = TOKEN_MOV; + } + else if (token.str_size >= 3 && strncmp(opcode, "DUP", 3) == 0) + { + offset = 3; + type = TOKEN_DUP; + } + else if (token.str_size >= 3 && strncmp(opcode, "NOT", 3) == 0) + { + offset = 3; + type = TOKEN_NOT; + } + else if (token.str_size >= 2 && strncmp(opcode, "OR", 2) == 0) + { + offset = 2; + type = TOKEN_OR; + } + else if (token.str_size >= 3 && strncmp(opcode, "AND", 3) == 0) + { + offset = 3; + type = TOKEN_AND; + } + else if (token.str_size >= 3 && strncmp(opcode, "XOR", 3) == 0) + { + offset = 3; + type = TOKEN_XOR; + } + else if (token.str_size >= 2 && strncmp(opcode, "EQ", 2) == 0) + { + offset = 2; + type = TOKEN_EQ; + } + else if (token.str_size >= 3 && strncmp(opcode, "LTE", 3) == 0) + { + offset = 3; + type = TOKEN_LTE; + } + else if (token.str_size >= 2 && strncmp(opcode, "LT", 2) == 0) + { + offset = 2; + type = TOKEN_LT; + } + else if (token.str_size >= 3 && strncmp(opcode, "GTE", 3) == 0) + { + offset = 3; + type = TOKEN_GTE; + } + else if (token.str_size >= 2 && strncmp(opcode, "GT", 2) == 0) + { + offset = 2; + type = TOKEN_GT; + } + else if (token.str_size >= 4 && strncmp(opcode, "PLUS", 4) == 0) + { + offset = 4; + type = TOKEN_PLUS; + } + else if (token.str_size >= 5 && strncmp(opcode, "PRINT", 5) == 0) + { + offset = 5; + type = TOKEN_PRINT; + } + else if (token.str_size >= 1 && strncmp(opcode, "JUMP.IF", 7) == 0) + { + offset = 7; + type = TOKEN_JUMP_IF; + } + else if (token.str_size >= 6 && strncmp(opcode, "JUMP", 6) == 0) + { + offset = 6; + type = TOKEN_JUMP; + } + else + { + is_opcode = false; + t = token; + } + + if (!is_opcode) + // Just a symbol, so no further manipulation + t = token; + else + { + t.type = type; + t.column = token.column; + if (offset == token.str_size) + { + // There's no more to the string + t.str = malloc(1); + t.str[0] = '\0'; + } + else + { + // t.str is the remaining part of the string after the + // opcode + t.str = calloc(token.str_size - offset + 1, 1); + memcpy(t.str, token.str + offset, token.str_size - offset); + t.str[token.str_size - offset] = '\0'; + } + t.str_size = token.str_size - offset; + free(token.str); + } + } else if (c == '\'') { if (space_left(buffer) < 2) @@ -222,12 +545,13 @@ lerr_t tokenise_buffer(buffer_t *buffer, token_stream_t *tokens_ptr) } t = (token_t){.type = TOKEN_LITERAL_CHAR, - .str = malloc(1), + .str = malloc(2), .str_size = 1, .column = column}; - column += 4; + column += 2; buffer->used += 4; t.str[0] = escape; + t.str[1] = '\0'; } else t = tokenise_char_literal(buffer, &column); diff --git a/asm/lexer.h b/asm/lexer.h index 1e68d8b..78cf4c6 100644 --- a/asm/lexer.h +++ b/asm/lexer.h @@ -19,6 +19,26 @@ typedef enum TokenType { TOKEN_LITERAL_NUMBER, TOKEN_LITERAL_CHAR, + TOKEN_NOOP, + TOKEN_HALT, + TOKEN_PUSH, + TOKEN_POP, + TOKEN_PUSH_REG, + TOKEN_MOV, + TOKEN_DUP, + TOKEN_NOT, + TOKEN_OR, + TOKEN_AND, + TOKEN_XOR, + TOKEN_EQ, + TOKEN_LT, + TOKEN_LTE, + TOKEN_GT, + TOKEN_GTE, + TOKEN_PLUS, + TOKEN_PRINT, + TOKEN_JUMP, + TOKEN_JUMP_IF, TOKEN_SYMBOL, } token_type_t; |