diff options
author | Aryadev Chavali <aryadev@aryadevchavali.com> | 2023-10-24 18:20:59 +0100 |
---|---|---|
committer | Aryadev Chavali <aryadev@aryadevchavali.com> | 2023-10-24 18:20:59 +0100 |
commit | dbbfac1236ad783ccace11058c972d9eb5ef7c6c (patch) | |
tree | c3f1544e98fc35fda030b388b92a75e6ef9ccf45 | |
parent | 3aad3926d282ccc067eaafed735bddd210fba688 (diff) | |
download | ovm-dbbfac1236ad783ccace11058c972d9eb5ef7c6c.tar.gz ovm-dbbfac1236ad783ccace11058c972d9eb5ef7c6c.tar.bz2 ovm-dbbfac1236ad783ccace11058c972d9eb5ef7c6c.zip |
Wrote lexer for assembly
A fairly simple tokeniser: it doesn't do much yet and needs better error checking.
-rw-r--r-- | asm/main.c | 154 |
1 files changed, 148 insertions, 6 deletions
@@ -10,19 +10,37 @@ * Description: Assembly source code compiler, targeting OVM */ +#include <ctype.h> +#include <stdbool.h> #include <stdio.h> +#include <string.h> #include <lib/darr.h> typedef enum TokenType { - TOKEN_WHITESPACE, TOKEN_BYTE_LITERAL, TOKEN_HWORD_LITERAL, TOKEN_WORD_LITERAL, TOKEN_SYMBOL, } token_type_t; +const char *token_type_as_cstr(token_type_t type) +{ + switch (type) + { + case TOKEN_BYTE_LITERAL: + return "BYTE_LITERAL"; + case TOKEN_HWORD_LITERAL: + return "HWORD_LITERAL"; + case TOKEN_WORD_LITERAL: + return "WORD_LITERAL"; + case TOKEN_SYMBOL: + return "SYMBOL"; + } + return ""; +} + typedef struct { token_type_t type; @@ -33,14 +51,138 @@ typedef struct // We can use darr_read_file as an in memory buffer of source code // Then just create new darr's for tokens, then instructions. Then // emit bytecode in the end. +typedef darr_t buffer_t; + +size_t space_left(buffer_t *buffer) +{ + if (buffer->available == buffer->used) + return 0; + return buffer->available - 1 - buffer->used; +} + +bool is_symbol(char c) +{ + return isalpha(c) || c == '-' || c == '_'; +} + +token_t tokenise_symbol(buffer_t *buffer) +{ + token_t token = {.type = TOKEN_SYMBOL, .str_size = 0}; + for (; token.str_size < space_left(buffer) && + is_symbol(buffer->data[buffer->used + token.str_size]); + ++token.str_size) + continue; + token.str = calloc(token.str_size + 1, 1); + memcpy(token.str, buffer->data + buffer->used, token.str_size); + token.str[token.str_size] = '\0'; + buffer->used += token.str_size; + return token; +} + +void tokenise_literal(buffer_t *buffer, token_t *token) +{ + token->str_size = 0; + for (; token->str_size < space_left(buffer) && + isdigit(buffer->data[buffer->used + token->str_size]); + ++token->str_size) + continue; + token->str = calloc(token->str_size + 1, 1); + memcpy(token->str, buffer->data + buffer->used, token->str_size); + token->str[token->str_size] = '\0'; + buffer->used += token->str_size; +} + +token_t tokenise_byte_literal(buffer_t 
*buffer) +{ + token_t token = {.type = TOKEN_BYTE_LITERAL}; + tokenise_literal(buffer, &token); + return token; +} + +token_t tokenise_hword_literal(buffer_t *buffer) +{ + token_t token = {.type = TOKEN_HWORD_LITERAL}; + tokenise_literal(buffer, &token); + return token; +} + +token_t tokenise_word_literal(buffer_t *buffer) +{ + token_t token = {.type = TOKEN_WORD_LITERAL}; + tokenise_literal(buffer, &token); + return token; +} + +token_t *tokenise_buffer(buffer_t *buffer, size_t *n_tokens) +{ + buffer_t tokens = {0}; + darr_init(&tokens, sizeof(token_t)); + while (space_left(buffer) != 0) + { + bool is_token = true; + token_t t = {0}; + char c = buffer->data[buffer->used]; + if (isspace(c) || c == '\0') + { + // Clean whitespace + for (; space_left(buffer) > 0 && (isspace(c) || c == '\0'); + ++buffer->used, c = buffer->data[buffer->used]) + continue; + is_token = false; + } + else if (space_left(buffer) > 1 && isdigit(buffer->data[buffer->used + 1])) + { + // Parsing literals + + switch (c) + { + case 'b': + buffer->used++; + t = tokenise_byte_literal(buffer); + break; + case 'h': + buffer->used++; + t = tokenise_hword_literal(buffer); + break; + case 'w': + buffer->used++; + t = tokenise_word_literal(buffer); + break; + default: + // TODO: Lex Error (INVALID_LITERAL) + fprintf(stderr, "[LEX_ERROR]: Invalid literal `%c`\n", c); + exit(1); + } + } + else if (is_symbol(c)) + { + t = tokenise_symbol(buffer); + } + if (is_token) + darr_append_bytes(&tokens, (byte *)&t, sizeof(t)); + } + *n_tokens = tokens.used / sizeof(token_t); + return (token_t *)tokens.data; +} int main(void) { - FILE *fp = fopen("main.asm", "rb"); - darr_t darr = darr_read_file(fp); + FILE *fp = fopen("main.asm", "rb"); + buffer_t buffer = darr_read_file(fp); fclose(fp); - darr.data[darr.available - 1] = '\0'; - printf("%lu/%lu\n%s\n", darr.used, darr.available, darr.data); - free(darr.data); + + size_t n = 0; + token_t *tokens = tokenise_buffer(&buffer, &n); + printf("%lu bytes -> %lu tokens\n", 
buffer.used, n); + free(buffer.data); + + for (size_t i = 0; i < n; ++i) + printf("%s(%.*s)\n", token_type_as_cstr(tokens[i].type), + (int)tokens[i].str_size, tokens[i].str); + + // Free the tokens + for (size_t i = 0; i < n; ++i) + free(tokens[i].str); + free(tokens); return 0; } |