diff options
author | Aryadev Chavali <aryadev@aryadevchavali.com> | 2023-10-25 21:43:13 +0100 |
---|---|---|
committer | Aryadev Chavali <aryadev@aryadevchavali.com> | 2023-10-25 21:43:13 +0100 |
commit | 94abb9e3d43e7559befb0aa02894227ec02eab41 (patch) | |
tree | 56e17174850787fb7584e8817edf98ca7fcf29dd /asm/lexer.c | |
parent | dbbfac1236ad783ccace11058c972d9eb5ef7c6c (diff) | |
download | ovm-94abb9e3d43e7559befb0aa02894227ec02eab41.tar.gz ovm-94abb9e3d43e7559befb0aa02894227ec02eab41.tar.bz2 ovm-94abb9e3d43e7559befb0aa02894227ec02eab41.zip |
Separated lexer from main file in asm
Diffstat (limited to 'asm/lexer.c')
-rw-r--r-- | asm/lexer.c | 159 |
1 files changed, 159 insertions, 0 deletions
diff --git a/asm/lexer.c b/asm/lexer.c new file mode 100644 index 0000000..03f7d05 --- /dev/null +++ b/asm/lexer.c @@ -0,0 +1,159 @@ +/* Copyright (C) 2023 Aryadev Chavali + + * You may distribute and modify this code under the terms of the GPLv2 + * license. You should have received a copy of the GPLv2 license with + * this file. If not, please write to: aryadev@aryadevchavali.com. + + * Created: 2023-10-24 + * Author: Aryadev Chavali + * Description: Lexer for assembly language + */ + +#include <ctype.h> +#include <stdbool.h> +#include <stdio.h> +#include <string.h> + +#include "./lexer.h" + +const char *token_type_as_cstr(token_type_t type) +{ + switch (type) + { + case TOKEN_LITERAL_BYTE: + return "LITERAL_BYTE"; + case TOKEN_LITERAL_HWORD: + return "LITERAL_HWORD"; + case TOKEN_LITERAL_WORD: + return "LITERAL_WORD"; + case TOKEN_LITERAL_CHAR: + return "LITERAL_CHAR"; + case TOKEN_SYMBOL: + return "SYMBOL"; + } + return ""; +} + +size_t space_left(buffer_t *buffer) +{ + if (buffer->available == buffer->used) + return 0; + return buffer->available - 1 - buffer->used; +} + +bool is_symbol(char c) +{ + return isalpha(c) || c == '-' || c == '_'; +} + +token_t tokenise_symbol(buffer_t *buffer) +{ + token_t token = {.type = TOKEN_SYMBOL, .str_size = 0}; + for (; token.str_size < space_left(buffer) && + is_symbol(buffer->data[buffer->used + token.str_size]); + ++token.str_size) + continue; + token.str = calloc(token.str_size + 1, 1); + memcpy(token.str, buffer->data + buffer->used, token.str_size); + token.str[token.str_size] = '\0'; + buffer->used += token.str_size; + return token; +} + +void tokenise_literal(buffer_t *buffer, token_t *token) +{ + token->str_size = 0; + for (; token->str_size < space_left(buffer) && + isdigit(buffer->data[buffer->used + token->str_size]); + ++token->str_size) + continue; + token->str = calloc(token->str_size + 1, 1); + memcpy(token->str, buffer->data + buffer->used, token->str_size); + token->str[token->str_size] = '\0'; + buffer->used += token->str_size; +} + +token_t tokenise_byte_literal(buffer_t *buffer) +{ + token_t token = {.type = TOKEN_LITERAL_BYTE}; + tokenise_literal(buffer, &token); + return token; +} + +token_t tokenise_hword_literal(buffer_t *buffer) +{ + token_t token = {.type = TOKEN_LITERAL_HWORD}; + tokenise_literal(buffer, &token); + return token; +} + +token_t tokenise_word_literal(buffer_t *buffer) +{ + token_t token = {.type = TOKEN_LITERAL_WORD}; + tokenise_literal(buffer, &token); + return token; +} + +token_t tokenise_char_literal(buffer_t *buffer) +{ + token_t token = {.type = TOKEN_LITERAL_CHAR, .str_size = 1}; + token.str = calloc(1, 1); + token.str[0] = buffer->data[buffer->used + 1]; + buffer->used += 3; + return token; +} + +token_t *tokenise_buffer(buffer_t *buffer, size_t *n_tokens) +{ + buffer_t tokens = {0}; + darr_init(&tokens, sizeof(token_t)); + while (space_left(buffer) != 0) + { + bool is_token = true; + token_t t = {0}; + char c = buffer->data[buffer->used]; + if (isspace(c) || c == '\0') + { + // Clean whitespace + for (; space_left(buffer) > 0 && (isspace(c) || c == '\0'); + ++buffer->used, c = buffer->data[buffer->used]) + continue; + is_token = false; + } + else if (space_left(buffer) > 1 && isdigit(buffer->data[buffer->used + 1])) + { + // Parsing numeric literals + switch (c) + { + case 'b': + buffer->used++; + t = tokenise_byte_literal(buffer); + break; + case 'h': + buffer->used++; + t = tokenise_hword_literal(buffer); + break; + case 'w': + buffer->used++; + t = tokenise_word_literal(buffer); + break; + default: + // TODO: Lex Error (INVALID_NUMERIC_LITERAL) + exit(1); + } + } + else if (is_symbol(c)) + t = tokenise_symbol(buffer); + else if (c == '\'') + { + if (space_left(buffer) < 2 || buffer->data[buffer->used + 2] != '\'') + // TODO: Lex Error (INVALID_CHAR_LITERAL) + exit(1); + t = tokenise_char_literal(buffer); + } + if (is_token) + darr_append_bytes(&tokens, (byte *)&t, sizeof(t)); + } + *n_tokens = tokens.used / sizeof(token_t); + return (token_t *)tokens.data; +} |