From 94abb9e3d43e7559befb0aa02894227ec02eab41 Mon Sep 17 00:00:00 2001 From: Aryadev Chavali Date: Wed, 25 Oct 2023 21:43:13 +0100 Subject: Separated lexer from main file in asm --- Makefile | 2 +- asm/lexer.c | 159 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ asm/lexer.h | 39 +++++++++++++++ asm/main.c | 157 ++--------------------------------------------------------- 4 files changed, 202 insertions(+), 155 deletions(-) create mode 100644 asm/lexer.c create mode 100644 asm/lexer.h diff --git a/Makefile b/Makefile index b6ec0b3..2d2e538 100644 --- a/Makefile +++ b/Makefile @@ -31,7 +31,7 @@ VM_OUT=$(DIST)/ovm.out ## ASSEMBLY setup ASM_DIST=$(DIST)/asm ASM_SRC=asm -ASM_CODE:=$(addprefix $(ASM_SRC)/, ) +ASM_CODE:=$(addprefix $(ASM_SRC)/, lexer.c) ASM_OBJECTS:=$(ASM_CODE:$(ASM_SRC)/%.c=$(ASM_DIST)/%.o) ASM_DEPS:=$(ASM_OBJECTS:%.o=%.d) $(ASM_DIST)/main.d ASM_CFLAGS=$(CFLAGS) diff --git a/asm/lexer.c b/asm/lexer.c new file mode 100644 index 0000000..03f7d05 --- /dev/null +++ b/asm/lexer.c @@ -0,0 +1,159 @@ +/* Copyright (C) 2023 Aryadev Chavali + + * You may distribute and modify this code under the terms of the GPLv2 + * license. You should have received a copy of the GPLv2 license with + * this file. If not, please write to: aryadev@aryadevchavali.com. + + * Created: 2023-10-24 + * Author: Aryadev Chavali + * Description: Lexer for assembly language + */ + +#include +#include +#include +#include + +#include "./lexer.h" + +const char *token_type_as_cstr(token_type_t type) +{ + switch (type) + { + case TOKEN_LITERAL_BYTE: + return "LITERAL_BYTE"; + case TOKEN_LITERAL_HWORD: + return "LITERAL_HWORD"; + case TOKEN_LITERAL_WORD: + return "LITERAL_WORD"; + case TOKEN_LITERAL_CHAR: + return "LITERAL_CHAR"; + case TOKEN_SYMBOL: + return "SYMBOL"; + } + return ""; +} + +size_t space_left(buffer_t *buffer) +{ + if (buffer->available == buffer->used) + return 0; + return buffer->available - 1 - buffer->used; +} + +bool is_symbol(char c) +{ + return isalpha(c) || c == '-' || c == '_'; +} + +token_t tokenise_symbol(buffer_t *buffer) +{ + token_t token = {.type = TOKEN_SYMBOL, .str_size = 0}; + for (; token.str_size < space_left(buffer) && + is_symbol(buffer->data[buffer->used + token.str_size]); + ++token.str_size) + continue; + token.str = calloc(token.str_size + 1, 1); + memcpy(token.str, buffer->data + buffer->used, token.str_size); + token.str[token.str_size] = '\0'; + buffer->used += token.str_size; + return token; +} + +void tokenise_literal(buffer_t *buffer, token_t *token) +{ + token->str_size = 0; + for (; token->str_size < space_left(buffer) && + isdigit(buffer->data[buffer->used + token->str_size]); + ++token->str_size) + continue; + token->str = calloc(token->str_size + 1, 1); + memcpy(token->str, buffer->data + buffer->used, token->str_size); + token->str[token->str_size] = '\0'; + buffer->used += token->str_size; +} + +token_t tokenise_byte_literal(buffer_t *buffer) +{ + token_t token = {.type = TOKEN_LITERAL_BYTE}; + tokenise_literal(buffer, &token); + return token; +} + +token_t tokenise_hword_literal(buffer_t *buffer) +{ + token_t token = {.type = TOKEN_LITERAL_HWORD}; + tokenise_literal(buffer, &token); + return token; +} + +token_t tokenise_word_literal(buffer_t *buffer) +{ + token_t token = {.type = TOKEN_LITERAL_WORD}; + tokenise_literal(buffer, &token); + return token; +} + +token_t tokenise_char_literal(buffer_t *buffer) +{ + token_t token = {.type = TOKEN_LITERAL_CHAR, .str_size = 1}; + token.str = calloc(1, 1); + token.str[0] = buffer->data[buffer->used + 1]; + buffer->used += 3; + return token; +} + +token_t *tokenise_buffer(buffer_t *buffer, size_t *n_tokens) +{ + buffer_t tokens = {0}; + darr_init(&tokens, sizeof(token_t)); + while (space_left(buffer) != 0) + { + bool is_token = true; + token_t t = {0}; + char c = buffer->data[buffer->used]; + if (isspace(c) || c == '\0') + { + // Clean whitespace + for (; space_left(buffer) > 0 && (isspace(c) || c == '\0'); + ++buffer->used, c = buffer->data[buffer->used]) + continue; + is_token = false; + } + else if (space_left(buffer) > 1 && isdigit(buffer->data[buffer->used + 1])) + { + // Parsing numeric literals + switch (c) + { + case 'b': + buffer->used++; + t = tokenise_byte_literal(buffer); + break; + case 'h': + buffer->used++; + t = tokenise_hword_literal(buffer); + break; + case 'w': + buffer->used++; + t = tokenise_word_literal(buffer); + break; + default: + // TODO: Lex Error (INVALID_NUMERIC_LITERAL) + exit(1); + } + } + else if (is_symbol(c)) + t = tokenise_symbol(buffer); + else if (c == '\'') + { + if (space_left(buffer) < 2 || buffer->data[buffer->used + 2] != '\'') + // TODO: Lex Error (INVALID_CHAR_LITERAL) + exit(1); + t = tokenise_char_literal(buffer); + } + if (is_token) + darr_append_bytes(&tokens, (byte *)&t, sizeof(t)); + } + *n_tokens = tokens.used / sizeof(token_t); + return (token_t *)tokens.data; +} diff --git a/asm/lexer.h b/asm/lexer.h new file mode 100644 index 0000000..01badf4 --- /dev/null +++ b/asm/lexer.h @@ -0,0 +1,39 @@ +/* Copyright (C) 2023 Aryadev Chavali + + * You may distribute and modify this code under the terms of the GPLv2 + * license. You should have received a copy of the GPLv2 license with + * this file. If not, please write to: aryadev@aryadevchavali.com. + + * Created: 2023-10-24 + * Author: Aryadev Chavali + * Description: Lexer for assembly language + */ + +#ifndef LEXER_H +#define LEXER_H + +#include + +typedef enum TokenType +{ + TOKEN_LITERAL_BYTE, + TOKEN_LITERAL_CHAR, + TOKEN_LITERAL_HWORD, + TOKEN_LITERAL_WORD, + TOKEN_SYMBOL, +} token_type_t; + +typedef struct +{ + token_type_t type; + char *str; + size_t str_size; +} token_t; + +typedef darr_t buffer_t; + +const char *token_type_as_cstr(token_type_t type); + +token_t *tokenise_buffer(buffer_t *, size_t *); + +#endif diff --git a/asm/main.c b/asm/main.c index 042850a..2415fa1 100644 --- a/asm/main.c +++ b/asm/main.c @@ -10,165 +10,14 @@ * Description: Assembly source code compiler, targeting OVM */ -#include -#include -#include -#include - #include -typedef enum TokenType -{ - TOKEN_BYTE_LITERAL, - TOKEN_HWORD_LITERAL, - TOKEN_WORD_LITERAL, - TOKEN_SYMBOL, -} token_type_t; - -const char *token_type_as_cstr(token_type_t type) -{ - switch (type) - { - case TOKEN_BYTE_LITERAL: - return "BYTE_LITERAL"; - case TOKEN_HWORD_LITERAL: - return "HWORD_LITERAL"; - case TOKEN_WORD_LITERAL: - return "WORD_LITERAL"; - case TOKEN_SYMBOL: - return "SYMBOL"; - } - return ""; -} - -typedef struct -{ - token_type_t type; - char *str; - size_t str_size; -} token_t; - -// We can use darr_read_file as an in memory buffer of source code -// Then just create new darr's for tokens, then instructions. Then -// emit bytecode in the end. -typedef darr_t buffer_t; - -size_t space_left(buffer_t *buffer) -{ - if (buffer->available == buffer->used) - return 0; - return buffer->available - 1 - buffer->used; -} - -bool is_symbol(char c) -{ - return isalpha(c) || c == '-' || c == '_'; -} - -token_t tokenise_symbol(buffer_t *buffer) -{ - token_t token = {.type = TOKEN_SYMBOL, .str_size = 0}; - for (; token.str_size < space_left(buffer) && - is_symbol(buffer->data[buffer->used + token.str_size]); - ++token.str_size) - continue; - token.str = calloc(token.str_size + 1, 1); - memcpy(token.str, buffer->data + buffer->used, token.str_size); - token.str[token.str_size] = '\0'; - buffer->used += token.str_size; - return token; -} - -void tokenise_literal(buffer_t *buffer, token_t *token) -{ - token->str_size = 0; - for (; token->str_size < space_left(buffer) && - isdigit(buffer->data[buffer->used + token->str_size]); - ++token->str_size) - continue; - token->str = calloc(token->str_size + 1, 1); - memcpy(token->str, buffer->data + buffer->used, token->str_size); - token->str[token->str_size] = '\0'; - buffer->used += token->str_size; -} - -token_t tokenise_byte_literal(buffer_t *buffer) -{ - token_t token = {.type = TOKEN_BYTE_LITERAL}; - tokenise_literal(buffer, &token); - return token; -} - -token_t tokenise_hword_literal(buffer_t *buffer) -{ - token_t token = {.type = TOKEN_HWORD_LITERAL}; - tokenise_literal(buffer, &token); - return token; -} - -token_t tokenise_word_literal(buffer_t *buffer) -{ - token_t token = {.type = TOKEN_WORD_LITERAL}; - tokenise_literal(buffer, &token); - return token; -} - -token_t *tokenise_buffer(buffer_t *buffer, size_t *n_tokens) -{ - buffer_t tokens = {0}; - darr_init(&tokens, sizeof(token_t)); - while (space_left(buffer) != 0) - { - bool is_token = true; - token_t t = {0}; - char c = buffer->data[buffer->used]; - if (isspace(c) || c == '\0') - { - // Clean whitespace - for (; space_left(buffer) > 0 && (isspace(c) || c == '\0'); - ++buffer->used, c = buffer->data[buffer->used]) - continue; - is_token = false; - } - else if (space_left(buffer) > 1 && isdigit(buffer->data[buffer->used + 1])) - { - // Parsing literals - - switch (c) - { - case 'b': - buffer->used++; - t = tokenise_byte_literal(buffer); - break; - case 'h': - buffer->used++; - t = tokenise_hword_literal(buffer); - break; - case 'w': - buffer->used++; - t = tokenise_word_literal(buffer); - break; - default: - // TODO: Lex Error (INVALID_LITERAL) - fprintf(stderr, "[LEX_ERROR]: Invalid literal `%c`\n", c); - exit(1); - } - } - else if (is_symbol(c)) - { - t = tokenise_symbol(buffer); - } - if (is_token) - darr_append_bytes(&tokens, (byte *)&t, sizeof(t)); - } - *n_tokens = tokens.used / sizeof(token_t); - return (token_t *)tokens.data; -} +#include "./lexer.h" int main(void) { - FILE *fp = fopen("main.asm", "rb"); - buffer_t buffer = darr_read_file(fp); + FILE *fp = fopen("main.asm", "rb"); + darr_t buffer = darr_read_file(fp); fclose(fp); size_t n = 0; -- cgit v1.2.3-13-gbd6f