From dc96e12145374537079c848d2d5c29824003f932 Mon Sep 17 00:00:00 2001 From: Aryadev Chavali Date: Thu, 29 Jan 2026 03:43:04 +0000 Subject: [PATCH] parser -> lexer That's the real purpose of this module; it's not really generating an AST since ARL's syntax isn't tree like whatsoever. The next stage will be something closer to an AST, in the sense we'll be introducing: - Syntactical analysis - Type Checking --- Makefile | 4 +- .../arl/{parser/parser.h => lexer/lexer.h} | 32 ++--- include/arl/lexer/token.h | 73 +++++++++++ include/arl/parser/ast.h | 74 ----------- src/{parser/parser.c => lexer/lexer.c} | 86 ++++++------- src/lexer/token.c | 115 ++++++++++++++++++ src/main.c | 21 ++-- src/parser/ast.c | 115 ------------------ 8 files changed, 259 insertions(+), 261 deletions(-) rename include/arl/{parser/parser.h => lexer/lexer.h} (54%) create mode 100644 include/arl/lexer/token.h delete mode 100644 include/arl/parser/ast.h rename src/{parser/parser.c => lexer/lexer.c} (59%) create mode 100644 src/lexer/token.c delete mode 100644 src/parser/ast.c diff --git a/Makefile b/Makefile index 4b8cfb4..1980693 100644 --- a/Makefile +++ b/Makefile @@ -3,8 +3,8 @@ CC=cc DIST=build OUT=$(DIST)/arl.out -MODULES=. lib parser -UNITS=main lib/vec lib/sv parser/ast parser/parser +MODULES=. lib lexer +UNITS=main lib/vec lib/sv lexer/token lexer/lexer OBJECTS:=$(patsubst %,$(DIST)/%.o, $(UNITS)) LDFLAGS= diff --git a/include/arl/parser/parser.h b/include/arl/lexer/lexer.h similarity index 54% rename from include/arl/parser/parser.h rename to include/arl/lexer/lexer.h index e07918a..faef870 100644 --- a/include/arl/parser/parser.h +++ b/include/arl/lexer/lexer.h @@ -1,38 +1,38 @@ -/* parser.h: Parser which takes character buffers and yields an AST +/* lexer.h: Lexer which takes character buffers and yields a sequence of tokens. 
* Created: 2026-01-22 * Author: Aryadev Chavali * License: See end of file * Commentary: */ -#ifndef PARSER_H -#define PARSER_H +#ifndef LEXER_H +#define LEXER_H -#include +#include -/// Parser streams, utilised when generating an AST. +/// Token streams, utilised when lexing. typedef struct { u64 byte; sv_t contents; -} parse_stream_t; +} lex_stream_t; -/// Types of errors that may occur during parsing +/// Types of errors that may occur during lexing typedef enum { - PARSE_ERR_OK = 0, - PARSE_ERR_EXPECTED_SPEECH_MARKS, - PARSE_ERR_UNKNOWN_CHAR, -} parse_err_t; -const char *parse_err_to_string(parse_err_t err); + LEX_ERR_OK = 0, + LEX_ERR_EXPECTED_SPEECH_MARKS, + LEX_ERR_UNKNOWN_CHAR, +} lex_err_t; +const char *lex_err_to_string(lex_err_t err); -// Generates an AST from STREAM, storing it in OUT. Returns any errors it may -// generate. -parse_err_t parse(ast_t *out, parse_stream_t *stream); +// Generates a token stream from a lex_stream_t, storing it in OUT. Returns any +// errors it may generate. +lex_err_t lex_stream(token_stream_t *out, lex_stream_t *stream); // Computes the line and column that STREAM is currently pointing at in its // buffer, storing it in LINE and COL. -void parse_stream_get_line_col(parse_stream_t *stream, u64 *line, u64 *col); +void lex_stream_get_line_col(lex_stream_t *stream, u64 *line, u64 *col); #endif diff --git a/include/arl/lexer/token.h b/include/arl/lexer/token.h new file mode 100644 index 0000000..4719686 --- /dev/null +++ b/include/arl/lexer/token.h @@ -0,0 +1,73 @@ +/* token.h: General definition of tokens, and a sequence of them. + * Created: 2026-01-22 + * Author: Aryadev Chavali + * License: See end of file + * Commentary: + */ + +#ifndef TOKEN_H +#define TOKEN_H + +#include +#include +#include + +/// Types of tokens +typedef enum +{ + TOKEN_TYPE_KNOWN = 0, + TOKEN_TYPE_SYMBOL, + TOKEN_TYPE_STRING, + + NUM_TOKEN_TYPES, +} token_type_t; + +/// Known symbols which later stages would benefit from. 
+typedef enum +{ + TOKEN_KNOWN_PUTSTR, + NUM_TOKEN_KNOWNS, +} token_known_t; + +const char *token_known_to_cstr(token_known_t); + +/// Tokens are a tagged union +typedef struct +{ + u64 byte_location; + token_type_t type; + union + { + token_known_t as_known; + sv_t as_symbol; + sv_t as_string; + }; +} token_t; + +token_t token_known(u64 byte, token_known_t known); +token_t token_symbol(u64 byte, sv_t symbol); +token_t token_string(u64 byte, sv_t string); +void token_print(FILE *fp, token_t *token); + +/// Sequence of tokens +typedef struct +{ + vec_t vec; +} token_stream_t; + +void token_stream_free(token_stream_t *token); +void token_stream_print(FILE *fp, token_stream_t *token); + +#endif + +/* Copyright (C) 2026 Aryadev Chavali + + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the MIT License for details. + + * You may distribute and modify this code under the terms of the MIT License, + * which you should have received a copy of along with this program. If not, + * please go to . + + */ diff --git a/include/arl/parser/ast.h b/include/arl/parser/ast.h deleted file mode 100644 index 5ef8710..0000000 --- a/include/arl/parser/ast.h +++ /dev/null @@ -1,74 +0,0 @@ -/* ast.h: General definition of the AST and nodes within it. - * Created: 2026-01-22 - * Author: Aryadev Chavali - * License: See end of file - * Commentary: - */ - -#ifndef AST_H -#define AST_H - -#include -#include -#include - -/// Types the AST can encode -typedef enum -{ - AST_NODE_TYPE_KNOWN = 0, - AST_NODE_TYPE_SYMBOL, - AST_NODE_TYPE_STRING, - - NUM_AST_NODE_TYPES, -} ast_node_type_t; - -/// Known symbols - may reference callables or values. 
-typedef enum -{ - AST_KNOWN_PUTSTR, - - NUM_AST_KNOWNS, -} ast_known_t; - -const char *ast_known_to_cstr(ast_known_t); - -/// Node of the AST as a tagged union -typedef struct -{ - u64 byte_location; - ast_node_type_t type; - union - { - ast_known_t as_known; - sv_t as_symbol; - sv_t as_string; - }; -} ast_node_t; - -ast_node_t ast_node_known(u64 byte, ast_known_t known); -ast_node_t ast_node_symbol(u64 byte, sv_t symbol); -ast_node_t ast_node_string(u64 byte, sv_t string); -void ast_node_print(FILE *fp, ast_node_t *node); - -/// The AST as a flat collection of nodes -typedef struct -{ - vec_t nodes; -} ast_t; - -void ast_free(ast_t *ast); -void ast_print(FILE *fp, ast_t *ast); - -#endif - -/* Copyright (C) 2026 Aryadev Chavali - - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. See the MIT License for details. - - * You may distribute and modify this code under the terms of the MIT License, - * which you should have received a copy of along with this program. If not, - * please go to . - - */ diff --git a/src/parser/parser.c b/src/lexer/lexer.c similarity index 59% rename from src/parser/parser.c rename to src/lexer/lexer.c index abaabd0..147954e 100644 --- a/src/parser/parser.c +++ b/src/lexer/lexer.c @@ -1,44 +1,44 @@ -/* parser.c: Implementation of parser. +/* lexer.c: Implementation of lexer. 
* Created: 2026-01-22 * Author: Aryadev Chavali * License: See end of file - * Commentary: See /include/arl/parser/parser.h + * Commentary: See /include/arl/lexer/lexer.h */ #include #include +#include +#include #include -#include -#include /// Expected characters in a symbol static const char *SYMBOL_CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&'()*+,-./" ":;<=>?@\\^_`{|}~0123456789"; -const char *parse_err_to_string(parse_err_t err) +const char *lex_err_to_string(lex_err_t err) { switch (err) { - case PARSE_ERR_OK: + case LEX_ERR_OK: return "OK"; - case PARSE_ERR_EXPECTED_SPEECH_MARKS: + case LEX_ERR_EXPECTED_SPEECH_MARKS: return "EXPECTED_SPEECH_MARKS"; - case PARSE_ERR_UNKNOWN_CHAR: + case LEX_ERR_UNKNOWN_CHAR: return "UNKNOWN_CHAR"; default: - FAIL("Unexpected parse_err_t value: %d\n", err); + FAIL("Unexpected lex_err_t value: %d\n", err); } } /// Prototypes for streams -bool stream_eos(parse_stream_t *stream); -char stream_peek(parse_stream_t *stream); -void stream_advance(parse_stream_t *stream, u64 size); -u64 stream_size(parse_stream_t *stream); +bool stream_eos(lex_stream_t *stream); +char stream_peek(lex_stream_t *stream); +void stream_advance(lex_stream_t *stream, u64 size); +u64 stream_size(lex_stream_t *stream); -void parse_stream_get_line_col(parse_stream_t *stream, u64 *line, u64 *col) +void lex_stream_get_line_col(lex_stream_t *stream, u64 *line, u64 *col) { assert(stream && line && col && "Expected valid pointers."); for (u64 i = 0; i < stream->byte; ++i) @@ -56,11 +56,11 @@ void parse_stream_get_line_col(parse_stream_t *stream, u64 *line, u64 *col) } } -/// Prototypes for parsing subroutines -parse_err_t parse_string(parse_stream_t *stream, ast_node_t *ret); -parse_err_t parse_symbol(parse_stream_t *stream, ast_node_t *ret); +/// Prototypes for lexing subroutines +lex_err_t lex_string(lex_stream_t *stream, token_t *ret); +lex_err_t lex_symbol(lex_stream_t *stream, token_t *ret); -parse_err_t parse(ast_t *out, parse_stream_t 
*stream) +lex_err_t lex_stream(token_stream_t *out, lex_stream_t *stream) { assert(out && stream && "Expected valid pointers"); while (!stream_eos(stream)) @@ -76,32 +76,32 @@ parse_err_t parse(ast_t *out, parse_stream_t *stream) } else if (cur == '"') { - // we make a copy for parse_string to mess with - ast_node_t ret = {0}; - parse_err_t perr = parse_string(stream, &ret); + // we make a copy for lex_string to mess with + token_t ret = {0}; + lex_err_t perr = lex_string(stream, &ret); if (perr) return perr; - vec_append(&out->nodes, &ret, sizeof(ret)); + vec_append(&out->vec, &ret, sizeof(ret)); } else if (strchr(SYMBOL_CHARS, cur) && !isdigit(cur)) { - // we make a copy for parse_symbol to mess with - ast_node_t ret = {0}; - parse_err_t perr = parse_symbol(stream, &ret); + // we make a copy for lex_symbol to mess with + token_t ret = {0}; + lex_err_t perr = lex_symbol(stream, &ret); if (perr) return perr; - vec_append(&out->nodes, &ret, sizeof(ret)); + vec_append(&out->vec, &ret, sizeof(ret)); } else { - return PARSE_ERR_UNKNOWN_CHAR; + return LEX_ERR_UNKNOWN_CHAR; } } - return PARSE_ERR_OK; + return LEX_ERR_OK; } -parse_err_t parse_string(parse_stream_t *stream, ast_node_t *ret) +lex_err_t lex_string(lex_stream_t *stream, token_t *ret) { // Increment the cursor just past the first speechmark stream_advance(stream, 1); @@ -111,46 +111,46 @@ parse_err_t parse_string(parse_stream_t *stream, ast_node_t *ret) // If we're at the edge of the stream, there must not have been any // speechmarks. if (string.size + stream->byte == stream_size(stream)) - return PARSE_ERR_EXPECTED_SPEECH_MARKS; + return LEX_ERR_EXPECTED_SPEECH_MARKS; // `string` is well defined, package and throw it back. 
- *ret = ast_node_string(stream->byte - 1, string); + *ret = token_string(stream->byte - 1, string); stream_advance(stream, string.size + 1); - return PARSE_ERR_OK; + return LEX_ERR_OK; } -parse_err_t parse_symbol(parse_stream_t *stream, ast_node_t *ret) +lex_err_t lex_symbol(lex_stream_t *stream, token_t *ret) { sv_t symbol = sv_chop_left(stream->contents, stream->byte); symbol.size = sv_while(symbol, SYMBOL_CHARS); // see if symbol is one of the already known symbols - static_assert(NUM_AST_KNOWNS == 1, "Expected number of AST_KNOWNs"); - for (ast_known_t i = 0; i < NUM_AST_KNOWNS; ++i) + static_assert(NUM_TOKEN_KNOWNS == 1, "Expected number of TOKEN_KNOWNs"); + for (token_known_t i = 0; i < NUM_TOKEN_KNOWNS; ++i) { - const char *possible_known = ast_known_to_cstr(i); + const char *possible_known = token_known_to_cstr(i); if (strlen(possible_known) == symbol.size && strncmp(possible_known, symbol.data, symbol.size) == 0) { // Found a matching known symbol - *ret = ast_node_known(stream->byte, i); + *ret = token_known(stream->byte, i); goto end; } } // otherwise, it must be a fresh symbol i.e. 
user defined - *ret = ast_node_symbol(stream->byte, symbol); + *ret = token_symbol(stream->byte, symbol); end: stream_advance(stream, symbol.size); - return PARSE_ERR_OK; + return LEX_ERR_OK; } -bool stream_eos(parse_stream_t *stream) +bool stream_eos(lex_stream_t *stream) { return stream->byte >= stream->contents.size; } -char stream_peek(parse_stream_t *stream) +char stream_peek(lex_stream_t *stream) { if (stream_eos(stream)) return '\0'; @@ -158,7 +158,7 @@ char stream_peek(parse_stream_t *stream) return stream->contents.data[stream->byte]; } -void stream_advance(parse_stream_t *stream, u64 size) +void stream_advance(lex_stream_t *stream, u64 size) { if (stream->byte + size >= stream->contents.size) stream->byte = stream->contents.size; @@ -166,7 +166,7 @@ void stream_advance(parse_stream_t *stream, u64 size) stream->byte += size; } -u64 stream_size(parse_stream_t *stream) +u64 stream_size(lex_stream_t *stream) { return stream->contents.size; } diff --git a/src/lexer/token.c b/src/lexer/token.c new file mode 100644 index 0000000..2aefc0a --- /dev/null +++ b/src/lexer/token.c @@ -0,0 +1,115 @@ +/* token.c: Implementation of TOKEN constructor/destructor functions + * Created: 2026-01-22 + * Author: Aryadev Chavali + * License: See end of file + * Commentary: See /include/arl/lexer/token.h. 
+ */ + +#include +#include +#include + +const char *token_known_to_cstr(token_known_t known) +{ + switch (known) + { + case TOKEN_KNOWN_PUTSTR: + return "putstr"; + default: + FAIL("Unexpected TOKEN_KNOWN value: %d\n", known); + } +} + +token_t token_known(u64 byte, token_known_t known) +{ + return (token_t){ + .byte_location = byte, + .type = TOKEN_TYPE_KNOWN, + .as_known = known, + }; +} + +token_t token_string(u64 byte, sv_t string) +{ + return (token_t){ + .byte_location = byte, + .type = TOKEN_TYPE_STRING, + .as_string = string, + }; +} + +token_t token_symbol(u64 byte, sv_t symbol) +{ + return (token_t){ + .byte_location = byte, + .type = TOKEN_TYPE_SYMBOL, + .as_symbol = symbol, + }; +} + +void token_print(FILE *fp, token_t *token) +{ + if (!token) + { + fprintf(fp, "NIL"); + return; + } + switch (token->type) + { + case TOKEN_TYPE_KNOWN: + fprintf(fp, "KNOWN(%s)", token_known_to_cstr(token->as_known)); + break; + case TOKEN_TYPE_SYMBOL: + fprintf(fp, "SYMBOL(" PR_SV ")", SV_FMT(token->as_symbol)); + break; + case TOKEN_TYPE_STRING: + fprintf(fp, "STRING(" PR_SV ")", SV_FMT(token->as_string)); + break; + case NUM_TOKEN_TYPES: + default: + FAIL("Unexpected token type: %d\n", token->type); + } +} + +void token_stream_print(FILE *fp, token_stream_t *token) +{ + if (!token) + { + fprintf(fp, "{}"); + return; + } + fprintf(fp, "{"); + if (token->vec.size == 0) + { + fprintf(fp, "}\n"); + return; + } + + fprintf(fp, "\n"); + for (u64 i = 0; i < token->vec.size / sizeof(token_t); ++i) + { + token_t item = VEC_GET(&token->vec, i, token_t); + fprintf(fp, "\t[%lu]: ", i); + token_print(fp, &item); + fprintf(fp, "\n"); + } + fprintf(fp, "}"); +} + +void token_stream_free(token_stream_t *stream) +{ + // we can free the vector itself and we're done + vec_free(&stream->vec); +} + +/* Copyright (C) 2026 Aryadev Chavali + + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
FITNESS + * FOR A PARTICULAR PURPOSE. See the MIT License for details. + + * You may distribute and modify this code under the terms of the MIT License, + * which you should have received a copy of along with this program. If not, + * please go to . + + */ diff --git a/src/main.c b/src/main.c index 5f495a0..977eca8 100644 --- a/src/main.c +++ b/src/main.c @@ -12,11 +12,11 @@ #include #include +#include +#include #include #include #include -#include -#include int read_file(const char *filename, sv_t *ret) { @@ -114,30 +114,29 @@ int main(int argc, char *argv[]) LOG("%s => `" PR_SV "`\n", filename, SV_FMT(contents)); - parse_stream_t stream = {.byte = 0, .contents = contents}; - ast_t ast = {0}; - parse_err_t perr = parse(&ast, &stream); + lex_stream_t stream = {.byte = 0, .contents = contents}; + token_stream_t tokens = {0}; + lex_err_t perr = lex_stream(&tokens, &stream); if (perr) { u64 line = 1, col = 0; - parse_stream_get_line_col(&stream, &line, &col); + lex_stream_get_line_col(&stream, &line, &col); - LOG_ERR("%s:%lu:%lu: %s\n", filename, line, col, parse_err_to_string(perr)); + LOG_ERR("%s:%lu:%lu: %s\n", filename, line, col, lex_err_to_string(perr)); ret = 1; goto end; } - LOG("Parsed %lu nodes\n", ast.nodes.size / sizeof(ast_node_t)); + LOG("Lexed %lu tokens\n", tokens.vec.size / sizeof(token_t)); #if VERBOSE_LOGS - ast_print(stdout, &ast); + token_stream_print(stdout, &tokens); #endif printf("\n"); end: if (contents.data) free(contents.data); - if (ast.nodes.capacity > 0) - ast_free(&ast); + token_stream_free(&tokens); return ret; } diff --git a/src/parser/ast.c b/src/parser/ast.c deleted file mode 100644 index c63331d..0000000 --- a/src/parser/ast.c +++ /dev/null @@ -1,115 +0,0 @@ -/* ast.c: Implementation of AST constructor/destructor functions - * Created: 2026-01-22 - * Author: Aryadev Chavali - * License: See end of file - * Commentary: See /include/arl/parser/ast.h. 
- */ - -#include -#include -#include - -const char *ast_known_to_cstr(ast_known_t known) -{ - switch (known) - { - case AST_KNOWN_PUTSTR: - return "putstr"; - default: - FAIL("Unexpected AST_KNOWN value: %d\n", known); - } -} - -ast_node_t ast_node_known(u64 byte, ast_known_t known) -{ - return (ast_node_t){ - .byte_location = byte, - .type = AST_NODE_TYPE_KNOWN, - .as_known = known, - }; -} - -ast_node_t ast_node_string(u64 byte, sv_t string) -{ - return (ast_node_t){ - .byte_location = byte, - .type = AST_NODE_TYPE_STRING, - .as_string = string, - }; -} - -ast_node_t ast_node_symbol(u64 byte, sv_t symbol) -{ - return (ast_node_t){ - .byte_location = byte, - .type = AST_NODE_TYPE_SYMBOL, - .as_symbol = symbol, - }; -} - -void ast_node_print(FILE *fp, ast_node_t *node) -{ - if (!node) - { - fprintf(fp, "NIL"); - return; - } - switch (node->type) - { - case AST_NODE_TYPE_KNOWN: - fprintf(fp, "KNOWN(%s)", ast_known_to_cstr(node->as_known)); - break; - case AST_NODE_TYPE_SYMBOL: - fprintf(fp, "SYMBOL(" PR_SV ")", SV_FMT(node->as_symbol)); - break; - case AST_NODE_TYPE_STRING: - fprintf(fp, "STRING(" PR_SV ")", SV_FMT(node->as_string)); - break; - case NUM_AST_NODE_TYPES: - default: - FAIL("Unexpected node type: %d\n", node->type); - } -} - -void ast_print(FILE *fp, ast_t *ast) -{ - if (!ast) - { - fprintf(fp, "{}"); - return; - } - fprintf(fp, "{"); - if (ast->nodes.size == 0) - { - fprintf(fp, "}\n"); - return; - } - - fprintf(fp, "\n"); - for (u64 i = 0; i < ast->nodes.size / sizeof(ast_node_t); ++i) - { - ast_node_t item = VEC_GET(&ast->nodes, i, ast_node_t); - fprintf(fp, "\t[%lu]: ", i); - ast_node_print(fp, &item); - fprintf(fp, "\n"); - } - fprintf(fp, "}"); -} - -void ast_free(ast_t *ast) -{ - // we can free the vector itself and we're done - vec_free(&ast->nodes); -} - -/* Copyright (C) 2026 Aryadev Chavali - - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 
or FITNESS - * FOR A PARTICULAR PURPOSE. See the MIT License for details. - - * You may distribute and modify this code under the terms of the MIT License, - * which you should have received a copy of along with this program. If not, - * please go to . - - */