parser -> lexer

That's the real purpose of this module; it's not really generating an
AST, since ARL's syntax isn't tree-like whatsoever.

The next stage will be something closer to an AST, in the sense that
we'll be introducing:
- Syntactic analysis
- Type checking
This commit is contained in:
2026-01-29 03:43:04 +00:00
parent 42ac4f6bbb
commit dc96e12145
8 changed files with 259 additions and 261 deletions

View File

@@ -3,8 +3,8 @@ CC=cc
DIST=build DIST=build
OUT=$(DIST)/arl.out OUT=$(DIST)/arl.out
MODULES=. lib parser MODULES=. lib lexer
UNITS=main lib/vec lib/sv parser/ast parser/parser UNITS=main lib/vec lib/sv lexer/token lexer/lexer
OBJECTS:=$(patsubst %,$(DIST)/%.o, $(UNITS)) OBJECTS:=$(patsubst %,$(DIST)/%.o, $(UNITS))
LDFLAGS= LDFLAGS=

View File

@@ -1,38 +1,38 @@
/* parser.h: Parser which takes character buffers and yields an AST /* lexer.h: Lexer which takes character buffers and yields a sequence of tokens.
* Created: 2026-01-22 * Created: 2026-01-22
* Author: Aryadev Chavali * Author: Aryadev Chavali
* License: See end of file * License: See end of file
* Commentary: * Commentary:
*/ */
#ifndef PARSER_H #ifndef LEXER_H
#define PARSER_H #define LEXER_H
#include <arl/parser/ast.h> #include <arl/lexer/token.h>
/// Parser streams, utilised when generating an AST. /// Lexer input streams, utilised when lexing.
typedef struct typedef struct
{ {
u64 byte; u64 byte;
sv_t contents; sv_t contents;
} parse_stream_t; } lex_stream_t;
/// Types of errors that may occur during parsing /// Types of errors that may occur during lexing
typedef enum typedef enum
{ {
PARSE_ERR_OK = 0, LEX_ERR_OK = 0,
PARSE_ERR_EXPECTED_SPEECH_MARKS, LEX_ERR_EXPECTED_SPEECH_MARKS,
PARSE_ERR_UNKNOWN_CHAR, LEX_ERR_UNKNOWN_CHAR,
} parse_err_t; } lex_err_t;
const char *parse_err_to_string(parse_err_t err); const char *lex_err_to_string(lex_err_t err);
// Generates an AST from STREAM, storing it in OUT. Returns any errors it may // Generates a token stream from a lex_stream_t, storing it in OUT. Returns any
// generate. // errors it may generate.
parse_err_t parse(ast_t *out, parse_stream_t *stream); lex_err_t lex_stream(token_stream_t *out, lex_stream_t *stream);
// Computes the line and column that STREAM is currently pointing at in its // Computes the line and column that STREAM is currently pointing at in its
// buffer, storing it in LINE and COL. // buffer, storing it in LINE and COL.
void parse_stream_get_line_col(parse_stream_t *stream, u64 *line, u64 *col); void lex_stream_get_line_col(lex_stream_t *stream, u64 *line, u64 *col);
#endif #endif

73
include/arl/lexer/token.h Normal file
View File

@@ -0,0 +1,73 @@
/* token.h: General definition of tokens, and a sequence of them.
* Created: 2026-01-22
* Author: Aryadev Chavali
* License: See end of file
* Commentary:
*/
#ifndef TOKEN_H
#define TOKEN_H
#include <arl/lib/base.h>
#include <arl/lib/sv.h>
#include <arl/lib/vec.h>
/// Types of tokens. This is the tag that says which member of the anonymous
/// union inside token_t is meaningful.
typedef enum
{
// Explicitly zero, so a zero-initialised token_t carries this tag.
TOKEN_TYPE_KNOWN = 0,
// Payload is token_t.as_symbol.
TOKEN_TYPE_SYMBOL,
// Payload is token_t.as_string.
TOKEN_TYPE_STRING,
// Number of real token types; not a valid tag (token_print treats it as an
// unexpected value).
NUM_TOKEN_TYPES,
} token_type_t;
/// Known symbols which later stages would benefit from.
typedef enum
{
// The symbol spelled "putstr" (see token_known_to_cstr).
TOKEN_KNOWN_PUTSTR,
// Number of known symbols; not a symbol itself. The lexer uses it as the
// bound when comparing a symbol against every known spelling.
NUM_TOKEN_KNOWNS,
} token_known_t;
// Returns the spelling of the given known symbol as a C string.
const char *token_known_to_cstr(token_known_t);
/// Tokens are a tagged union: `type` selects which member of the anonymous
/// union holds the payload.
typedef struct
{
// Byte offset handed to the constructor. The lexer passes the stream position
// at which the token starts.
u64 byte_location;
token_type_t type;
union
{
// Meaningful when type == TOKEN_TYPE_KNOWN.
token_known_t as_known;
// Meaningful when type == TOKEN_TYPE_SYMBOL.
sv_t as_symbol;
// Meaningful when type == TOKEN_TYPE_STRING.
sv_t as_string;
};
} token_t;
// Constructors: each returns a token_t by value with `byte_location` set to
// BYTE, `type` set to the matching tag and the payload stored as given.
// NOTE(review): the sv_t payloads appear to be non-owning views into the lexed
// buffer, so that buffer presumably has to outlive the token - confirm against
// the sv_t definition.
token_t token_known(u64 byte, token_known_t known);
token_t token_symbol(u64 byte, sv_t symbol);
token_t token_string(u64 byte, sv_t string);
// Writes a textual form of TOKEN to FP; a NULL token is written as "NIL".
void token_print(FILE *fp, token_t *token);
/// Sequence of tokens
typedef struct
{
// Holds token_t values by value; the token count is computed as
// vec.size / sizeof(token_t).
vec_t vec;
} token_stream_t;
// Frees the vector backing the stream.
void token_stream_free(token_stream_t *token);
// Writes every token of the stream to FP, one per line, wrapped in braces.
void token_stream_print(FILE *fp, token_stream_t *token);
#endif
/* Copyright (C) 2026 Aryadev Chavali
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the MIT License for details.
* You may distribute and modify this code under the terms of the MIT License,
* which you should have received a copy of along with this program. If not,
* please go to <https://opensource.org/license/MIT>.
*/

View File

@@ -1,74 +0,0 @@
/* ast.h: General definition of the AST and nodes within it.
* Created: 2026-01-22
* Author: Aryadev Chavali
* License: See end of file
* Commentary:
*/
#ifndef AST_H
#define AST_H
#include <arl/lib/base.h>
#include <arl/lib/sv.h>
#include <arl/lib/vec.h>
/// Types the AST can encode. This is the tag that says which member of the
/// anonymous union inside ast_node_t is meaningful.
typedef enum
{
// Explicitly zero, so a zero-initialised ast_node_t carries this tag.
AST_NODE_TYPE_KNOWN = 0,
// Payload is ast_node_t.as_symbol.
AST_NODE_TYPE_SYMBOL,
// Payload is ast_node_t.as_string.
AST_NODE_TYPE_STRING,
// Number of real node types; not a valid tag (ast_node_print treats it as an
// unexpected value).
NUM_AST_NODE_TYPES,
} ast_node_type_t;
/// Known symbols - may reference callables or values.
typedef enum
{
// The symbol spelled "putstr" (see ast_known_to_cstr).
AST_KNOWN_PUTSTR,
// Number of known symbols; not a symbol itself. The parser uses it as the
// bound when comparing a symbol against every known spelling.
NUM_AST_KNOWNS,
} ast_known_t;
// Returns the spelling of the given known symbol as a C string.
const char *ast_known_to_cstr(ast_known_t);
/// Node of the AST as a tagged union: `type` selects which member of the
/// anonymous union holds the payload.
typedef struct
{
// Byte offset handed to the constructor. The parser passes the stream
// position at which the node starts.
u64 byte_location;
ast_node_type_t type;
union
{
// Meaningful when type == AST_NODE_TYPE_KNOWN.
ast_known_t as_known;
// Meaningful when type == AST_NODE_TYPE_SYMBOL.
sv_t as_symbol;
// Meaningful when type == AST_NODE_TYPE_STRING.
sv_t as_string;
};
} ast_node_t;
// Constructors: each returns an ast_node_t by value with `byte_location` set
// to BYTE, `type` set to the matching tag and the payload stored as given.
// NOTE(review): the sv_t payloads appear to be non-owning views into the parsed
// buffer, so that buffer presumably has to outlive the node - confirm against
// the sv_t definition.
ast_node_t ast_node_known(u64 byte, ast_known_t known);
ast_node_t ast_node_symbol(u64 byte, sv_t symbol);
ast_node_t ast_node_string(u64 byte, sv_t string);
// Writes a textual form of NODE to FP; a NULL node is written as "NIL".
void ast_node_print(FILE *fp, ast_node_t *node);
/// The AST as a flat collection of nodes
typedef struct
{
// Holds ast_node_t values by value; the node count is computed as
// nodes.size / sizeof(ast_node_t).
vec_t nodes;
} ast_t;
// Frees the vector backing the AST.
void ast_free(ast_t *ast);
// Writes every node of the AST to FP, one per line, wrapped in braces.
void ast_print(FILE *fp, ast_t *ast);
#endif
/* Copyright (C) 2026 Aryadev Chavali
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the MIT License for details.
* You may distribute and modify this code under the terms of the MIT License,
* which you should have received a copy of along with this program. If not,
* please go to <https://opensource.org/license/MIT>.
*/

View File

@@ -1,44 +1,44 @@
/* parser.c: Implementation of parser. /* lexer.c: Implementation of lexer.
* Created: 2026-01-22 * Created: 2026-01-22
* Author: Aryadev Chavali * Author: Aryadev Chavali
* License: See end of file * License: See end of file
* Commentary: See /include/arl/parser/parser.h * Commentary: See /include/arl/lexer/lexer.h
*/ */
#include <ctype.h> #include <ctype.h>
#include <string.h> #include <string.h>
#include <arl/lexer/lexer.h>
#include <arl/lexer/token.h>
#include <arl/lib/sv.h> #include <arl/lib/sv.h>
#include <arl/parser/ast.h>
#include <arl/parser/parser.h>
/// Expected characters in a symbol /// Expected characters in a symbol
static const char *SYMBOL_CHARS = static const char *SYMBOL_CHARS =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&'()*+,-./" "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&'()*+,-./"
":;<=>?@\\^_`{|}~0123456789"; ":;<=>?@\\^_`{|}~0123456789";
const char *parse_err_to_string(parse_err_t err) const char *lex_err_to_string(lex_err_t err)
{ {
switch (err) switch (err)
{ {
case PARSE_ERR_OK: case LEX_ERR_OK:
return "OK"; return "OK";
case PARSE_ERR_EXPECTED_SPEECH_MARKS: case LEX_ERR_EXPECTED_SPEECH_MARKS:
return "EXPECTED_SPEECH_MARKS"; return "EXPECTED_SPEECH_MARKS";
case PARSE_ERR_UNKNOWN_CHAR: case LEX_ERR_UNKNOWN_CHAR:
return "UNKNOWN_CHAR"; return "UNKNOWN_CHAR";
default: default:
FAIL("Unexpected parse_err_t value: %d\n", err); FAIL("Unexpected lex_err_t value: %d\n", err);
} }
} }
/// Prototypes for streams /// Prototypes for streams
bool stream_eos(parse_stream_t *stream); bool stream_eos(lex_stream_t *stream);
char stream_peek(parse_stream_t *stream); char stream_peek(lex_stream_t *stream);
void stream_advance(parse_stream_t *stream, u64 size); void stream_advance(lex_stream_t *stream, u64 size);
u64 stream_size(parse_stream_t *stream); u64 stream_size(lex_stream_t *stream);
void parse_stream_get_line_col(parse_stream_t *stream, u64 *line, u64 *col) void lex_stream_get_line_col(lex_stream_t *stream, u64 *line, u64 *col)
{ {
assert(stream && line && col && "Expected valid pointers."); assert(stream && line && col && "Expected valid pointers.");
for (u64 i = 0; i < stream->byte; ++i) for (u64 i = 0; i < stream->byte; ++i)
@@ -56,11 +56,11 @@ void parse_stream_get_line_col(parse_stream_t *stream, u64 *line, u64 *col)
} }
} }
/// Prototypes for parsing subroutines /// Prototypes for lexing subroutines
parse_err_t parse_string(parse_stream_t *stream, ast_node_t *ret); lex_err_t lex_string(lex_stream_t *stream, token_t *ret);
parse_err_t parse_symbol(parse_stream_t *stream, ast_node_t *ret); lex_err_t lex_symbol(lex_stream_t *stream, token_t *ret);
parse_err_t parse(ast_t *out, parse_stream_t *stream) lex_err_t lex_stream(token_stream_t *out, lex_stream_t *stream)
{ {
assert(out && stream && "Expected valid pointers"); assert(out && stream && "Expected valid pointers");
while (!stream_eos(stream)) while (!stream_eos(stream))
@@ -76,32 +76,32 @@ parse_err_t parse(ast_t *out, parse_stream_t *stream)
} }
else if (cur == '"') else if (cur == '"')
{ {
// we make a copy for parse_string to mess with // we make a copy for lex_string to mess with
ast_node_t ret = {0}; token_t ret = {0};
parse_err_t perr = parse_string(stream, &ret); lex_err_t perr = lex_string(stream, &ret);
if (perr) if (perr)
return perr; return perr;
vec_append(&out->nodes, &ret, sizeof(ret)); vec_append(&out->vec, &ret, sizeof(ret));
} }
else if (strchr(SYMBOL_CHARS, cur) && !isdigit(cur)) else if (strchr(SYMBOL_CHARS, cur) && !isdigit(cur))
{ {
// we make a copy for parse_symbol to mess with // we make a copy for lex_symbol to mess with
ast_node_t ret = {0}; token_t ret = {0};
parse_err_t perr = parse_symbol(stream, &ret); lex_err_t perr = lex_symbol(stream, &ret);
if (perr) if (perr)
return perr; return perr;
vec_append(&out->nodes, &ret, sizeof(ret)); vec_append(&out->vec, &ret, sizeof(ret));
} }
else else
{ {
return PARSE_ERR_UNKNOWN_CHAR; return LEX_ERR_UNKNOWN_CHAR;
} }
} }
return PARSE_ERR_OK; return LEX_ERR_OK;
} }
parse_err_t parse_string(parse_stream_t *stream, ast_node_t *ret) lex_err_t lex_string(lex_stream_t *stream, token_t *ret)
{ {
// Increment the cursor just past the first speechmark // Increment the cursor just past the first speechmark
stream_advance(stream, 1); stream_advance(stream, 1);
@@ -111,46 +111,46 @@ parse_err_t parse_string(parse_stream_t *stream, ast_node_t *ret)
// If we're at the edge of the stream, there must not have been any // If we're at the edge of the stream, there must not have been any
// speechmarks. // speechmarks.
if (string.size + stream->byte == stream_size(stream)) if (string.size + stream->byte == stream_size(stream))
return PARSE_ERR_EXPECTED_SPEECH_MARKS; return LEX_ERR_EXPECTED_SPEECH_MARKS;
// `string` is well defined, package and throw it back. // `string` is well defined, package and throw it back.
*ret = ast_node_string(stream->byte - 1, string); *ret = token_string(stream->byte - 1, string);
stream_advance(stream, string.size + 1); stream_advance(stream, string.size + 1);
return PARSE_ERR_OK; return LEX_ERR_OK;
} }
parse_err_t parse_symbol(parse_stream_t *stream, ast_node_t *ret) lex_err_t lex_symbol(lex_stream_t *stream, token_t *ret)
{ {
sv_t symbol = sv_chop_left(stream->contents, stream->byte); sv_t symbol = sv_chop_left(stream->contents, stream->byte);
symbol.size = sv_while(symbol, SYMBOL_CHARS); symbol.size = sv_while(symbol, SYMBOL_CHARS);
// see if symbol is one of the already known symbols // see if symbol is one of the already known symbols
static_assert(NUM_AST_KNOWNS == 1, "Expected number of AST_KNOWNs"); static_assert(NUM_TOKEN_KNOWNS == 1, "Expected number of TOKEN_KNOWNs");
for (ast_known_t i = 0; i < NUM_AST_KNOWNS; ++i) for (token_known_t i = 0; i < NUM_TOKEN_KNOWNS; ++i)
{ {
const char *possible_known = ast_known_to_cstr(i); const char *possible_known = token_known_to_cstr(i);
if (strlen(possible_known) == symbol.size && if (strlen(possible_known) == symbol.size &&
strncmp(possible_known, symbol.data, symbol.size) == 0) strncmp(possible_known, symbol.data, symbol.size) == 0)
{ {
// Found a matching known symbol // Found a matching known symbol
*ret = ast_node_known(stream->byte, i); *ret = token_known(stream->byte, i);
goto end; goto end;
} }
} }
// otherwise, it must be a fresh symbol i.e. user defined // otherwise, it must be a fresh symbol i.e. user defined
*ret = ast_node_symbol(stream->byte, symbol); *ret = token_symbol(stream->byte, symbol);
end: end:
stream_advance(stream, symbol.size); stream_advance(stream, symbol.size);
return PARSE_ERR_OK; return LEX_ERR_OK;
} }
bool stream_eos(parse_stream_t *stream) bool stream_eos(lex_stream_t *stream)
{ {
return stream->byte >= stream->contents.size; return stream->byte >= stream->contents.size;
} }
char stream_peek(parse_stream_t *stream) char stream_peek(lex_stream_t *stream)
{ {
if (stream_eos(stream)) if (stream_eos(stream))
return '\0'; return '\0';
@@ -158,7 +158,7 @@ char stream_peek(parse_stream_t *stream)
return stream->contents.data[stream->byte]; return stream->contents.data[stream->byte];
} }
void stream_advance(parse_stream_t *stream, u64 size) void stream_advance(lex_stream_t *stream, u64 size)
{ {
if (stream->byte + size >= stream->contents.size) if (stream->byte + size >= stream->contents.size)
stream->byte = stream->contents.size; stream->byte = stream->contents.size;
@@ -166,7 +166,7 @@ void stream_advance(parse_stream_t *stream, u64 size)
stream->byte += size; stream->byte += size;
} }
u64 stream_size(parse_stream_t *stream) u64 stream_size(lex_stream_t *stream)
{ {
return stream->contents.size; return stream->contents.size;
} }

115
src/lexer/token.c Normal file
View File

@@ -0,0 +1,115 @@
/* token.c: Implementation of TOKEN constructor/destructor functions
* Created: 2026-01-22
* Author: Aryadev Chavali
* License: See end of file
* Commentary: See /include/arl/lexer/token.h.
*/
#include <arl/lexer/token.h>
#include <arl/lib/base.h>
#include <arl/lib/vec.h>
/// Map a known-symbol identifier to its spelling.
/// Any value that is not a member of token_known_t is reported through FAIL.
const char *token_known_to_cstr(token_known_t known)
{
  if (known == TOKEN_KNOWN_PUTSTR)
  {
    return "putstr";
  }

  FAIL("Unexpected TOKEN_KNOWN value: %d\n", known);
}
/// Build a token tagged TOKEN_TYPE_KNOWN that carries KNOWN and records BYTE
/// as its location.
token_t token_known(u64 byte, token_known_t known)
{
  token_t tok = {0};

  tok.byte_location = byte;
  tok.type = TOKEN_TYPE_KNOWN;
  tok.as_known = known;
  return tok;
}
/// Build a token tagged TOKEN_TYPE_STRING that carries STRING and records BYTE
/// as its location. STRING is stored as given.
token_t token_string(u64 byte, sv_t string)
{
  token_t tok = {0};

  tok.byte_location = byte;
  tok.type = TOKEN_TYPE_STRING;
  tok.as_string = string;
  return tok;
}
/// Build a token tagged TOKEN_TYPE_SYMBOL that carries SYMBOL and records BYTE
/// as its location. SYMBOL is stored as given.
token_t token_symbol(u64 byte, sv_t symbol)
{
  token_t tok = {0};

  tok.byte_location = byte;
  tok.type = TOKEN_TYPE_SYMBOL;
  tok.as_symbol = symbol;
  return tok;
}
/// Write a textual form of TOKEN to FP: KNOWN(...), SYMBOL(...) or
/// STRING(...) depending on its tag. A NULL token is written as "NIL"; any
/// other tag value is reported through FAIL.
void token_print(FILE *fp, token_t *token)
{
  if (!token)
  {
    fprintf(fp, "NIL");
    return;
  }

  const token_type_t kind = token->type;

  if (kind == TOKEN_TYPE_KNOWN)
  {
    fprintf(fp, "KNOWN(%s)", token_known_to_cstr(token->as_known));
  }
  else if (kind == TOKEN_TYPE_SYMBOL)
  {
    fprintf(fp, "SYMBOL(" PR_SV ")", SV_FMT(token->as_symbol));
  }
  else if (kind == TOKEN_TYPE_STRING)
  {
    fprintf(fp, "STRING(" PR_SV ")", SV_FMT(token->as_string));
  }
  else
  {
    // Covers NUM_TOKEN_TYPES as well as anything outside the enumeration.
    FAIL("Unexpected token type: %d\n", token->type);
  }
}
/// Write every token held by the stream to FP.
///
/// A NULL stream and a stream without tokens are both written as "{}".
/// Otherwise the output is "{", then one "\t[index]: token" line per token,
/// then "}". No trailing newline is written in any case; the caller decides
/// how to terminate the line.
void token_stream_print(FILE *fp, token_stream_t *token)
{
  if (!token || token->vec.size == 0)
  {
    fprintf(fp, "{}");
    return;
  }

  // vec.size is divided by the element size to obtain the number of stored
  // tokens; compute it once rather than on every iteration.
  const u64 count = token->vec.size / sizeof(token_t);

  fprintf(fp, "{\n");
  for (u64 i = 0; i < count; ++i)
  {
    token_t item = VEC_GET(&token->vec, i, token_t);
    // u64 is not guaranteed to be `unsigned long`, so widen explicitly to
    // keep the conversion specifier and its argument in agreement.
    fprintf(fp, "\t[%llu]: ", (unsigned long long)i);
    token_print(fp, &item);
    fprintf(fp, "\n");
  }
  fprintf(fp, "}");
}
/// Release the storage backing STREAM.
/// A NULL stream is ignored, matching the NULL tolerance of token_print and
/// token_stream_print.
void token_stream_free(token_stream_t *stream)
{
  if (!stream)
  {
    return;
  }

  // we can free the vector itself and we're done
  vec_free(&stream->vec);
}
/* Copyright (C) 2026 Aryadev Chavali
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the MIT License for details.
* You may distribute and modify this code under the terms of the MIT License,
* which you should have received a copy of along with this program. If not,
* please go to <https://opensource.org/license/MIT>.
*/

View File

@@ -12,11 +12,11 @@
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include <arl/lexer/lexer.h>
#include <arl/lexer/token.h>
#include <arl/lib/base.h> #include <arl/lib/base.h>
#include <arl/lib/sv.h> #include <arl/lib/sv.h>
#include <arl/lib/vec.h> #include <arl/lib/vec.h>
#include <arl/parser/ast.h>
#include <arl/parser/parser.h>
int read_file(const char *filename, sv_t *ret) int read_file(const char *filename, sv_t *ret)
{ {
@@ -114,30 +114,29 @@ int main(int argc, char *argv[])
LOG("%s => `" PR_SV "`\n", filename, SV_FMT(contents)); LOG("%s => `" PR_SV "`\n", filename, SV_FMT(contents));
parse_stream_t stream = {.byte = 0, .contents = contents}; lex_stream_t stream = {.byte = 0, .contents = contents};
ast_t ast = {0}; token_stream_t tokens = {0};
parse_err_t perr = parse(&ast, &stream); lex_err_t perr = lex_stream(&tokens, &stream);
if (perr) if (perr)
{ {
u64 line = 1, col = 0; u64 line = 1, col = 0;
parse_stream_get_line_col(&stream, &line, &col); lex_stream_get_line_col(&stream, &line, &col);
LOG_ERR("%s:%lu:%lu: %s\n", filename, line, col, parse_err_to_string(perr)); LOG_ERR("%s:%lu:%lu: %s\n", filename, line, col, lex_err_to_string(perr));
ret = 1; ret = 1;
goto end; goto end;
} }
LOG("Parsed %lu nodes\n", ast.nodes.size / sizeof(ast_node_t)); LOG("Lexed %lu tokens\n", tokens.vec.size / sizeof(token_t));
#if VERBOSE_LOGS #if VERBOSE_LOGS
ast_print(stdout, &ast); token_stream_print(stdout, &tokens);
#endif #endif
printf("\n"); printf("\n");
end: end:
if (contents.data) if (contents.data)
free(contents.data); free(contents.data);
if (ast.nodes.capacity > 0) token_stream_free(&tokens);
ast_free(&ast);
return ret; return ret;
} }

View File

@@ -1,115 +0,0 @@
/* ast.c: Implementation of AST constructor/destructor functions
* Created: 2026-01-22
* Author: Aryadev Chavali
* License: See end of file
* Commentary: See /include/arl/parser/ast.h.
*/
#include <arl/lib/base.h>
#include <arl/lib/vec.h>
#include <arl/parser/ast.h>
/// Map a known-symbol identifier to its spelling.
/// Any value that is not a member of ast_known_t is reported through FAIL.
const char *ast_known_to_cstr(ast_known_t known)
{
  if (known == AST_KNOWN_PUTSTR)
  {
    return "putstr";
  }

  FAIL("Unexpected AST_KNOWN value: %d\n", known);
}
/// Build a node tagged AST_NODE_TYPE_KNOWN that carries KNOWN and records BYTE
/// as its location.
ast_node_t ast_node_known(u64 byte, ast_known_t known)
{
  ast_node_t node = {0};

  node.byte_location = byte;
  node.type = AST_NODE_TYPE_KNOWN;
  node.as_known = known;
  return node;
}
/// Build a node tagged AST_NODE_TYPE_STRING that carries STRING and records
/// BYTE as its location. STRING is stored as given.
ast_node_t ast_node_string(u64 byte, sv_t string)
{
  ast_node_t node = {0};

  node.byte_location = byte;
  node.type = AST_NODE_TYPE_STRING;
  node.as_string = string;
  return node;
}
/// Build a node tagged AST_NODE_TYPE_SYMBOL that carries SYMBOL and records
/// BYTE as its location. SYMBOL is stored as given.
ast_node_t ast_node_symbol(u64 byte, sv_t symbol)
{
  ast_node_t node = {0};

  node.byte_location = byte;
  node.type = AST_NODE_TYPE_SYMBOL;
  node.as_symbol = symbol;
  return node;
}
/// Write a textual form of NODE to FP: KNOWN(...), SYMBOL(...) or STRING(...)
/// depending on its tag. A NULL node is written as "NIL"; any other tag value
/// is reported through FAIL.
void ast_node_print(FILE *fp, ast_node_t *node)
{
  if (!node)
  {
    fprintf(fp, "NIL");
    return;
  }

  const ast_node_type_t kind = node->type;

  if (kind == AST_NODE_TYPE_KNOWN)
  {
    fprintf(fp, "KNOWN(%s)", ast_known_to_cstr(node->as_known));
  }
  else if (kind == AST_NODE_TYPE_SYMBOL)
  {
    fprintf(fp, "SYMBOL(" PR_SV ")", SV_FMT(node->as_symbol));
  }
  else if (kind == AST_NODE_TYPE_STRING)
  {
    fprintf(fp, "STRING(" PR_SV ")", SV_FMT(node->as_string));
  }
  else
  {
    // Covers NUM_AST_NODE_TYPES as well as anything outside the enumeration.
    FAIL("Unexpected node type: %d\n", node->type);
  }
}
/// Write every node held by the AST to FP.
///
/// A NULL AST and an AST without nodes are both written as "{}". Otherwise
/// the output is "{", then one "\t[index]: node" line per node, then "}". No
/// trailing newline is written in any case; the caller decides how to
/// terminate the line.
void ast_print(FILE *fp, ast_t *ast)
{
  if (!ast || ast->nodes.size == 0)
  {
    fprintf(fp, "{}");
    return;
  }

  // nodes.size is divided by the element size to obtain the number of stored
  // nodes; compute it once rather than on every iteration.
  const u64 count = ast->nodes.size / sizeof(ast_node_t);

  fprintf(fp, "{\n");
  for (u64 i = 0; i < count; ++i)
  {
    ast_node_t item = VEC_GET(&ast->nodes, i, ast_node_t);
    // u64 is not guaranteed to be `unsigned long`, so widen explicitly to
    // keep the conversion specifier and its argument in agreement.
    fprintf(fp, "\t[%llu]: ", (unsigned long long)i);
    ast_node_print(fp, &item);
    fprintf(fp, "\n");
  }
  fprintf(fp, "}");
}
/// Release the storage backing AST.
/// A NULL AST is ignored, matching the NULL tolerance of ast_node_print and
/// ast_print.
void ast_free(ast_t *ast)
{
  if (!ast)
  {
    return;
  }

  // we can free the vector itself and we're done
  vec_free(&ast->nodes);
}
/* Copyright (C) 2026 Aryadev Chavali
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the MIT License for details.
* You may distribute and modify this code under the terms of the MIT License,
* which you should have received a copy of along with this program. If not,
* please go to <https://opensource.org/license/MIT>.
*/