parser -> lexer

That's the real purpose of this module; it isn't really generating an
AST, since ARL's syntax isn't tree-like at all.

The next stage will produce something closer to an AST, in the sense
that it will introduce:
- Syntactic analysis
- Type checking
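
For orientation, here is a minimal sketch of how a caller might drive the
new lexer. The field names (contents, byte, vec) are taken from this diff,
but the exact shape of sv_t (assumed here to be a plain {data, size} pair)
and whether the structs can be zero-initialised are assumptions, since the
headers aren't part of this change:

#include <stdio.h>
#include <string.h>
#include <arl/lexer/lexer.h>
#include <arl/lexer/token.h>
#include <arl/lib/base.h> /* u64 (assumed home of the integer typedefs) */

int main(void)
{
  // Hypothetical source text: one known symbol and one string literal.
  const char *src = "putstr \"hello, world\"";

  // Assumption: sv_t is a {data, size} view and the cursor starts at byte 0.
  lex_stream_t stream = {0};
  stream.contents.data = (char *)src;
  stream.contents.size = strlen(src);

  token_stream_t tokens = {0};
  lex_err_t err = lex_stream(&tokens, &stream);
  if (err != LEX_ERR_OK)
  {
    // lex_stream_get_line_col accumulates onto its arguments, so seed them.
    u64 line = 1, col = 0;
    lex_stream_get_line_col(&stream, &line, &col);
    fprintf(stderr, "lex error %s at %lu:%lu\n", lex_err_to_string(err),
            line, col);
    return 1;
  }

  token_stream_print(stdout, &tokens);
  fprintf(stdout, "\n");
  token_stream_free(&tokens);
  return 0;
}

On success this should print a KNOWN(putstr) token followed by a
STRING(hello, world) token.
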
commit dc96e12145 (parent 42ac4f6bbb)
2026-01-29 03:43:04 +00:00
8 changed files with 259 additions and 261 deletions

src/lexer/lexer.c (new file, 184 lines)

@@ -0,0 +1,184 @@
/* lexer.c: Implementation of the lexer.
* Created: 2026-01-22
* Author: Aryadev Chavali
* License: See end of file
* Commentary: See /include/arl/lexer/lexer.h
*/
#include <assert.h> /* assert, static_assert */
#include <ctype.h>
#include <string.h>
#include <arl/lexer/lexer.h>
#include <arl/lexer/token.h>
#include <arl/lib/base.h> /* FAIL, u64 */
#include <arl/lib/sv.h>
/// Expected characters in a symbol
static const char *SYMBOL_CHARS =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&'()*+,-./"
":;<=>?@\\^_`{|}~0123456789";
const char *lex_err_to_string(lex_err_t err)
{
switch (err)
{
case LEX_ERR_OK:
return "OK";
case LEX_ERR_EXPECTED_SPEECH_MARKS:
return "EXPECTED_SPEECH_MARKS";
case LEX_ERR_UNKNOWN_CHAR:
return "UNKNOWN_CHAR";
default:
FAIL("Unexpected lex_err_t value: %d\n", err);
}
}
/// Prototypes for streams
bool stream_eos(lex_stream_t *stream);
char stream_peek(lex_stream_t *stream);
void stream_advance(lex_stream_t *stream, u64 size);
u64 stream_size(lex_stream_t *stream);
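/// Compute the (line, column) of the cursor by scanning every byte before
/// it. Note that it accumulates onto *line and *col, so callers should seed
/// them (e.g. line = 1, col = 0) before calling.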
void lex_stream_get_line_col(lex_stream_t *stream, u64 *line, u64 *col)
{
assert(stream && line && col && "Expected valid pointers.");
for (u64 i = 0; i < stream->byte; ++i)
{
char c = stream->contents.data[i];
if (c == '\n')
{
*line += 1;
*col = 0;
}
else
{
*col += 1;
}
}
}
/// Prototypes for lexing subroutines
lex_err_t lex_string(lex_stream_t *stream, token_t *ret);
lex_err_t lex_symbol(lex_stream_t *stream, token_t *ret);
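/// Lex the whole stream into `out`: skip whitespace, lex string literals
/// and symbols (which may not start with a digit), and stop with
/// LEX_ERR_UNKNOWN_CHAR at the first character that fits none of these.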
lex_err_t lex_stream(token_stream_t *out, lex_stream_t *stream)
{
assert(out && stream && "Expected valid pointers");
while (!stream_eos(stream))
{
char cur = stream_peek(stream);
if (isspace(cur))
{
while (isspace(cur) && !stream_eos(stream))
{
stream_advance(stream, 1);
cur = stream_peek(stream);
}
}
else if (cur == '"')
{
// fresh token for lex_string to fill in; copied into the vector below
token_t ret = {0};
lex_err_t perr = lex_string(stream, &ret);
if (perr)
return perr;
vec_append(&out->vec, &ret, sizeof(ret));
}
else if (strchr(SYMBOL_CHARS, cur) && !isdigit(cur))
{
// fresh token for lex_symbol to fill in; copied into the vector below
token_t ret = {0};
lex_err_t perr = lex_symbol(stream, &ret);
if (perr)
return perr;
vec_append(&out->vec, &ret, sizeof(ret));
}
else
{
return LEX_ERR_UNKNOWN_CHAR;
}
}
return LEX_ERR_OK;
}
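/// Lex a string literal. The cursor sits on the opening speech mark; the
/// literal runs up to (but not including) the closing one. If the stream
/// ends before a closing speech mark is found, returns
/// LEX_ERR_EXPECTED_SPEECH_MARKS.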
lex_err_t lex_string(lex_stream_t *stream, token_t *ret)
{
// Advance the cursor just past the opening speech mark
stream_advance(stream, 1);
sv_t string = sv_chop_left(stream->contents, stream->byte);
string.size = sv_till(string, "\"");
// If the scan ran to the end of the stream, there was no closing
// speech mark.
if (string.size + stream->byte == stream_size(stream))
return LEX_ERR_EXPECTED_SPEECH_MARKS;
// `string` is well defined, package and throw it back.
*ret = token_string(stream->byte - 1, string);
stream_advance(stream, string.size + 1);
return LEX_ERR_OK;
}
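/// Lex a symbol: take the longest run of SYMBOL_CHARS at the cursor, then
/// check it against the table of known symbols. Matches become KNOWN
/// tokens; anything else becomes a user-defined SYMBOL token.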
lex_err_t lex_symbol(lex_stream_t *stream, token_t *ret)
{
sv_t symbol = sv_chop_left(stream->contents, stream->byte);
symbol.size = sv_while(symbol, SYMBOL_CHARS);
// see if symbol is one of the already known symbols
static_assert(NUM_TOKEN_KNOWNS == 1, "Expected number of TOKEN_KNOWNs");
for (token_known_t i = 0; i < NUM_TOKEN_KNOWNS; ++i)
{
const char *possible_known = token_known_to_cstr(i);
if (strlen(possible_known) == symbol.size &&
strncmp(possible_known, symbol.data, symbol.size) == 0)
{
// Found a matching known symbol
*ret = token_known(stream->byte, i);
goto end;
}
}
// otherwise, it must be a fresh symbol i.e. user defined
*ret = token_symbol(stream->byte, symbol);
end:
stream_advance(stream, symbol.size);
return LEX_ERR_OK;
}
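/// Stream helpers: cursor-based access over the underlying string view.
/// Peeking past the end yields '\0' and advancing clamps to the end.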
bool stream_eos(lex_stream_t *stream)
{
return stream->byte >= stream->contents.size;
}
char stream_peek(lex_stream_t *stream)
{
if (stream_eos(stream))
return '\0';
else
return stream->contents.data[stream->byte];
}
void stream_advance(lex_stream_t *stream, u64 size)
{
if (stream->byte + size >= stream->contents.size)
stream->byte = stream->contents.size;
else
stream->byte += size;
}
u64 stream_size(lex_stream_t *stream)
{
return stream->contents.size;
}
/* Copyright (C) 2026 Aryadev Chavali
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the MIT License for details.
* You may distribute and modify this code under the terms of the MIT License,
* which you should have received a copy of along with this program. If not,
* please go to <https://opensource.org/license/MIT>.
*/

src/lexer/token.c (new file, 115 lines)

@@ -0,0 +1,115 @@
/* token.c: Implementation of TOKEN constructor/destructor functions
* Created: 2026-01-22
* Author: Aryadev Chavali
* License: See end of file
* Commentary: See /include/arl/lexer/token.h.
*/
#include <stdio.h> /* FILE, fprintf */
#include <arl/lexer/token.h>
#include <arl/lib/base.h>
#include <arl/lib/vec.h>
const char *token_known_to_cstr(token_known_t known)
{
switch (known)
{
case TOKEN_KNOWN_PUTSTR:
return "putstr";
default:
FAIL("Unexpected TOKEN_KNOWN value: %d\n", known);
}
}
token_t token_known(u64 byte, token_known_t known)
{
return (token_t){
.byte_location = byte,
.type = TOKEN_TYPE_KNOWN,
.as_known = known,
};
}
token_t token_string(u64 byte, sv_t string)
{
return (token_t){
.byte_location = byte,
.type = TOKEN_TYPE_STRING,
.as_string = string,
};
}
token_t token_symbol(u64 byte, sv_t symbol)
{
return (token_t){
.byte_location = byte,
.type = TOKEN_TYPE_SYMBOL,
.as_symbol = symbol,
};
}
void token_print(FILE *fp, token_t *token)
{
if (!token)
{
fprintf(fp, "NIL");
return;
}
switch (token->type)
{
case TOKEN_TYPE_KNOWN:
fprintf(fp, "KNOWN(%s)", token_known_to_cstr(token->as_known));
break;
case TOKEN_TYPE_SYMBOL:
fprintf(fp, "SYMBOL(" PR_SV ")", SV_FMT(token->as_symbol));
break;
case TOKEN_TYPE_STRING:
fprintf(fp, "STRING(" PR_SV ")", SV_FMT(token->as_string));
break;
case NUM_TOKEN_TYPES:
default:
FAIL("Unexpected token type: %d\n", token->type);
}
}
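/// Print every token in the stream, one per line, between braces; NULL and
/// empty streams print as {}.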
void token_stream_print(FILE *fp, token_stream_t *token)
{
if (!token)
{
fprintf(fp, "{}");
return;
}
fprintf(fp, "{");
if (token->vec.size == 0)
{
fprintf(fp, "}\n");
return;
}
fprintf(fp, "\n");
for (u64 i = 0; i < token->vec.size / sizeof(token_t); ++i)
{
token_t item = VEC_GET(&token->vec, i, token_t);
fprintf(fp, "\t[%lu]: ", i);
token_print(fp, &item);
fprintf(fp, "\n");
}
fprintf(fp, "}");
}
void token_stream_free(token_stream_t *stream)
{
// we can free the vector itself and we're done
vec_free(&stream->vec);
}
/* Copyright (C) 2026 Aryadev Chavali
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the MIT License for details.
* You may distribute and modify this code under the terms of the MIT License,
* which you should have received a copy of along with this program. If not,
* please go to <https://opensource.org/license/MIT>.
*/