429 lines
9.7 KiB
C
429 lines
9.7 KiB
C
/* Copyright (C) 2025 Aryadev Chavali

 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License Version 2 for
 * details.

 * You may distribute and modify this code under the terms of the GNU General
 * Public License Version 2, which you should have received a copy of along with
 * this program. If not, please go to <https://www.gnu.org/licenses/>.

 * Created: 2025-04-16
 * Description: Implementation of parser
 */
|
|
|
|
#include <lisp/reader.h>

#include <ctype.h>
#include <stdint.h>
#include <string.h>
|
|
|
|
// Whether `c` is a decimal digit according to isdigit().
bool is_digit(char c)
{
  // <ctype.h> classifiers require a value representable as unsigned char (or
  // EOF).  Plain `char` may be negative, so convert before the call.
  return isdigit((unsigned char)c) != 0;
}
|
|
|
|
// Whether `c` is an alphabetic character according to isalpha().
bool is_alpha(char c)
{
  // <ctype.h> classifiers require a value representable as unsigned char (or
  // EOF).  Plain `char` may be negative, so convert before the call.
  return isalpha((unsigned char)c) != 0;
}
|
|
|
|
// Whether `c` is whitespace according to isspace().
bool is_space(char c)
{
  // <ctype.h> classifiers require a value representable as unsigned char (or
  // EOF).  Plain `char` may be negative, so convert before the call.
  return isspace((unsigned char)c) != 0;
}
|
|
|
|
// Whether `c` is whitespace or the `;` character.
bool is_skip(char c)
{
  if (c == ';')
    return true;
  return is_space(c);
}
|
|
|
|
bool is_sym(char c)
|
|
{
|
|
return strchr(SYM_CHARS, c) != NULL;
|
|
}
|
|
|
|
// Set up `inp` to read from `sv`: the text is duplicated with sv_copy() using
// `ctx->read`, and `name` is stored as the label for the input.
//
// NOTE(review): inp->offset is not touched here; presumably callers hand in a
// zero-initialised input_t -- confirm.
void input_from_sv(context_t *ctx, input_t *inp, const char *name, sv_t sv)
{
  inp->str  = sv_copy(&ctx->read, sv);
  inp->name = name;
}
|
|
|
|
// Set up `input` to read everything remaining in `fp`, labelled `name`.
//
// The stream is read in INPUT_CHUNK_SIZE pieces through a scratch buffer and
// accumulated into a page, which is attached to `ctx->read` once reading is
// done.  The scratch space of `ctx` is reset before returning.
void input_from_fp(context_t *ctx, input_t *input, const char *name, FILE *fp)
{
  input->name = name;
  // TODO: Choose a best fit (i.e. maximal capacity, unused) page
  page_t *page = page_create(INPUT_CHUNK_SIZE);
  // chunk should be in scratch space so we can reset it later.
  char *chunk = context_salloc(ctx, INPUT_CHUNK_SIZE);

  u64 total_size = 0, size_read = 0;
  // Drive the loop from fread()'s return value rather than feof(): on a read
  // error the EOF indicator is never set, so `while (!feof(fp))` would spin
  // forever.  A short count means end-of-file or error; either way we stop.
  do
  {
    size_read = fread(chunk, 1, INPUT_CHUNK_SIZE, fp);
    if (size_read > 0)
    {
      page_rappend(&page, chunk, size_read);
      total_size += size_read;
    }
  } while (size_read == INPUT_CHUNK_SIZE);

  input->str = SV((char *)page->data, total_size);

  // Memory cleanup
  context_reset_scratch(ctx);
  arena_attach(&ctx->read, page);
}
|
|
|
|
// Whether there is nothing left to read from `input`.
//
// True when `input` is NULL, when the cursor is at or beyond the end of the
// text, or when the byte under the cursor is a NUL.
bool input_eof(input_t *input)
{
  if (!input)
    return true;
  if (input->offset >= input->str.size)
    return true;
  return input->str.data[input->offset] == '\0';
}
|
|
|
|
// Look at the byte `offset` positions ahead of the cursor without moving it.
//
// Returns '\0' when the input is exhausted or the requested position lies at
// or beyond the end of the text.
char input_peek(input_t *input, u64 offset)
{
  if (input_eof(input))
    return '\0';

  u64 index = input->offset + offset;
  if (index >= input->str.size)
    return '\0';

  return input->str.data[index];
}
|
|
|
|
// Move the cursor of `input` forward by `offset` bytes and return the byte it
// lands on.
//
// If the input is already exhausted nothing changes and '\0' is returned.  If
// the move would reach or pass the end of the text, the cursor is clamped to
// the end (so input_eof() becomes true) and '\0' is returned.  Refusing to
// move in that case would leave the final token of an input unconsumed.
char input_next(input_t *input, u64 offset)
{
  if (input_eof(input))
    return '\0';

  // input_eof() was false, so input->offset < input->str.size and the
  // subtraction cannot wrap.  Comparing against the remainder also avoids
  // overflow in `input->offset + offset`.
  u64 remaining = input->str.size - input->offset;
  if (offset >= remaining)
  {
    input->offset = input->str.size;
    return '\0';
  }

  input->offset += offset;
  return input->str.data[input->offset];
}
|
|
|
|
// Advance `inp` past everything that is not part of an expression:
// whitespace, `;` comments (up to and including the next newline) and
// `#| ... |#` block comments.  An unterminated comment consumes the rest of
// the input.
void input_skip(input_t *inp)
{
  while (!input_eof(inp))
  {
    // Skip any whitespace
    sv_t current = sv_cut(inp->str, inp->offset);
    inp->offset += sv_while(current, is_space);
    current = sv_cut(inp->str, inp->offset);

    // Is there a comment to skip?
    if (current.size && current.data[0] == ';')
    {
      // Skip till newline
      i64 newline = sv_find_subcstr(current, "\n", 1);
      if (newline < 0)
        inp->offset = inp->str.size;
      else
        inp->offset += newline + 1;
    }
    // Multiline comment to skip?  (`>= 2` so a bare trailing "#|" counts as
    // an unterminated comment.)
    else if (current.size >= 2 && strncmp(current.data, "#|", 2) == 0)
    {
      // Look for the terminator only after the opener; otherwise the `|` of
      // "#|" followed by a `#` (as in "#|#") would be taken as the closing
      // "|#".
      i64 close = sv_find_subcstr(sv_cut(current, 2), "|#", 2);
      if (close < 0)
        inp->offset = inp->str.size;
      else
        inp->offset += 2 + close + 2; // opener + body + terminator
    }
    // Nothing to skip, stop.
    else
      break;
  }
}
|
|
|
|
// Parse an optionally `-`-prefixed run of decimal digits at the cursor into an
// integer object stored in `*ret`, then advance past it.
//
// Values whose magnitude does not fit in an i64 saturate at INT64_MAX (negated
// when the literal is negative).  Always returns PERR_OK.
perr_t parse_int(context_t *ctx, input_t *inp, lisp_t **ret)
{
  debug("parse_int[%lu] => ", inp->offset);

  // TODO: Parse arbitrary sized integers
  (void)ctx;

  bool negative = (input_peek(inp, 0) == '-');
  sv_t current  = sv_cut(inp->str, inp->offset + (negative ? 1 : 0));
  sv_t digits   = sv_chop(current, sv_while(current, is_digit));

  debug("`" PR_SV "` => ", SV_FMT(digits));

  // Convert within the bounds of the view: `digits` is a slice of the input
  // and is not known to be NUL-terminated, so it must not be handed to
  // strtol(), which reads until it meets a non-digit byte.
  i64 magnitude = 0;
  for (u64 i = 0; i < digits.size; ++i)
  {
    i64 digit = digits.data[i] - '0';
    // Would `magnitude * 10 + digit` exceed INT64_MAX?  Saturate instead of
    // overflowing (signed overflow is undefined behaviour).
    if (magnitude > (INT64_MAX - digit) / 10)
    {
      magnitude = INT64_MAX;
      break;
    }
    magnitude = magnitude * 10 + digit;
  }

  i64 x = negative ? -magnitude : magnitude;
  debug("%ld\n", x);

  input_next(inp, digits.size + (negative ? 1 : 0));

  *ret = make_int(x);
  return PERR_OK;
}
|
|
|
|
// Parse the run of symbol characters at the cursor.
//
// A three-letter run spelling "nil" in any letter case yields NIL; anything
// else becomes a symbol created with make_sym().  The cursor is advanced past
// the run and PERR_OK is returned.
perr_t parse_sym(context_t *ctx, input_t *inp, lisp_t **ret)
{
  debug("parse_sym[%lu] => ", inp->offset);

  sv_t rest = sv_cut(inp->str, inp->offset);
  sv_t name = sv_chop(rest, sv_while(rest, is_sym));
  debug("`" PR_SV "` => ", SV_FMT(name));

  // NOTE: NIL symbol to actual NIL.  The text lives in `read` space, so it is
  // compared letter by letter instead of being upper-cased in place.
  bool is_nil = name.size == 3 && toupper(name.data[0]) == 'N' &&
                toupper(name.data[1]) == 'I' && toupper(name.data[2]) == 'L';
  if (is_nil)
  {
    input_next(inp, 3);
    *ret = NIL;
    return PERR_OK;
  }

  *ret = make_sym(ctx, name.data, name.size);
  debug("%#16lx\n", *ret);
  input_next(inp, name.size);

  return PERR_OK;
}
|
|
|
|
// Parse a two-character boolean literal at the cursor.  Only the second
// character is inspected: `t` gives true, `f` gives false, anything else
// yields PERR_EXPECTED_BOOLEAN without moving the cursor.
perr_t parse_bool(context_t *ctx, input_t *inp, lisp_t **ret)
{
  (void)ctx;
  debug("parse_bool[%lu] => ", inp->offset);

  bool value;
  switch (input_peek(inp, 1))
  {
  case 't':
    value = true;
    break;
  case 'f':
    value = false;
    break;
  default:
    return PERR_EXPECTED_BOOLEAN;
  }

  *ret = tag_bool(value);
  input_next(inp, 2);
  return PERR_OK;
}
|
|
|
|
// Parse a parenthesised list starting at the `(` under the cursor.
//
// Elements are parsed with parse() and chained into cons cells; an empty list
// yields NIL.  Once a `.` symbol has been read, the next element is stored as
// the tail of the list instead of in a new cell.
//
// Returns the error of a failing element parse, PERR_EXPECTED_CLOSE_BRACKET if
// the input ends before `)`, or PERR_OK with the list in `*ret`.
perr_t parse_cons(context_t *ctx, input_t *inp, lisp_t **ret)
{
  // TODO: Put this in a symbol table
  lisp_t *lisp_dot = make_sym(ctx, ".", 1);
  debug("parse_cons[%lu] => (\n", inp->offset);
  inp->offset += 1;
  // Whitespace or comments may sit between `(` and the first element or the
  // closing bracket (e.g. "( )"); without this the loop below would hand the
  // `)` to parse() and fail with PERR_UNEXPECTED_CHAR.
  input_skip(inp);

  lisp_t *root = NIL;
  lisp_t **cur = NULL; // slot the next element gets linked into
  bool dotted  = false;

  while (!input_eof(inp) && input_peek(inp, 0) != ')')
  {
    lisp_t *lisp = NIL;
    perr_t res   = parse(ctx, inp, &lisp);
    if (res)
      return res;

    // This is cheap to do
    if (lisp == lisp_dot)
    {
      dotted = true;
      continue;
    }

    if (!root)
    {
      root = make_cons(ctx, lisp, NIL);
      cur  = &root;
    }
    else if (!dotted)
      *cur = make_cons(ctx, lisp, NIL);
    else
      *cur = lisp;

    // After a proper element, the next slot is the cdr of the cell just made.
    if (cur && !dotted)
      cur = &as_cons(*cur)->cdr;

    input_skip(inp);
  }

  if (input_peek(inp, 0) != ')')
    return PERR_EXPECTED_CLOSE_BRACKET;

  input_next(inp, 1);

  debug(")\n");
  *ret = root;
  return PERR_OK;
}
|
|
|
|
// Parse a vector literal starting at the `#(` under the cursor.
//
// Elements are parsed with parse() and appended to a fresh vector until `)`.
// Returns the error of a failing element parse, PERR_EXPECTED_CLOSE_BRACKET if
// the input ends before `)`, or PERR_OK with the vector in `*ret`.
perr_t parse_vec(context_t *ctx, input_t *inp, lisp_t **ret)
{
  debug("parse_vec[%lu] => [\n", inp->offset);
  // Step over the two-character opener directly, the same way parse_cons
  // steps over its `(`, so the cursor always leaves the `#` behind even when
  // "#(" are the last bytes of the input.
  inp->offset += 2;
  // Whitespace or comments may sit between the opener and the first element
  // or the closing bracket (e.g. "#( )").
  input_skip(inp);

  lisp_t *lvec = make_vec(ctx, 0);
  vec_t *vec   = as_vec(lvec);

  while (!input_eof(inp) && input_peek(inp, 0) != ')')
  {
    lisp_t *lisp = NIL;
    perr_t res   = parse(ctx, inp, &lisp);
    if (res)
      return res;

    vec_append(&ctx->memory, vec, &lisp, sizeof(lisp));
    input_skip(inp);
  }

  if (input_peek(inp, 0) != ')')
    return PERR_EXPECTED_CLOSE_BRACKET;

  input_next(inp, 1);

  debug("]\n");
  *ret = lvec;
  return PERR_OK;
}
|
|
|
|
// Parse a string literal starting at the speechmark under the cursor.
//
// The content is everything up to the next `"`; no escape sequences are
// interpreted.  Returns PERR_EXPECTED_SPEECH_MARK when no closing speechmark
// is found, otherwise PERR_OK with the string object in `*ret`.
perr_t parse_str(context_t *ctx, input_t *inp, lisp_t **ret)
{
  debug("parse_str[%lu] => ", inp->offset);
  input_next(inp, 1); // step over the opening speechmark

  sv_t tail  = sv_cut(inp->str, inp->offset);
  i64 length = sv_find_subcstr(tail, "\"", 1);
  if (length < 0)
    return PERR_EXPECTED_SPEECH_MARK;

  sv_t body = sv_chop(tail, length);
  input_next(inp, length + 1); // content plus the closing speechmark

  debug("\"" PR_SV "\" => ", SV_FMT(body));
  *ret = make_str(ctx, body.data, body.size);
  debug("%#16lx\n", *ret);
  return PERR_OK;
}
|
|
|
|
// Parse a quoted form: `'x` becomes the list (quote x) and `` `x `` becomes
// (quasiquote x).
//
// Returns PERR_UNEXPECTED_CHAR if the cursor is not on one of the two quote
// characters, the error of the nested parse() if it fails, else PERR_OK.
perr_t parse_quote(context_t *ctx, input_t *inp, lisp_t **ret)
{
  sv_t prefix = {0};
  switch (input_peek(inp, 0))
  {
  case '\'':
    prefix = SV("quote", 5);
    break;
  case '`':
    prefix = SV("quasiquote", 10);
    break;
  default:
    return PERR_UNEXPECTED_CHAR;
  }
  input_next(inp, 1);

  // Build the head cell first, then parse the quoted expression into its tail.
  lisp_t *head   = make_sym(ctx, prefix.data, prefix.size);
  lisp_t *root   = make_cons(ctx, head, NIL);
  lisp_t *quoted = NIL;

  perr_t perr = parse(ctx, inp, &quoted);
  if (perr)
    return perr;

  CDR(root) = make_cons(ctx, quoted, NIL);
  *ret      = root;
  return PERR_OK;
}
|
|
|
|
// TODO: Make this interactable with user once we have evaluation
|
|
// Dispatch on the character following the one under the cursor:
//   `(`        -> parse_vec
//   `t` / `f`  -> parse_bool
// Every other character, including the not-yet-implemented `\` and `e` forms,
// ends in PERR_UNEXPECTED_READER_MACRO_SYMBOL.
perr_t parse_reader_macro(context_t *ctx, input_t *inp, lisp_t **ret)
{
  switch (input_peek(inp, 1))
  {
  case '\\':
  {
    // character or weird base integer
    TODO("Not implemented reader macro for characters or weird bases");
    break;
  }
  case '(':
    return parse_vec(ctx, inp, ret);
  case 't':
  case 'f':
    return parse_bool(ctx, inp, ret);
  case 'e':
    // Scientific notation for floats
    break;
  default:
    break;
  }
  return PERR_UNEXPECTED_READER_MACRO_SYMBOL;
}
|
|
|
|
static_assert(NUM_TAGS == 9);
|
|
// Parse a single expression from `inp` into `*ret`.
//
// Skippable input is consumed first; if nothing is left PERR_EOF is returned.
// Otherwise the first character selects the sub-parser, checked in this
// order: integer, `#` reader macro, symbol, list, quote, string.  A character
// matching none of them yields PERR_UNEXPECTED_CHAR.
perr_t parse(context_t *ctx, input_t *inp, lisp_t **ret)
{
  debug("parse => ");
  input_skip(inp);
  if (input_eof(inp))
    return PERR_EOF;

  char c = input_peek(inp, 0);

  // A `-` only starts an integer when a digit follows it directly.
  if (is_digit(c) || (c == '-' && is_digit(input_peek(inp, 1))))
    return parse_int(ctx, inp, ret);
  if (c == '#')
    return parse_reader_macro(ctx, inp, ret);
  if (is_sym(c))
    return parse_sym(ctx, inp, ret);
  if (c == '(')
    return parse_cons(ctx, inp, ret);
  if (c == '\'' || c == '`')
    return parse_quote(ctx, inp, ret);
  if (c == '\"')
    return parse_str(ctx, inp, ret);

  return PERR_UNEXPECTED_CHAR;
}
|
|
|
|
// Parse every expression in `inp`, appending each result to `vec`.
//
// Stops at the first failing parse() and returns its error; returns PERR_OK
// once the input is exhausted.
perr_t parse_all(context_t *ctx, input_t *inp, vec_t *vec)
{
  // Consume leading whitespace/comments up front: input that contains nothing
  // else is empty rather than erroneous, and would otherwise make the first
  // parse() call report PERR_EOF.
  input_skip(inp);

  while (!input_eof(inp))
  {
    lisp_t *member = NIL;
    perr_t err     = parse(ctx, inp, &member);
    if (err)
      return err;

    vec_append(&ctx->scratch, vec, &member, sizeof(member));

    input_skip(inp);
  }
  return PERR_OK;
}
|
|
|
|
// Write a one-line diagnostic for `error` to `fp`, in the form
// `name:line:col: message`, where the position is that of the cursor of
// `inp`.  For PERR_UNEXPECTED_CHAR the offending character is appended.
//
// Returns `error`.
int print_perror(FILE *fp, input_t *inp, perr_t error)
{
  pos_t pos = input_offset_to_pos(inp);
  fprintf(fp, "%s:%lu:%lu: %s", inp->name, pos.line, pos.col,
          perr_to_cstr(error));

  // Only this error carries extra detail.
  if (error == PERR_UNEXPECTED_CHAR)
    fprintf(fp, "(`%c`)", input_peek(inp, 0));

  // Terminate the line on the same stream the message went to.
  fprintf(fp, "\n");

  return error;
}
|
|
|
|
// Translate the cursor of `inp` into a 1-based line/column pair by walking
// the text before the cursor: each newline starts a new line at column 1,
// every other byte moves one column to the right.
pos_t input_offset_to_pos(input_t *inp)
{
  pos_t pos = {.col = 1, .line = 1};

  // Never walk past the end of the text, even if the cursor is beyond it.
  u64 limit = inp->offset < inp->str.size ? inp->offset : inp->str.size;
  for (u64 i = 0; i < limit; ++i)
  {
    if (inp->str.data[i] != '\n')
    {
      ++pos.col;
      continue;
    }
    ++pos.line;
    pos.col = 1;
  }
  return pos;
}
|