/* Copyright (C) 2025 Aryadev Chavali * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU General Public License Version 2 for * details. * You may distribute and modify this code under the terms of the GNU General * Public License Version 2, which you should have received a copy of along with * this program. If not, please go to . * Created: 2025-04-16 * Description: Implementation of parser */ #include #include #include bool is_digit(char c) { return isdigit(c); } bool is_alpha(char c) { return isalpha(c); } bool is_space(char c) { return isspace(c); } bool is_skip(char c) { return is_space(c) || c == ';'; } bool is_sym(char c) { return strchr(SYM_CHARS, c) != NULL; } void input_from_sv(context_t *ctx, input_t *inp, const char *name, sv_t sv) { inp->name = name; inp->str = sv_copy(&ctx->read, sv); } void input_from_fp(context_t *ctx, input_t *input, const char *name, FILE *fp) { input->name = name; // TODO: Choose a best fit (i.e. maximal capacity, unused) page page_t *page = page_create(INPUT_CHUNK_SIZE); // chunk should be in scratch space so we can reset it later. char *chunk = context_salloc(ctx, INPUT_CHUNK_SIZE); u64 total_size = 0, size_read = 0; while (!feof(fp)) { size_read = fread(chunk, 1, INPUT_CHUNK_SIZE, fp); if (size_read > 0) { page_rappend(&page, chunk, size_read); total_size += size_read; } } input->str = SV((char *)page->data, total_size); // Memory cleanup context_reset_scratch(ctx); arena_attach(&ctx->read, page); } bool input_eof(input_t *input) { return !input || (input->offset >= input->str.size) || (input->str.data[input->offset] == '\0'); } char input_peek(input_t *input, u64 offset) { if (input_eof(input) || input->offset + offset >= input->str.size) return '\0'; return input->str.data[input->offset + offset]; } char input_next(input_t *input, u64 offset) { if (input_eof(input) || input->offset + offset >= input->str.size) return '\0'; input->offset += offset; return input->str.data[input->offset]; } void input_skip(input_t *inp) { while (!input_eof(inp)) { sv_t current = sv_cut(inp->str, inp->offset); // Skip any whitespace inp->offset += sv_while(current, is_space); current = sv_cut(inp->str, inp->offset); // Is there a comment to skip? if (current.size && current.data[0] == ';') { // Skip till newline i64 newline = sv_find_subcstr(current, "\n", 1); if (newline < 0) inp->offset = inp->str.size; else inp->offset += newline + 1; // Then skip any whitespace } // Multiline comment to skip? else if (current.size > 2 && strncmp(current.data, "#|", 2) == 0) { i64 offset = sv_find_subcstr(current, "|#", 2); if (offset < 0) inp->offset = inp->str.size; else inp->offset += offset + 2; // Then skip any whitespace } // Nothing to skip, stop. else break; } return; } perr_t parse_int(context_t *ctx, input_t *inp, lisp_t **ret) { debug("parse_int[%lu] => ", inp->offset); // TODO: Parse arbitrary sized integers (void)ctx; bool negative = (input_peek(inp, 0) == '-'); sv_t current = sv_cut(inp->str, inp->offset + (negative ? 1 : 0)); sv_t digits = sv_chop(current, sv_while(current, is_digit)); debug("`" PR_SV "` => ", SV_FMT(digits)); i64 x = (negative ? -1L : 1L) * strtol(digits.data, NULL, 10); debug("%ld\n", x); input_next(inp, digits.size + (negative ? 1 : 0)); *ret = make_int(x); return PERR_OK; } perr_t parse_sym(context_t *ctx, input_t *inp, lisp_t **ret) { debug("parse_sym[%lu] => ", inp->offset); sv_t current = sv_cut(inp->str, inp->offset); sv_t sym = sv_chop(current, sv_while(current, is_sym)); debug("`" PR_SV "` => ", SV_FMT(sym)); if (sym.size == 3) { // NOTE: We can't mutate sym directly because it's on `read` space. // TODO: Make this beautiful please. char buf[3]; for (u64 i = 0; i < 3; ++i) buf[i] = toupper(sym.data[i]); // NOTE: NIL symbol to actual NIL if (strncmp(buf, "NIL", 3) == 0) { input_next(inp, 3); *ret = NIL; return PERR_OK; } } *ret = make_sym(ctx, sym.data, sym.size); debug("%#16lx\n", *ret); input_next(inp, sym.size); return PERR_OK; } perr_t parse_bool(context_t *ctx, input_t *inp, lisp_t **ret) { (void)ctx; debug("parse_bool[%lu] => ", inp->offset); char c = input_peek(inp, 1); bool b = -1; if (c == 't') b = true; else if (c == 'f') b = false; else return PERR_EXPECTED_BOOLEAN; *ret = tag_bool(b); input_next(inp, 2); return PERR_OK; } perr_t parse_cons(context_t *ctx, input_t *inp, lisp_t **ret) { // TODO: Put this in a symbol table lisp_t *lisp_dot = make_sym(ctx, ".", 1); debug("parse_cons[%lu] => (\n", inp->offset); inp->offset += 1; lisp_t *root = NIL; lisp_t **cur = NIL; bool dotted = false; while (!input_eof(inp) && input_peek(inp, 0) != ')') { lisp_t *lisp = NIL; perr_t res = parse(ctx, inp, &lisp); if (res) return res; // This is cheap to do if (lisp == lisp_dot) { dotted = true; continue; } if (!root) { root = make_cons(ctx, lisp, NIL); cur = &root; } else if (!dotted) *cur = make_cons(ctx, lisp, NIL); else *cur = lisp; if (cur && !dotted) cur = &as_cons(*cur)->cdr; input_skip(inp); } if (input_peek(inp, 0) != ')') return PERR_EXPECTED_CLOSE_BRACKET; input_next(inp, 1); debug(")\n"); *ret = root; return PERR_OK; } perr_t parse_vec(context_t *ctx, input_t *inp, lisp_t **ret) { debug("parse_vec[%lu] => [\n", inp->offset); input_next(inp, 2); lisp_t *lvec = make_vec(ctx, 0); vec_t *vec = as_vec(lvec); while (!input_eof(inp) && input_peek(inp, 0) != ')') { lisp_t *lisp = NIL; perr_t res = parse(ctx, inp, &lisp); if (res) return res; vec_append(&ctx->memory, vec, &lisp, sizeof(lisp)); input_skip(inp); } if (input_peek(inp, 0) != ')') return PERR_EXPECTED_CLOSE_BRACKET; input_next(inp, 1); debug("]\n"); *ret = lvec; return PERR_OK; } perr_t parse_str(context_t *ctx, input_t *inp, lisp_t **ret) { debug("parse_str[%lu] => ", inp->offset); input_next(inp, 1); // 1 for the first speechmark sv_t sv = sv_cut(inp->str, inp->offset); i64 size = sv_find_subcstr(sv, "\"", 1); if (size < 0) return PERR_EXPECTED_SPEECH_MARK; input_next(inp, size + 1); // 1 for that last speechmark sv_t str_content = sv_chop(sv, size); debug("\"" PR_SV "\" => ", SV_FMT(str_content)); *ret = make_str(ctx, str_content.data, str_content.size); debug("%#16lx\n", *ret); return PERR_OK; } perr_t parse_quote(context_t *ctx, input_t *inp, lisp_t **ret) { char c = input_peek(inp, 0); if (!(c == '\'' || c == '`')) return PERR_UNEXPECTED_CHAR; input_next(inp, 1); sv_t prefix = {0}; if (c == '\'') prefix = SV("quote", 5); else if (c == '`') prefix = SV("quasiquote", 10); lisp_t *root = make_cons(ctx, make_sym(ctx, prefix.data, prefix.size), NIL); lisp_t *rest = NIL; perr_t perr = parse(ctx, inp, &rest); if (perr) return perr; CDR(root) = make_cons(ctx, rest, NIL); *ret = root; return PERR_OK; } // TODO: Make this interactable with user once we have evaluation perr_t parse_reader_macro(context_t *ctx, input_t *inp, lisp_t **ret) { char c = input_peek(inp, 1); if (c == '\\') { // character or weird base integer TODO("Not implemented reader macro for characters or weird bases"); } else if (c == '(') return parse_vec(ctx, inp, ret); else if (c == 't' || c == 'f') return parse_bool(ctx, inp, ret); else if (c == 'e') { // Scientific notation for floats } return PERR_UNEXPECTED_READER_MACRO_SYMBOL; } static_assert(NUM_TAGS == 9); perr_t parse(context_t *ctx, input_t *inp, lisp_t **ret) { debug("parse => "); input_skip(inp); if (input_eof(inp)) return PERR_EOF; char c = input_peek(inp, 0); if (is_digit(c) || (c == '-' && is_digit(input_peek(inp, 1)))) return parse_int(ctx, inp, ret); else if (c == '#') return parse_reader_macro(ctx, inp, ret); else if (is_sym(c)) return parse_sym(ctx, inp, ret); else if (c == '(') return parse_cons(ctx, inp, ret); else if (c == '\'' || c == '`') return parse_quote(ctx, inp, ret); else if (c == '\"') return parse_str(ctx, inp, ret); else return PERR_UNEXPECTED_CHAR; } perr_t parse_all(context_t *ctx, input_t *inp, vec_t *vec) { while (!input_eof(inp)) { lisp_t *member = NIL; perr_t err = parse(ctx, inp, &member); if (err) return err; else vec_append(&ctx->scratch, vec, &member, sizeof(member)); input_skip(inp); } return PERR_OK; } int print_perror(FILE *fp, input_t *inp, perr_t error) { pos_t pos = input_offset_to_pos(inp); fprintf(fp, "%s:%lu:%lu: %s", inp->name, pos.line, pos.col, perr_to_cstr(error)); switch (error) { case PERR_UNEXPECTED_CHAR: fprintf(fp, "(`%c`)", input_peek(inp, 0)); break; case PERR_OK: case PERR_EOF: case PERR_EXPECTED_BOOLEAN: case PERR_UNEXPECTED_READER_MACRO_SYMBOL: case PERR_EXPECTED_CLOSE_BRACKET: case PERR_EXPECTED_SPEECH_MARK: default: break; } fprintf(stderr, "\n"); return error; } pos_t input_offset_to_pos(input_t *inp) { pos_t pos = {.col = 1, .line = 1}; for (u64 i = 0; i < inp->offset && i < inp->str.size; ++i) { char c = (inp->str.data[i]); if (c == '\n') { ++pos.line; pos.col = 1; } else { ++pos.col; } } return pos; }