diff options
author | Aryadev Chavali <aryadev@aryadevchavali.com> | 2025-05-09 18:29:52 +0100 |
---|---|---|
committer | Aryadev Chavali <aryadev@aryadevchavali.com> | 2025-05-09 18:29:52 +0100 |
commit | ba5c0a4579ece5d53c009a14d00e683e70b982f4 (patch) | |
tree | ad7e6788b8ce634172f9a5cdee0a1a9ac08c7788 | |
parent | 576bf0f3085022e9117d78e3b4e19971c82a61d6 (diff) | |
download | oats-ba5c0a4579ece5d53c009a14d00e683e70b982f4.tar.gz oats-ba5c0a4579ece5d53c009a14d00e683e70b982f4.tar.bz2 oats-ba5c0a4579ece5d53c009a14d00e683e70b982f4.zip |
Initial implementation
-rw-r--r-- | Makefile | 51 | ||||
-rw-r--r-- | base.h | 78 | ||||
-rw-r--r-- | eval.c | 24 | ||||
-rw-r--r-- | eval.h | 28 | ||||
-rw-r--r-- | lisp.c | 317 | ||||
-rw-r--r-- | lisp.h | 59 | ||||
-rw-r--r-- | main.c | 92 | ||||
-rw-r--r-- | memory.c | 275 | ||||
-rw-r--r-- | memory.h | 66 | ||||
-rw-r--r-- | reader.c | 419 | ||||
-rw-r--r-- | reader.h | 98 | ||||
-rw-r--r-- | sv.c | 158 | ||||
-rw-r--r-- | sv.h | 70 | ||||
-rw-r--r-- | tag.c | 94 | ||||
-rw-r--r-- | tag.h | 90 | ||||
-rw-r--r-- | tasks.org | 230 | ||||
-rw-r--r-- | vec.c | 38 | ||||
-rw-r--r-- | vec.h | 30 |
18 files changed, 2217 insertions, 0 deletions
diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..c6d5504 --- /dev/null +++ b/Makefile @@ -0,0 +1,51 @@ +CC=gcc +OUT=oats.out +LIBS=-I. +ARGS= + +RELEASE=0 +GFLAGS=-Wall -Wextra -Wswitch-enum -Werror -std=c11 +DFLAGS=-ggdb -fsanitize=address -fsanitize=undefined +RFLAGS=-O3 +DEPFLAGS=-MT $@ -MMD -MP -MF + +ifeq ($(RELEASE), 1) +CFLAGS=$(GFLAGS) $(RFLAGS) -DDEBUG=0 +else +CFLAGS=$(GFLAGS) $(DFLAGS) +endif + +DIST=build +SRC:=tag.c memory.c vec.c sv.c lisp.c reader.c eval.c main.c +OBJ:=$(SRC:%.c=$(DIST)/%.o) +DEPDIR=$(DIST)/dependencies +DEPS:=$(SRC:%.c=$(DEPDIR)/%.d) + +.PHONY: all +all: $(DIST)/$(OUT) + +$(DIST)/%.o: %.c | $(DIST) $(DEPDIR) + $(CC) $(CFLAGS) $(DEPFLAGS) $(DEPDIR)/$*.d -c $< -o $@ $(LIBS) + +$(DIST)/$(OUT): $(OBJ) | $(DIST) + $(CC) $(CFLAGS) $^ -o $@ $(LIBS) + +.PHONY: run +run: $(DIST)/$(OUT) + ./$^ $(ARGS) + +.PHONY: +clean: + rm -rfv $(DIST)/* + +$(DIST): + mkdir -p $(DIST) + +$(DEPDIR): + mkdir -p $(DEPDIR) + +.PHONY: +watch: + find . -type 'f' -regex ".*.c\\|.*.h" | entr -cs "make run" + +-include $(DEPS) @@ -0,0 +1,78 @@ +/* Copyright (C) 2025 Aryadev Chavali + + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License Version 2 for + * details. + + * You may distribute and modify this code under the terms of the GNU General + * Public License Version 2, which you should have received a copy of along with + * this program. If not, please go to <https://www.gnu.org/licenses/>. + + * Created: 2025-03-31 + * Description: Base + */ + +#ifndef BASE_H +#define BASE_H + +#include <assert.h> +#include <stdarg.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> + +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; + +typedef int8_t i8; +typedef int16_t i16; +typedef int32_t i32; +typedef int64_t i64; + +typedef float f32; +typedef double f64; +static_assert(sizeof(f32) == sizeof(u32)); +static_assert(sizeof(f64) == sizeof(u64)); + +#define MAX(A, B) ((A) > (B) ? (A) : (B)) +#define MIN(A, B) ((A) < (B) ? (A) : (B)) +#define NTH_BYTE(X, N) (((X) >> (8 * N)) & ((1 << 8) - 1)) +#define ARR_SIZE(XS) (sizeof(XS) / sizeof((XS)[0])) + +#define TODO(MSG) (assert(false && MSG)); + +#ifndef DEBUG +#define DEBUG 2 +#endif + +static inline void debug(char *fmt, ...) +{ +#if DEBUG > 1 + va_list ap; + va_start(ap, fmt); + vprintf(fmt, ap); + va_end(ap); +#endif +} + +static inline void info(char *fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + vprintf(fmt, ap); + va_end(ap); +} + +static inline void print_bits(u64 w) +{ + for (u64 i = 8 * sizeof(w); i > 0; --i) + { + printf("%c", ((w >> (i - 1)) & 1) == 1 ? '1' : '0'); + } + printf("\n"); +} + +#endif @@ -0,0 +1,24 @@ +/* Copyright (C) 2025 Aryadev Chavali + + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License Version 2 for + * details. + + * You may distribute and modify this code under the terms of the GNU General + * Public License Version 2, which you should have received a copy of along with + * this program. If not, please go to <https://www.gnu.org/licenses/>. + + * Created: 2025-04-18 + * Description: Evaluator implementation + */ + +#include "./eval.h" + +err_t eval(context_t *ctx, lisp_t *obj, lisp_t **ret) +{ + (void)ctx; + (void)obj; + (void)ret; + TODO("implement evaluator"); +} @@ -0,0 +1,28 @@ +/* Copyright (C) 2025 Aryadev Chavali + + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License Version 2 for + * details. + + * You may distribute and modify this code under the terms of the GNU General + * Public License Version 2, which you should have received a copy of along with + * this program. If not, please go to <https://www.gnu.org/licenses/>. + + * Created: 2025-04-18 + * Description: Evaluator + */ + +#ifndef EVAL_H +#define EVAL_H + +#include "./lisp.h" + +typedef enum +{ + ERR_OK = 0, +} err_t; + +err_t eval(context_t *ctx, lisp_t *obj, lisp_t **ret); + +#endif @@ -0,0 +1,317 @@ +/* Copyright (C) 2025 Aryadev Chavali + + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License Version 2 for + * details. + + * You may distribute and modify this code under the terms of the GNU General + * Public License Version 2, which you should have received a copy of along with + * this program. If not, please go to <https://www.gnu.org/licenses/>. + + * Created: 2025-04-06 + * Description: + */ + +#include "./lisp.h" +#include "./sv.h" + +#include <stdio.h> +#include <string.h> +#include <wchar.h> + +lisp_t *make_int(i64 integer) +{ + return tag_int(integer); +} + +lisp_t *make_sym(context_t *ctx, char *data, u64 size) +{ + // Small symbol optimisation + if (size <= 7) + return tag_ssym(data, size); + + // Make a cell with the data we need + cell_t *cell = context_alloc(ctx, sizeof(*cell) + size); + cell->size = size; + memcpy(cell->data, data, size); + + return tag_sym(cell); +} + +lisp_t *make_cons(context_t *ctx, lisp_t *car, lisp_t *cdr) +{ + cons_t *cons = context_alloc(ctx, sizeof(*cons)); + memset(cons, 0, sizeof(*cons)); + cons->car = car; + cons->cdr = cdr; + return tag_cons(cons); +} + +lisp_t *make_list(context_t *ctx, lisp_t **lisps, u64 size) +{ + if (!lisps) + return NIL; + lisp_t *cur = NIL; + for (u64 i = size; i > 0; --i) + cur = make_cons(ctx, lisps[i - 1], cur); + return cur; +} + +lisp_t *make_vec(context_t *ctx, u32 size) +{ + // Make a vector with the content + lisp_t *container = NIL; + vec_t *vec = context_alloc(ctx, sizeof(*vec)); + vec->size = 0; + vec->cap = size * sizeof(&container); + if (size == 0) + vec->data = NULL; + else + vec->data = context_alloc(ctx, vec->cap); + container = tag_vec(vec); + + return container; +} + +lisp_t *make_str(context_t *ctx, char *data, u64 size) +{ + if (size == 0) + // No need to allocate unless necessary + return tag_str(NIL); + // Make a vector with the content + vec_t *vec = context_alloc(ctx, sizeof(*vec)); + vec->data = context_alloc(ctx, sizeof(*vec->data) * size); + vec->cap = size; + vec->size = size; + memcpy(vec->data, data, size); + + return tag_str(vec); +} + +i64 as_int(lisp_t *obj) +{ + assert(IS_TAG(obj, INT)); + u64 p_obj = (u64)obj; + return UNTAG(p_obj, INT) | // Delete the tag + (NTH_BYTE(p_obj, 7) & 0x80) << 56 // duplicate the MSB (preserve sign) + ; +} + +cons_t *as_cons(lisp_t *obj) +{ + assert(IS_TAG(obj, CONS)); + return (cons_t *)UNTAG(obj, CONS); +} + +u32 as_char(lisp_t *obj) +{ + return (u32)UNTAG(obj, CHAR); +} + +cell_t *as_sym(lisp_t *obj) +{ + assert(IS_TAG(obj, SYM)); + return (cell_t *)UNTAG(obj, SYM); +} + +void as_ssym(lisp_t *obj, sv_t *container) +{ + assert(IS_TAG(obj, SSYM)); + u64 p_obj = (u64)obj; + container->size = NTH_BYTE(p_obj, 0) >> SHIFT_SSYM; + p_obj >>= 8; + memcpy(container->data, &p_obj, container->size); +} + +bool as_bool(lisp_t *obj) +{ + assert(IS_TAG(obj, BOOL)); + return (bool)UNTAG(obj, BOOL); +} + +vec_t *as_vec(lisp_t *obj) +{ + assert(IS_TAG(obj, VEC)); + return (vec_t *)UNTAG(obj, VEC); +} + +vec_t *as_str(lisp_t *obj) +{ + assert(IS_TAG(obj, STR)); + return (vec_t *)UNTAG(obj, STR); +} + +sv_t serialise(context_t *ctx, lisp_t *ptr) +{ + enum Tag t = tag_get(ptr); + switch (t) + { + case TAG_NIL: + { + return SV("NIL", 3); + } + case TAG_INT: + { + i64 i = as_int(ptr); + sv_t s = sv_fmt(&ctx->scratch, +#if DEBUG > 1 + "int[" +#endif + "%ld" +#if DEBUG > 1 + "]" +#endif + , + i); + return s; + } + case TAG_CHAR: + { + u32 codepoint = as_char(ptr); + return sv_fmt(&ctx->scratch, +#if DEBUG > 1 + "char[" +#endif + "%lc" +#if DEBUG > 1 + "]" +#endif + , + codepoint); + } + case TAG_SYM: + { + cell_t *cell = as_sym(ptr); + sv_t s = sv_make(&ctx->scratch, (char *)cell->data, cell->size); +#if DEBUG > 1 + s = sv_fmt(&ctx->scratch, "sym[" PR_SV "]", SV_FMT(s)); +#endif + return s; + } + case TAG_SSYM: + { + char data[7]; + sv_t rsym = SV(data, 0); + as_ssym(ptr, &rsym); + sv_t s = sv_copy(&ctx->scratch, rsym); + +#if DEBUG > 1 + s = sv_fmt(&ctx->scratch, "ssym[" PR_SV "]", SV_FMT(s)); +#endif + return s; + } + case TAG_BOOL: + { + return sv_fmt(&ctx->scratch, +#if DEBUG > 1 + "bool[" +#endif + "%s" +#if DEBUG > 1 + "]" +#endif + , + as_bool(ptr) ? "#t" : "#f"); + } + case TAG_CONS: + { + if (!CAR(ptr) && !CDR(ptr)) +#if DEBUG > 1 + return SV("lst[NIL]", 8); +#else + return SV("NIL", 3); +#endif + + sv_t s = {0}; + + for (lisp_t *lsp = ptr; lsp; lsp = IS_TAG(lsp, CONS) ? CDR(lsp) : NIL) + { + lisp_t *lmember = lsp; + if (IS_TAG(lsp, CONS)) + lmember = CAR(lmember); + + sv_t member = serialise(ctx, lmember); + s = sv_concat(&ctx->scratch, s, member); + if (IS_TAG(lsp, CONS) && !CDR(lsp)) + continue; + else if (IS_TAG(CDR(lsp), CONS)) + // normal list + s = sv_append(&ctx->scratch, s, " ", 1); + else + // dotted list + s = sv_append(&ctx->scratch, s, " . ", 3); + } + +#if DEBUG > 1 + s = sv_fmt(&ctx->scratch, "lst[" PR_SV "]", SV_FMT(s)); +#else + s = sv_fmt(&ctx->scratch, "(" PR_SV ")", SV_FMT(s)); +#endif + return s; + } + case TAG_VEC: + { + vec_t *vec = as_vec(ptr); + if (!vec) +#if DEBUG > 1 + return SV("vec[NIL]", 8); +#else + return SV("#()", 3); +#endif + else if (vec->size < sizeof(&ptr)) +#if DEBUG > 1 + return SV("vec[0/0 #()]", 13); +#else + return SV("#()", 3); +#endif + + sv_t s = {0}; + for (u64 i = 0; i < vec->size / sizeof(&ptr); ++i) + { + lisp_t *lmember = ((lisp_t **)vec->data)[i]; + sv_t member = serialise(ctx, lmember); + s = sv_concat(&ctx->scratch, s, member); + if (i == ((vec->size / sizeof(&ptr)) - 1)) + continue; + s = sv_append(&ctx->scratch, s, " ", 1); + } + +#if DEBUG > 1 + s = sv_fmt(&ctx->scratch, "vec[%lu/%lu #(" PR_SV ")]", vec->size, vec->cap, + SV_FMT(s)); +#else + s = sv_fmt(&ctx->scratch, "#(" PR_SV ")", SV_FMT(s)); +#endif + return s; + break; + } + case TAG_STR: + { + vec_t *vec = as_str(ptr); + sv_t sv = {0}; + if (vec) + sv = SV((char *)vec->data, vec->size); + else + sv = SV("", 0); + return sv_fmt(&ctx->scratch, +#if DEBUG > 1 + "str[" +#else + "\"" +#endif + PR_SV +#if DEBUG > 1 + "]" +#else + "\"" +#endif + , + SV_FMT(sv)); + } + case NUM_TAGS: + default: + assert(false && "serialise: unreachable"); + return SV(0, 0); + } +} @@ -0,0 +1,59 @@ +/* Copyright (C) 2025 Aryadev Chavali + + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License Version 2 for + * details. + + * You may distribute and modify this code under the terms of the GNU General + * Public License Version 2, which you should have received a copy of along with + * this program. If not, please go to <https://www.gnu.org/licenses/>. + + * Created: 2025-04-06 + * Description: Object model where we deal with contexts + */ + +#ifndef LISP_H +#define LISP_H + +#include "./base.h" +#include "./memory.h" +#include "./sv.h" +#include "./tag.h" +#include "./vec.h" + +typedef struct Cons +{ + lisp_t *car, *cdr; +} cons_t; + +typedef struct Cell +{ + u64 size; + u8 data[]; +} cell_t; + +#define NIL 0 +lisp_t *make_int(i64 integer); +lisp_t *make_sym(context_t *ctx, char *data, u64 size); +lisp_t *make_cons(context_t *ctx, lisp_t *car, lisp_t *cdr); +lisp_t *make_list(context_t *ctx, lisp_t **lisps, u64 size); +lisp_t *make_vec(context_t *ctx, u32 size); +lisp_t *make_str(context_t *ctx, char *data, u64 size); + +i64 as_int(lisp_t *obj); +u32 as_char(lisp_t *obj); +cell_t *as_sym(lisp_t *obj); +void as_ssym(lisp_t *obj, sv_t *sv); +bool as_bool(lisp_t *obj); +f64 as_float(lisp_t *obj); +cons_t *as_cons(lisp_t *obj); +vec_t *as_vec(lisp_t *obj); +vec_t *as_str(lisp_t *obj); + +sv_t serialise(context_t *ctx, lisp_t *lisp); + +#define CAR(PTR) (as_cons(PTR)->car) +#define CDR(PTR) (as_cons(PTR)->cdr) + +#endif @@ -0,0 +1,92 @@ +/* Copyright (C) 2025 Aryadev Chavali + + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License Version 2 for + * details. + + * You may distribute and modify this code under the terms of the GNU General + * Public License Version 2, which you should have received a copy of along with + * this program. If not, please go to <https://www.gnu.org/licenses/>. + + * Created: 2025-04-05 + * Description: Entrypoint + */ + +#include "./base.h" +#include "./lisp.h" +#include "./memory.h" +#include "./reader.h" +#include "./sv.h" +#include "./vec.h" + +#include <assert.h> +#include <ctype.h> +#include <malloc.h> +#include <stdarg.h> +#include <stdio.h> +#include <string.h> + +int main(int argc, char *argv[]) +{ + int exit = 0; + context_t ctx = {0}; + const char *filename = NULL; +#if 1 + if (argc > 1) + filename = argv[1]; + else + filename = "./r7rs-tests.scm"; + + FILE *fp = fopen(filename, "r"); + if (!fp) + { + fprintf(stderr, "[ERROR]: File `%s` does not exist\n", filename); + exit = 1; + goto end; + } + input_t inp = {0}; + input_from_fp(&ctx, &inp, filename, fp); + fclose(fp); +#else + filename = "<example>"; + char inp_data[] = "(print (+ 34 35))"; + input_t inp = {0}; + sv_t inp_sv = SV(inp_data, sizeof(inp_data)); + input_from_sv(&ctx, &inp, filename, inp_sv); +#endif + + if (inp.str.size == 0) + { + info("[WARNING] `%s` is empty.\n", filename); + goto end; + } + debug("[file read]: %luB read from `%s`\n", inp.str.size, filename); + + // Setup a vector to hold all the lisps + vec_t results = {0}; + perr_t perr = parse_all(&ctx, &inp, &results); + if (perr) + { + exit = perr; + print_perror(stderr, &inp, perr); + goto end; + } + context_reset_read(&ctx); + + lisp_t **lisps = (lisp_t **)results.data; + u64 size = results.size / sizeof(*lisps); + for (u64 i = 0; i < size; ++i) + { + lisp_t *lisp = lisps[i]; + // printf("tag=%x\n", tag_get(lisp)); + sv_t serialised = serialise(&ctx, lisp); + info("lisp[%lu]: %p => " PR_SV "\n", i, lisp, SV_FMT(serialised)); + } + context_report(&ctx); + +end: + + context_cleanup(&ctx); + return exit; +} diff --git a/memory.c b/memory.c new file mode 100644 index 0000000..0d9fe13 --- /dev/null +++ b/memory.c @@ -0,0 +1,275 @@ +/* Copyright (C) 2025 Aryadev Chavali + + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License Version 2 for + * details. + + * You may distribute and modify this code under the terms of the GNU General + * Public License Version 2, which you should have received a copy of along with + * this program. If not, please go to <https://www.gnu.org/licenses/>. + + * Created: 2025-04-05 + * Description: Implementations for memory models. + */ + +#include "./memory.h" + +#include <malloc.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +page_t *page_create(u64 size) +{ + size = MAX(size, PAGE_DEFAULT_SIZE); + page_t *page = calloc(1, sizeof(*page) + size); + page->next = NULL; + page->size = 0; + page->capacity = size; + return page; +} + +void page_resize(page_t **page, u64 size) +{ + if (!page) + return; + else if (!(*page)) + *page = page_create(size); + else if (page[0]->capacity < size) + { + page[0]->capacity = MAX(size, page[0]->capacity * 1.5); + page[0] = realloc(page[0], sizeof(*page[0]) + page[0]->capacity); + } +} + +i64 page_append(page_t *page, void *data, u64 size) +{ + if (!page || page->size + size >= page->capacity) + return -1; + if (data) + memcpy(page->data + page->size, data, size); + u64 ptr = page->size; + page->size += size; + return ptr; +} + +u64 page_rappend(page_t **page, void *data, u64 size) +{ + page_resize(page, page[0]->size + size); + if (data) + memcpy(page[0]->data + page[0]->size, data, size); + u64 ptr = page[0]->size; + page[0]->size += size; + return ptr; +} + +void *arena_alloc(arena_t *arena, u64 size) +{ + if (!arena) + return NULL; + else if (!arena->start) + { + arena->start = page_create(1); + arena->end = arena->start; + } + + page_t *best_fit; + for (best_fit = arena->start; best_fit; best_fit = best_fit->next) + if (best_fit->size + size + MEMORY_ALIGNMENT < best_fit->capacity) + break; + + if (!best_fit) + { + best_fit = page_create(size); + arena->end->next = best_fit; + arena->end = best_fit; + } + + // NOTE: Fsanitize has a hissy fit if we don't align memory "correctly" + u64 offset = 0; + + if (size % MEMORY_ALIGNMENT == 0) + { + u64 div = (((u64)(best_fit->data + best_fit->size)) % MEMORY_ALIGNMENT); + if (div != 0) + offset = MEMORY_ALIGNMENT - div; + } + + void *start = best_fit->data + best_fit->size + offset; + + best_fit->size += size + offset; + + return start; +} + +void *arena_realloc(arena_t *arena, void *ptr, u64 oldsize, u64 newsize) +{ + if (!ptr) + return arena_alloc(arena, newsize); + else if (!arena || !arena->start) + return NULL; + else if (newsize <= oldsize) + // No need to change anything. + return ptr; + + bool copy_into_new = true; + void *start = NULL; + page_t *old_page = NULL, *best_fit = NULL; + + for (page_t *page = arena->start; page; page = page->next) + { + if (!best_fit && page->size + newsize < page->capacity) + best_fit = page; + if (page->data <= (u8 *)ptr && + (u8 *)(ptr) + oldsize <= page->data + page->capacity) + old_page = page; + } + + // If the old page exists, ptr is the latest allocation in it, and it has + // enough space to contain the new size, just resize and return. + if (old_page && old_page->data + old_page->size - oldsize == ptr && + old_page->size - oldsize + newsize < old_page->capacity) + { + start = ptr; + old_page->size += newsize - oldsize; + copy_into_new = false; + } + else + { + if (!old_page) + copy_into_new = false; + if (!best_fit) + { + best_fit = page_create(newsize); + arena->end->next = best_fit; + arena->end = best_fit; + } + + start = best_fit->data + best_fit->size; + best_fit->size += newsize; + } + + if (copy_into_new) + { + memcpy(start, ptr, oldsize); + memset(start + oldsize, 0, newsize - oldsize); + } + + return start; +} + +void arena_attach(arena_t *arena, page_t *page) +{ + if (!arena || !page) + return; + else if (!arena->start || !arena->end) + { + arena->start = page; + arena->end = page; + } + else + { + page->next = arena->start; + arena->start = page; + } +} + +void arena_reset(arena_t *arena) +{ + if (!arena || !arena->start) + return; + + for (page_t *cur = arena->start; cur; cur = cur->next) + { + if (cur->size == 0) + continue; + cur->size = 0; + memset(cur->data, 0, cur->capacity); + } +} + +void arena_cleanup(arena_t *arena) +{ + if (!arena || !arena->start) + return; + + for (page_t *cur = arena->start, *next = NULL; cur; + next = cur->next, free(cur), cur = next) + continue; + + memset(arena, 0, sizeof(*arena)); +} + +// Allocates against stable memory i.e. we can have pointers of this lying +// around without any fear of them being thrown away. +void *context_alloc(context_t *context, u64 size) +{ + return arena_alloc(&context->memory, size); +} + +// Allocate against a "scratch space", separate from main memory, for internal +// use. +void *context_salloc(context_t *context, u64 size) +{ + return arena_alloc(&context->scratch, size); +} + +void context_reset_read(context_t *context) +{ + arena_reset(&context->read); +} + +void context_reset_scratch(context_t *context) +{ + arena_reset(&context->scratch); +} + +void context_reset(context_t *context) +{ + arena_reset(&context->memory); + arena_reset(&context->read); + arena_reset(&context->scratch); +} + +void context_cleanup(context_t *context) +{ + if (!context) + return; + arena_cleanup(&context->memory); + arena_cleanup(&context->read); + arena_cleanup(&context->scratch); + memset(context, 0, sizeof(*context)); +} + +void context_report(context_t *context) +{ +#if DEBUG + // Figure this out at runtime + u64 mem_used = 0, mem_cap = 0; + for (page_t *page = context->memory.start; page; page = page->next) + { + mem_used += page->size; + mem_cap += page->capacity; + } + + u64 read_used = 0, read_cap = 0; + for (page_t *page = context->read.start; page; page = page->next) + { + read_used += page->size; + read_cap += page->capacity; + } + + u64 scr_used = 0, scr_cap = 0; + for (page_t *page = context->scratch.start; page; page = page->next) + { + scr_used += page->size; + scr_cap += page->capacity; + } + + info("<Context>: %luB/%luB main memory used\n", mem_used, mem_cap); + info("<Context>: %luB/%luB read space used\n", read_used, read_cap); + info("<Context>: %luB/%luB scratch space used\n", scr_used, scr_cap); +#endif +} diff --git a/memory.h b/memory.h new file mode 100644 index 0000000..bbd4d45 --- /dev/null +++ b/memory.h @@ -0,0 +1,66 @@ +/* Copyright (C) 2025 Aryadev Chavali + + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License Version 2 for + * details. + + * You may distribute and modify this code under the terms of the GNU General + * Public License Version 2, which you should have received a copy of along with + * this program. If not, please go to <https://www.gnu.org/licenses/>. + + * Created: 2025-04-05 + * Description: Memory management structures + */ + +#ifndef MEMORY_H +#define MEMORY_H + +#include "./base.h" +#include <stdio.h> + +#define PAGE_DEFAULT_SIZE 1024 +#define MEMORY_ALIGNMENT 8 + +typedef struct Page +{ + struct Page *next; + u64 size, capacity; + u8 data[]; +} page_t; + +page_t *page_create(u64 size); +void page_resize(page_t **page, u64 newsize); +// Append - fail (by returning <0) if not enough space. +i64 page_append(page_t *page, void *data, u64 size); +// Append - will resize if necessary +u64 page_rappend(page_t **page, void *data, u64 size); + +typedef struct Aren +{ + page_t *start, *end; +} arena_t; + +void *arena_alloc(arena_t *arena, u64 size); +void *arena_realloc(arena_t *arena, void *ptr, u64 oldsize, u64 newsize); +void *arena_copy(arena_t *arena, void *ptr, u64 size); +void arena_attach(arena_t *arena, page_t *page); +void arena_reset(arena_t *arena); +void arena_cleanup(arena_t *arena); + +typedef struct Context +{ + arena_t memory, read, scratch; +} context_t; + +void *context_alloc(context_t *context, u64 size); +void *context_salloc(context_t *context, u64 size); +page_t *context_get_read_page(context_t *ctx); +void context_reset_read(context_t *context); +void context_reset_scratch(context_t *context); + +void context_reset(context_t *context); +void context_cleanup(context_t *context); +void context_report(context_t *context); + +#endif diff --git a/reader.c b/reader.c new file mode 100644 index 0000000..b9217d3 --- /dev/null +++ b/reader.c @@ -0,0 +1,419 @@ +/* Copyright (C) 2025 Aryadev Chavali + + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License Version 2 for + * details. + + * You may distribute and modify this code under the terms of the GNU General + * Public License Version 2, which you should have received a copy of along with + * this program. If not, please go to <https://www.gnu.org/licenses/>. + + * Created: 2025-04-16 + * Description: Implementation of parser + */ + +#include "./reader.h" + +#include <ctype.h> +#include <string.h> + +bool is_digit(char c) +{ + return isdigit(c); +} + +bool is_alpha(char c) +{ + return isalpha(c); +} + +bool is_space(char c) +{ + return isspace(c); +} + +bool is_skip(char c) +{ + return is_space(c) || c == ';'; +} + +bool is_sym(char c) +{ + return strchr(SYM_CHARS, c) != NULL; +} + +void input_from_sv(context_t *ctx, input_t *inp, const char *name, sv_t sv) +{ + inp->name = name; + inp->str = sv_copy(&ctx->read, sv); +} + +void input_from_fp(context_t *ctx, input_t *input, const char *name, FILE *fp) +{ + input->name = name; + // TODO: Choose a best fit (i.e. maximal capacity, unused) page + page_t *page = page_create(INPUT_CHUNK_SIZE); + // chunk should be in scratch space so we can reset it later. + char *chunk = context_salloc(ctx, INPUT_CHUNK_SIZE); + + u64 total_size = 0, size_read = 0; + while (!feof(fp)) + { + size_read = fread(chunk, 1, INPUT_CHUNK_SIZE, fp); + if (size_read > 0) + { + page_rappend(&page, chunk, size_read); + total_size += size_read; + } + } + + input->str = SV((char *)page->data, total_size); + + // Memory cleanup + context_reset_scratch(ctx); + arena_attach(&ctx->read, page); +} + +bool input_eof(input_t *input) +{ + return !input || (input->offset >= input->str.size) || + (input->str.data[input->offset] == '\0'); +} + +char input_peek(input_t *input, u64 offset) +{ + if (input_eof(input) || input->offset + offset >= input->str.size) + return '\0'; + return input->str.data[input->offset + offset]; +} + +char input_next(input_t *input, u64 offset) +{ + if (input_eof(input) || input->offset + offset >= input->str.size) + return '\0'; + input->offset += offset; + return input->str.data[input->offset]; +} + +void input_skip(input_t *inp) +{ + // n + 2 lookup + sv_t current = sv_cut(inp->str, inp->offset); + sv_t lookup = sv_chop(current, 2); + while ((!input_eof(inp) && is_space(lookup.data[0])) || + lookup.data[0] == ';' || strncmp(lookup.data, "#|", 2) == 0) + { + if (lookup.data[0] == ';') + { + i64 newline = sv_find_subcstr(current, "\n", 1); + if (newline < 0) + inp->offset = inp->str.size; + else + inp->offset += newline + 1; + } + else if (strncmp(lookup.data, "#|", 2) == 0) + { + i64 offset = sv_find_subcstr(current, "|#", 2); + if (offset < 0) + inp->offset = inp->str.size; + else + inp->offset += offset + 2; + } + + inp->offset += sv_while(sv_cut(inp->str, inp->offset), is_space); + current = sv_cut(inp->str, inp->offset); + lookup = sv_chop(current, 2); + } +} + +perr_t parse_int(context_t *ctx, input_t *inp, lisp_t **ret) +{ + debug("parse_int[%lu] => ", inp->offset); + + // TODO: Parse arbitrary sized integers + (void)ctx; + + bool negative = (input_peek(inp, 0) == '-'); + sv_t current = sv_cut(inp->str, inp->offset + (negative ? 1 : 0)); + sv_t digits = sv_chop(current, sv_while(current, is_digit)); + + debug("`" PR_SV "` => ", SV_FMT(digits)); + i64 x = (negative ? -1L : 1L) * strtol(digits.data, NULL, 10); + debug("%ld\n", x); + + input_next(inp, digits.size + (negative ? 1 : 0)); + + *ret = make_int(x); + return PERR_OK; +} + +perr_t parse_sym(context_t *ctx, input_t *inp, lisp_t **ret) +{ + debug("parse_sym[%lu] => ", inp->offset); + + sv_t current = sv_cut(inp->str, inp->offset); + sv_t sym = sv_chop(current, sv_while(current, is_sym)); + debug("`" PR_SV "`\n", SV_FMT(sym)); + + if (sym.size == 3) + { + // NOTE: We can't mutate sym directly because it's on `read` space. + + // TODO: Make this beautiful please. + char buf[3]; + for (u64 i = 0; i < 3; ++i) + buf[i] = toupper(sym.data[i]); + + // NOTE: NIL symbol to actual NIL + if (strncmp(buf, "NIL", 3) == 0) + { + input_next(inp, 3); + return NIL; + } + } + + lisp_t *lsym = make_sym(ctx, sym.data, sym.size); + input_next(inp, sym.size); + *ret = lsym; + + return PERR_OK; +} + +perr_t parse_bool(context_t *ctx, input_t *inp, lisp_t **ret) +{ + (void)ctx; + debug("parse_bool[%lu] => ", inp->offset); + char c = input_peek(inp, 1); + bool b = -1; + if (c == 't') + b = true; + else if (c == 'f') + b = false; + else + return PERR_EXPECTED_BOOLEAN; + *ret = tag_bool(b); + input_next(inp, 2); + return PERR_OK; +} + +perr_t parse_cons(context_t *ctx, input_t *inp, lisp_t **ret) +{ + // TODO: Put this in a symbol table + lisp_t *lisp_dot = make_sym(ctx, ".", 1); + debug("parse_cons[%lu] => (\n", inp->offset); + inp->offset += 1; + + lisp_t *root = NIL; + lisp_t **cur = NIL; + bool dotted = false; + + while (!input_eof(inp) && input_peek(inp, 0) != ')') + { + lisp_t *lisp = NIL; + perr_t res = parse(ctx, inp, &lisp); + if (res) + return res; + + // This is cheap to do + if (lisp == lisp_dot) + { + dotted = true; + continue; + } + + if (!root) + { + root = make_cons(ctx, lisp, NIL); + cur = &root; + } + else if (!dotted) + *cur = make_cons(ctx, lisp, NIL); + else + *cur = lisp; + + if (cur && !dotted) + cur = &as_cons(*cur)->cdr; + + input_skip(inp); + } + + if (input_peek(inp, 0) != ')') + return PERR_EXPECTED_CLOSE_BRACKET; + + input_next(inp, 1); + + debug(")\n"); + *ret = root; + return PERR_OK; +} + +perr_t parse_vec(context_t *ctx, input_t *inp, lisp_t **ret) +{ + debug("parse_vec[%lu] => [\n", inp->offset); + input_next(inp, 2); + + lisp_t *lvec = make_vec(ctx, 0); + vec_t *vec = as_vec(lvec); + + while (!input_eof(inp) && input_peek(inp, 0) != ')') + { + lisp_t *lisp = NIL; + perr_t res = parse(ctx, inp, &lisp); + if (res) + return res; + + vec_append(&ctx->memory, vec, &lisp, sizeof(lisp)); + input_skip(inp); + } + + if (input_peek(inp, 0) != ')') + return PERR_EXPECTED_CLOSE_BRACKET; + + input_next(inp, 1); + + debug("]\n"); + *ret = lvec; + return PERR_OK; +} + +perr_t parse_str(context_t *ctx, input_t *inp, lisp_t **ret) +{ + debug("parse_str[%lu] => ", inp->offset); + input_next(inp, 1); // 1 for the first speechmark + sv_t sv = sv_cut(inp->str, inp->offset); + i64 size = sv_find_subcstr(sv, "\"", 1); + if (size < 0) + return PERR_EXPECTED_SPEECH_MARK; + + input_next(inp, size + 1); // 1 for that last speechmark + sv_t str_content = sv_chop(sv, size); + debug("\"" PR_SV "\"\n", SV_FMT(str_content)); + *ret = make_str(ctx, str_content.data, str_content.size); + return PERR_OK; +} + +perr_t parse_quote(context_t *ctx, input_t *inp, lisp_t **ret) +{ + char c = input_peek(inp, 0); + if (!(c == '\'' || c == '`')) + return PERR_UNEXPECTED_CHAR; + input_next(inp, 1); + sv_t prefix = {0}; + if (c == '\'') + prefix = SV("quote", 5); + else if (c == '`') + prefix = SV("quasiquote", 10); + lisp_t *root = make_cons(ctx, make_sym(ctx, prefix.data, prefix.size), NIL); + lisp_t *rest = NIL; + perr_t perr = parse(ctx, inp, &rest); + if (perr) + return perr; + CDR(root) = make_cons(ctx, rest, NIL); + *ret = root; + return PERR_OK; +} + +// TODO: Make this interactable with user once we have evaluation +perr_t parse_reader_macro(context_t *ctx, input_t *inp, lisp_t **ret) +{ + char c = input_peek(inp, 1); + if (c == '\\') + { + // character or weird base integer + TODO("Not implemented reader macro for characters or weird bases"); + } + else if (c == '(') + { + return parse_vec(ctx, inp, ret); + } + else if (c == 't' || c == 'f') + return parse_bool(ctx, inp, ret); + return PERR_UNEXPECTED_READER_MACRO_SYMBOL; +} + +static_assert(NUM_TAGS == 9); +perr_t parse(context_t *ctx, input_t *inp, lisp_t **ret) +{ + debug("parse => "); + input_skip(inp); + if (input_eof(inp)) + return PERR_EOF; + + char c = input_peek(inp, 0); + + if (is_digit(c) || (c == '-' && is_digit(input_peek(inp, 1)))) + return parse_int(ctx, inp, ret); + else if (c == '#') + return parse_reader_macro(ctx, inp, ret); + else if (is_sym(c)) + return parse_sym(ctx, inp, ret); + else if (c == '(') + return parse_cons(ctx, inp, ret); + else if (c == '\'' || c == '`') + return parse_quote(ctx, inp, ret); + else if (c == '\"') + return parse_str(ctx, inp, ret); + else + return PERR_UNEXPECTED_CHAR; +} + +perr_t parse_all(context_t *ctx, input_t *inp, vec_t *vec) +{ + while (!input_eof(inp)) + { + lisp_t *member = NIL; + perr_t err = parse(ctx, inp, &member); + + if (err) + return err; + else + vec_append(&ctx->scratch, vec, &member, sizeof(member)); + + input_skip(inp); + } + return PERR_OK; +} + +int print_perror(FILE *fp, input_t *inp, perr_t error) +{ + pos_t pos = input_offset_to_pos(inp); + fprintf(fp, "%s:%lu:%lu: %s", inp->name, pos.line, pos.col, + perr_to_cstr(error)); + switch (error) + { + case PERR_UNEXPECTED_CHAR: + fprintf(fp, "(`%c`)", input_peek(inp, 0)); + break; + case PERR_OK: + case PERR_EOF: + case PERR_EXPECTED_BOOLEAN: + case PERR_UNEXPECTED_READER_MACRO_SYMBOL: + case PERR_EXPECTED_CLOSE_BRACKET: + case PERR_EXPECTED_SPEECH_MARK: + default: + break; + } + fprintf(stderr, "\n"); + + return error; +} + +pos_t input_offset_to_pos(input_t *inp) +{ + pos_t pos = {.col = 1, .line = 1}; + for (u64 i = 0; i < inp->offset && i < inp->str.size; ++i) + { + char c = (inp->str.data[i]); + if (c == '\n') + { + ++pos.line; + pos.col = 1; + } + else + { + ++pos.col; + } + } + return pos; +} diff --git a/reader.h b/reader.h new file mode 100644 index 0000000..07c3672 --- /dev/null +++ b/reader.h @@ -0,0 +1,98 @@ +/* Copyright (C) 2025 Aryadev Chavali + + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License Version 2 for + * details. + + * You may distribute and modify this code under the terms of the GNU General + * Public License Version 2, which you should have received a copy of along with + * this program. If not, please go to <https://www.gnu.org/licenses/>. + + * Created: 2025-04-16 + * Description: Parsing Lisp! + */ + +#ifndef READER_H +#define READER_H + +#include "./lisp.h" +#include "./memory.h" +#include "./vec.h" + +#define INPUT_CHUNK_SIZE 512 +static const char SYM_CHARS[] = + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" + "¬!£$%^&*_-+={[]}:@~#<,>.?/"; + +typedef struct +{ + const char *name; + u64 offset; + sv_t str; +} input_t; + +void input_from_sv(context_t *ctx, input_t *inp, const char *name, sv_t sv); +void input_from_fp(context_t *ctx, input_t *input, const char *name, FILE *fp); +bool input_eof(input_t *input); + +typedef struct +{ + u64 col, line; +} pos_t; + +pos_t input_offset_to_pos(input_t *inp); + +typedef enum +{ + PERR_OK = 0, + PERR_EOF, + PERR_UNEXPECTED_CHAR, + PERR_EXPECTED_CLOSE_BRACKET, + PERR_EXPECTED_SPEECH_MARK, + PERR_UNEXPECTED_READER_MACRO_SYMBOL, + PERR_EXPECTED_BOOLEAN, +} perr_t; + +static inline const char *perr_to_cstr(perr_t perr) +{ + switch (perr) + { + case PERR_OK: + return "OK"; + break; + case PERR_EOF: + return "EOF"; + break; + case PERR_UNEXPECTED_CHAR: + return "UNEXPECTED_CHAR"; + break; + case PERR_EXPECTED_CLOSE_BRACKET: + return "EXPECTED_CLOSE_BRACKET"; + break; + case PERR_EXPECTED_SPEECH_MARK: + return "EXPECTED_SPEECH_MARK"; + break; + case PERR_UNEXPECTED_READER_MACRO_SYMBOL: + return "UNEXPECTED_READER_MACRO_SYMBOL"; + break; + case PERR_EXPECTED_BOOLEAN: + return "EXPECTED_BOOLEAN"; + break; + } + assert(false && "perr_to_cstr: unreachable"); + return ""; +} + +typedef struct +{ + lisp_t *result; + perr_t error; +} pres_t; + +perr_t parse(context_t *ctx, input_t *str, lisp_t **ret); +perr_t parse_all(context_t *ctx, input_t *str, vec_t *vec); + +int print_perror(FILE *fp, input_t *inp, perr_t error); + +#endif @@ -0,0 +1,158 @@ +/* Copyright (C) 2025 Aryadev Chavali + + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License Version 2 for + * details. + + * You may distribute and modify this code under the terms of the GNU General + * Public License Version 2, which you should have received a copy of along with + * this program. If not, please go to <https://www.gnu.org/licenses/>. + + * Created: 2025-04-14 + * Description: String View implementation + */ + +#include "./sv.h" + +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +sv_t sv_make(arena_t *allocator, const char *data, u64 size) +{ + sv_t s = {0}; + if (data) + s = sv_append(allocator, s, data, size); + else + { + s.data = arena_alloc(allocator, size); + s.size = size; + } + return s; +} + +sv_t sv_copy(arena_t *allocator, sv_t sv) +{ + return sv_make(allocator, sv.data, sv.size); +} + +sv_t sv_substr(sv_t sv, u64 index, u64 size) +{ + sv_t newsv = {0}; + if (index + size > sv.size) + return newsv; + newsv.data = sv.data + index; + newsv.size = size; + return newsv; +} + +sv_t sv_cut(sv_t sv, u64 index) +{ + return sv_substr(sv, index, sv.size < index ? 0 : sv.size - index); +} + +sv_t sv_chop(sv_t sv, u64 size) +{ + return sv_substr(sv, 0, size); +} + +sv_t sv_concat(arena_t *allocator, sv_t a, sv_t b) +{ + sv_t c = sv_make(allocator, a.data, a.size + b.size); + memcpy(c.data + a.size, b.data, b.size); + return c; +} + +sv_t sv_append(arena_t *allocator, sv_t sv, const char *data, u64 size) +{ + if (!allocator) + return (sv_t){0}; + + sv_t newsv = {0}; + newsv.size = sv.size + size; + newsv.data = arena_realloc(allocator, sv.data, sv.size, newsv.size); + if (data) + memcpy(newsv.data + sv.size, data, size); + return newsv; +} + +sv_t sv_prepend(arena_t *allocator, sv_t sv, const char *data, u64 size) +{ + if (!allocator) + return (sv_t){0}; + + // TODO: Can we make this cheaper to do? + sv_t newsv = sv_make(allocator, NULL, size + sv.size); + // Copy over `data` to the left side + memcpy(newsv.data, data, size); + // Copy old string to the right side + if (sv.data) + memcpy(newsv.data + size, sv.data, sv.size); + + return newsv; +} + +sv_t sv_fmt(arena_t *allocator, char *fmt, ...) +{ + if (!allocator) + return (sv_t){0}; + + va_list ap_1, ap_2; + va_start(ap_1, fmt); + + va_copy(ap_2, ap_1); + u64 size = vsnprintf(NULL, 0, fmt, ap_2); + va_end(ap_2); + + sv_t sv = sv_make(allocator, NULL, size); + vsprintf(sv.data, fmt, ap_1); + va_end(ap_1); + + return sv; +} + +i64 sv_find_substr(const sv_t sv, const sv_t substr) +{ + if (substr.size == 0) + return 0; + else if (sv.size < substr.size) + return -1; + else if (sv.size == substr.size) + return strncmp(sv.data, substr.data, sv.size) == 0 ? 0 : -1; + + for (u64 i = 0; i < (sv.size - substr.size); ++i) + if (strncmp(sv.data + i, substr.data, substr.size) == 0) + return i; + return -1; +} + +i64 sv_find_subcstr(const sv_t sv, const char *substr, u64 size) +{ + return sv_find_substr(sv, SV((char *)substr, size)); +} + +i64 sv_find_any(const sv_t sv, const char *bag) +{ + for (u64 i = 0; i < sv.size; ++i) + if (strchr(bag, sv.data[i])) + return i; + return -1; +} + +u64 sv_while(const sv_t sv, bool (*pred)(char)) +{ + u64 i; + for (i = 0; i < sv.size && pred(sv.data[i]); ++i) + continue; + return i; +} + +u64 sv_till(const sv_t sv, bool (*pred)(char)) +{ + u64 i; + for (i = 0; i < sv.size && !pred(sv.data[i]); ++i) + continue; + return i; +} @@ -0,0 +1,70 @@ +/* Copyright (C) 2025 Aryadev Chavali + + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License Version 2 for + * details. + + * You may distribute and modify this code under the terms of the GNU General + * Public License Version 2, which you should have received a copy of along with + * this program. If not, please go to <https://www.gnu.org/licenses/>. + + * Created: 2025-04-14 + * Description: String Views + */ + +#ifndef SV_H +#define SV_H + +#include "./memory.h" + +typedef struct SV +{ + u64 size; + char *data; +} sv_t; + +#define SV(DATA, SIZE) ((const sv_t){.size = (SIZE), .data = (DATA)}) +#define SV_FMT(SV) (int)(SV).size, (SV).data +#define PR_SV "%.*s" + +sv_t sv_make(arena_t *allocator, const char *data, u64 size); +sv_t sv_copy(arena_t *allocator, sv_t sv); +sv_t sv_append(arena_t *allocator, sv_t sv, const char *data, u64 size); +sv_t sv_prepend(arena_t *allocator, sv_t sv, const char *data, u64 size); + +/** + * @brief Concatenate two string views, returning that concatenation. + + * Allocates memory. + */ +sv_t sv_concat(arena_t *allocator, sv_t a, sv_t b); + +/** + * @brief Allocates a string view with the given `printf` format. + */ +sv_t sv_fmt(arena_t *allocator, char *fmt, ...); + +/** + * @brief Constructs a new string view at a different offset. Does not allocate + * new memory. + */ +sv_t sv_substr(sv_t sv, u64 index, u64 size); +/** + * @brief Return a string view INDEX characters ahead (i.e. cut the string from + * the left). + */ +sv_t sv_cut(sv_t sv, u64 index); +/** + * @brief Return a string view with SIZE (i.e. chopping the string from the + * right) + */ +sv_t sv_chop(sv_t sv, u64 size); + +i64 sv_find_substr(const sv_t sv, const sv_t substr); +i64 sv_find_subcstr(const sv_t sv, const char *substr, u64 size); +i64 sv_find_any(const sv_t sv, const char *bag); +u64 sv_while(const sv_t sv, bool (*pred)(char)); +u64 sv_till(const sv_t sv, bool (*pred)(char)); + +#endif @@ -0,0 +1,94 @@ +/* Copyright (C) 2025 Aryadev Chavali + + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License Version 2 for + * details. + + * You may distribute and modify this code under the terms of the GNU General + * Public License Version 2, which you should have received a copy of along with + * this program. If not, please go to <https://www.gnu.org/licenses/>. + + * Created: 2025-04-06 + * Description: + */ + +#include <string.h> +#include <tag.h> + +lisp_t *tag_int(i64 i) +{ + return TAG((u64)i, INT); +} + +lisp_t *tag_cons(void *ptr) +{ + return TAG((u64)ptr, CONS); +} + +lisp_t *tag_sym(void *ptr) +{ + return TAG((u64)ptr, SYM); +} + +lisp_t *tag_ssym(const char *data, size_t size) +{ + assert(size <= 7); + u8 buffer[sizeof(u64)]; + memset(buffer, 0, sizeof(buffer)); + // in 8 bits we have: + // size - 3 bits (up to 7) + // tag - 5 bits + buffer[0] = size; + buffer[0] <<= SHIFT_SSYM; + buffer[0] |= TAG_SSYM; + memcpy(buffer + 1, data, size); + u64 word = 0; + memcpy(&word, buffer, sizeof(u64)); + return (lisp_t *)word; +} + +lisp_t *tag_bool(bool b) +{ + return TAG((u64)b, BOOL); +} + +lisp_t *tag_vec(void *ptr) +{ + return TAG((u64)ptr, VEC); +} + +lisp_t *tag_str(void *ptr) +{ + return TAG((u64)ptr, STR); +} + +lisp_t *tag_char(u32 codepoint) +{ + u64 w = codepoint; + return TAG(w, CHAR); +} + +enum Tag tag_get(lisp_t *ptr) +{ + static_assert(NUM_TAGS == 9); + if (!ptr) + return TAG_NIL; + else if (IS_TAG(ptr, INT)) + return TAG_INT; + else if (IS_TAG(ptr, CHAR)) + return TAG_CHAR; + else if (IS_TAG(ptr, SYM)) + return TAG_SYM; + else if (IS_TAG(ptr, SSYM)) + return TAG_SSYM; + else if (IS_TAG(ptr, BOOL)) + return TAG_BOOL; + else if (IS_TAG(ptr, VEC)) + return TAG_VEC; + else if (IS_TAG(ptr, STR)) + return TAG_STR; + else if (IS_TAG(ptr, CONS)) + return TAG_CONS; + return 0; +} @@ -0,0 +1,90 @@ +/* Copyright (C) 2025 Aryadev Chavali + + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License Version 2 for + * details. + + * You may distribute and modify this code under the terms of the GNU General + * Public License Version 2, which you should have received a copy of along with + * this program. If not, please go to <https://www.gnu.org/licenses/>. + + * Created: 2025-04-06 + * Description: + */ + +#ifndef TAG_H +#define TAG_H + +#include "./base.h" + +#include <stdbool.h> +#include <stdlib.h> + +// Opaque structure to make tagged pointers a separate type from general +// pointers +typedef struct Obj lisp_t; + +enum Tag +{ + TAG_NIL = 0b00000000, // Atomic types + TAG_INT = 0b00000001, // special so we can encode 63 bit integers + TAG_SYM = 0b00000100, + TAG_SSYM = 0b00001000, + TAG_CHAR = 0b00001100, + TAG_BOOL = 0b00010000, + // TAG_FLOAT = 0b00010100, + TAG_CONS = 0b00000010, // Container types + TAG_VEC = 0b00000110, + TAG_STR = 0b00001010, + + NUM_TAGS = 9, +}; + +enum Shift +{ + SHIFT_INT = 1, + SHIFT_SSYM = 5, + SHIFT_CHAR = 8, + SHIFT_SYM = 8, + SHIFT_BOOL = 8, + SHIFT_FLOAT = 8, + + SHIFT_CONS = 8, + SHIFT_VEC = 8, + SHIFT_STR = 8, +}; + +enum Mask +{ + MASK_INT = 0b00000001, + MASK_SSYM = 0b00011111, + MASK_SYM = 0b11111111, + MASK_CHAR = 0b11111111, + MASK_BOOL = 0b11111111, + MASK_FLOAT = 0b11111111, + + MASK_CONS = 0b11111111, + MASK_VEC = 0b11111111, + MASK_STR = 0b11111111, +}; + +#define TAG(PTR, TYPE) ((lisp_t *)(((PTR) << SHIFT_##TYPE) | TAG_##TYPE)) +#define IS_TAG(PTR, TYPE) (((u64)(PTR) & MASK_##TYPE) == TAG_##TYPE) +#define UNTAG(PTR, TYPE) (((u64)PTR) >> SHIFT_##TYPE) + +enum Tag tag_get(lisp_t *ptr); + +lisp_t *tag_int(i64 i); +lisp_t *tag_char(u32 codepoint); +lisp_t *tag_sym(void *ptr); +lisp_t *tag_ssym(const char *data, size_t size); +lisp_t *tag_bool(bool b); +lisp_t *tag_vec(void *ptr); +lisp_t *tag_str(void *ptr); +lisp_t *tag_cons(void *ptr); + +#define INT_MAX ((1L << 62) - 1) +#define INT_MIN (-(1L << 62)) + +#endif diff --git a/tasks.org b/tasks.org new file mode 100644 index 0000000..179e1a2 --- /dev/null +++ b/tasks.org @@ -0,0 +1,230 @@ +#+title: Tasks +#+date: 2025-02-18 + +* WIP Implement a reader +We want something a bit generic: able to handle reading from some +buffer of memory (a string, or contents of a file where we can read +the entire thing at once) or directly from a file stream (STDIN, +network streams, etc). + +We don't need a tokeniser - the basic grammar of a Lisp is really easy +to narrow down, so we can skip tokenisation and go straight for +parsing. + +We also want to be able to admit when reading went wrong for some +reason with proper errors messages (i.e. can be read by Emacs) - this +will need to be refactored when we introduce errors within the Lisp +runtime itself. +** TODO Implement floats and rationals +Rationals are pretty easy - just two integers (quotient and divisor) - +so a tagged cons cell would do the job. Floats are a bit more +difficult since I'd either need to box them or find a creative way of +sticking IEEE-754 floats into < 64 bits. + +Also implement a reader macro for #e<scientific form>. Also deal with +[-,+,]inf(.0) and [-,+,]nan(.0). + +Need to do some reading. + +[[file:r7rs-tests.scm::test #t (real? #e1e10)][trigger]] +** TODO Consider user instantiated reader macros +We don't have an evaluator so we can't really interpret whatever a +user wants for a reader macro currently, but it would be useful to +think about it now. Currently I have a single function which deals +with parsing reader macros but that's just static. + +Thing is, it does have the context as a parameter to pass to delegate +functions (such as ~parse_vec~) - wouldn't be a massive jump to also +consider user environments via the context. + +[[file:reader.c::perr_t parse_reader_macro(context_t *ctx, input_t +*inp, lisp_t **ret)][function link]] +* TODO Consider Lisp runtime errors +* TODO Admit arbitrarily sized integers +Currently we admit fixed size integers of 63 bits. They use 2s +complement due to x86 which means our max and min are 62 bit based. + +However, to even try to be a scheme implementation we need to allow +arbitrarily sized integers. What are the specific tasks we need to +complete in our model to achieve this?: ++ Allow "reading" of unfixed size integers + + This will require reading a sequence of base 10 digits without + relying on strtold ++ Implement unfixed size integers into our memory model + + Certainly, unfixed size integers cannot be carried around like + fixnums wherein we can embed an integer into the pointer. + Thus we have to allocate them in memory. + + NOTE: There will be definitely be an optimisation to be done + here; integers that are within the bound of a fixnum could be + left as a fixnum then "elevated" to an integer when needed + + I think the big idea is allocating them as a fixed set of bytes + like big symbols. For big integers we have to read the memory + associated thus we need a pointer. Due to 2s complement it should + be trivial to increase the size of an integer to fit a new result + i.e. if I'm adding two integers and that leads to an "overflow" + where the result is of greater width than its inputs, we should + just allocate new memory for it. + +Consequences: +- Greater memory use + - In fact exponential if we need to allocate a whole new integer per + operation rather than utilising the input memory +- Possible loss of performance due to making integers over fixnums + when they don't need to be +- Comparison is harder on integers +- Harder to cache for the CPU + +but all of this is to be expected when the user is an idiot. +* TODO Think about how to perform operations on different types +** TODO Integers +** TODO Symbols +** TODO Pairs +* DONE More efficient memory model for symbols +The primitive model for symbol allocation is an 8 byte number +representing the size of the symbol, followed by a variable number of +characters (as bytes). This is stored somewhere in the memory +context, which will likely be in the heap. + +We're actually wasting a ridiculous amount of memory with this model. +We'll almost never be using the full 64 bits of the size to represent +a symbol (who's ever going to go close to 1.8 quintillion bytes for a +single symbol?) - and even more annoying, there are tons of small +sized symbols where we actually need _more space_ for the 8 byte size +than the underlying symbol data. + +I think there's definitely a better model available, at least for +smaller symbols. We already have inlined integers where the pointer +itself is the integer, why can't we do the same for symbols? +A pointer has 8 bytes of data to use - one character is one byte - +thus we can represent 8 character symbols in one pointer. + +If we want this to still be within the remit of our pointer tagging +scheme, we'll need a bit of space for our administrative purposes +(admin slop...). So let's take _one_ byte out of that 8 for that. So +we can represent any symbols 7 bytes long in a single pointer. We'll +need to keep in mind we want to represent symbols that may be less +than 7 characters, so that one admin byte is going to be doing some +heavy lifting. + +Let's examine that one admin byte: ++ At least 3 bits are necessary for the actual pointer tag: "at least" + because we might increase the size of the tag based on demand ++ Thus, 5 bits left for our use - let's fit the size in there. + pow(2,6) - 1 = 63, so we have _way_ more than we need + +What are the benefits of doing this?: ++ Symbol equality for small symbols is a cinch: just compare the two + tagged "pointers" ++ 7 or less character symbols require no memory allocation, just + working off the stack + +One thing to note is that for more than 7 character symbols, we'll +need to allocate memory. But, in the worst case of 8 character +symbols, we're only allocating two 64 bit integers: these are easy to +walk on x86 and we've reached at least parity between the memory +required for administration (the size number) and the actual data. +** Being more aggressive? +Technically, ANSI bytes only need 7 bits. For each of the 7 bytes +used for the character data, we can take one bit off, leaving us with +7 bits to use for an additional character. We don't need to adjust +anything else in the schema. + +So, hypothetically we could represent up to 8 character symbols! This +would require packing the characters more aggressively into a single +pointer. Let's look at the layout of our pointers. This table is +indexed from most significant to least i.e. 0 is the MSB and 63 is the +LSB: + +|-------+------------| +| Index | Usage | +|-------+------------| +| 0-55 | Characters | +| 56-60 | Size | +| 61-63 | Tag | +|-------+------------| + +Honestly though, for an extra byte of information we'll probably have +to do a lot more work. x86-64 CPUs are much better at walking bytes +than they are walking 7 bit offsets. This may be something to +consider if CPU time is cheaper than allocating 8 byte symbols +somewhere. +* DONE Tagging scheme based on arena pages +2025-04-09:21:59:29: We went for option (2) of just taking a byte for +free from the memory address and using it as our management byte. +** 1) Page-offset schema +I've realised arenas are way better than the standard array dynamic I +was going for before. However, we lose the nicer semantics of using +an array index for pointers, where we can implement our own semantics +regarding what bits in that pointer are free to use, when using a +normal stable pointer into the arena; the host operating system has +its own semantics regarding how pointers are arranged and this _will_ +change between operating systems. In particular, because of the way +I've arranged pages, we can't use the classic "div by 8" methodology +where new allocations on the heap generally must be aligned by 8 bytes +(u64), so we can use those 3 bits at the bottom for our tagging; +offsets into pages are where our pointers will lie and they won't +necessarily be divisible by 8. + +So we can't use the pointers directly into the pages - we'll call +these types of pointers `host pointers`, because once we have them +it's trivial to access the underlying data. We'll call the pointers +we want to make `managed pointers` because we're managing the memory +system associated with them. We want to be able to translate from +managed pointers to host pointers. + +Managed pointers are really just encodings for direct access into the +arena memory. So in 8 bytes, we need to encode both the page and the +specific offset in that page where the pointer is pointing to. We +also want to leave space for tagging and any metadata we might want to +store in the pointer to that data. A schema I could think of was: + +|------------------+--------------------| +| Index (in bytes) | Representation | +|------------------+--------------------| +| 0 | Metadata (tagging) | +| 1-4 | Offset in page | +| 4-7 | Page choice | +|------------------+--------------------| + +This gives us pow(2, 24) - 1 = 16777215 possible pages and +pow(2, 32) - 1 = 4294967295 offsets in each page. Thus our total +addressable memory would be pow(2, 56) - 1 = 72057594037927935 bytes. + +Certainly no machine would ever have this much memory and so we're +quite safe for most machines. That reserved management byte for our +purposes (tagging, currently) will make the math to translate it a bit +easier. + +Let's reason about how we'd encode and decode these addresses. The +arena itself should provide addresses with the management byte set to +0 for the user to encode what they wish. The top bytes should be +encoded as per the above i.e. top 3 bytes as the page index, next 4 +bytes as the offset in that page. This shouldn't be super difficult +when we're doing it within the management functions of the arena +itself as this data should be handy when performing the allocation. + +When decoding these addresses i.e. retrieving data i.e. translating +from a managed pointer to a host pointer, all it will need to do is +convert the pointer into a byte buffer and copy the top 3 bytes as a +page index and the next 4 bytes as the offset in the page. Once these +are verified to be valid, we can just access the underlying pages and +get the host pointer. Because of how arenas work, those host pointers +will be stable regardless of any further memory management functions +performed on the arena (excluding cleanup) - so once you have a host +pointer, you can use it as much as you want without having to worry +about the pointer becoming invalid in the next second. +** 2) 48-bit addressing exploit +Most x86 CPUs only use around 48-56 bits for actual memory addresses - +mostly as a result of not needing _nearly_ as many addresses as a full +64 bit word would provide. + +So we /could/ get away with using one of those bytes for our +administrative tasks. Since the smallest remit we have is one byte, +we'll stick to that (but maybe we could go for two bytes - need to +investigate further). + +This byte should be the MSB, but using that for tagging will require +more work than the lowest byte (to look at it we'll need to push that +byte all the way down). So we'll be going for a low byte strategy by +shifting the pointer up by one byte. This leaves us with the lowest +byte to play with as we choose. @@ -0,0 +1,38 @@ +/* Copyright (C) 2025 Aryadev Chavali + + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License Version 2 for + * details. + + * You may distribute and modify this code under the terms of the GNU General + * Public License Version 2, which you should have received a copy of along with + * this program. If not, please go to <https://www.gnu.org/licenses/>. + + * Created: 2025-04-15 + * Description: Vector implementation + */ + +#include "./vec.h" + +#include <malloc.h> +#include <string.h> + +void vec_reserve(arena_t *allocator, vec_t *vec, u32 size) +{ + if (vec->cap - vec->size < size) + { + u32 old_cap = vec->cap; + vec->cap = MAX(vec->cap * 1.5, vec->size + size); + vec->data = arena_realloc(allocator, vec->data, old_cap, vec->cap); + } +} + +u32 vec_append(arena_t *allocator, vec_t *vec, const void *data, u32 size) +{ + vec_reserve(allocator, vec, size); + memcpy(vec->data + vec->size, data, size); + u32 ptr = vec->size; + vec->size += size; + return ptr; +} @@ -0,0 +1,30 @@ +/* Copyright (C) 2025 Aryadev Chavali + + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License Version 2 for + * details. + + * You may distribute and modify this code under the terms of the GNU General + * Public License Version 2, which you should have received a copy of along with + * this program. If not, please go to <https://www.gnu.org/licenses/>. + + * Created: 2025-04-15 + * Description: Vectors (dynamic sized arrays) + */ + +#ifndef VEC_H +#define VEC_H + +#include "./memory.h" + +typedef struct +{ + u32 size, cap; + u8 *data; +} vec_t; + +void vec_reserve(arena_t *alloactor, vec_t *vec, u32 size); +u32 vec_append(arena_t *allocator, vec_t *vec, const void *data, u32 size); + +#endif |