1 files changed, 426 insertions, 0 deletions
diff --git a/lisp/reader.c b/lisp/reader.c
new file mode 100644
index 0000000..0c8a914
--- /dev/null
+++ b/lisp/reader.c
@@ -0,0 +1,426 @@
+/* Copyright (C) 2025 Aryadev Chavali
+
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE.  See the GNU General Public License Version 2 for
+ * details.
+
+ * You may distribute and modify this code under the terms of the GNU General
+ * Public License Version 2, which you should have received a copy of along with
+ * this program.  If not, please go to <https://www.gnu.org/licenses/>.
+
+ * Created: 2025-04-16
+ * Description: Implementation of parser
+ */
+
+#include <lisp/reader.h>
+
+#include <ctype.h>
+#include <string.h>
+
+bool is_digit(char c)
+{
+  return isdigit(c);
+}
+
+bool is_alpha(char c)
+{
+  return isalpha(c);
+}
+
+bool is_space(char c)
+{
+  return isspace(c);
+}
+
+bool is_skip(char c)
+{
+  return is_space(c) || c == ';';
+}
+
+bool is_sym(char c)
+{
+  return strchr(SYM_CHARS, c) != NULL;
+}
+
+void input_from_sv(context_t *ctx, input_t *inp, const char *name, sv_t sv)
+{
+  inp->name = name;
+  inp->str  = sv_copy(&ctx->read, sv);
+}
+
+void input_from_fp(context_t *ctx, input_t *input, const char *name, FILE *fp)
+{
+  input->name = name;
+  // TODO: Choose a best fit (i.e. maximal capacity, unused) page
+  page_t *page = page_create(INPUT_CHUNK_SIZE);
+  // chunk should be in scratch space so we can reset it later.
+  char *chunk = context_salloc(ctx, INPUT_CHUNK_SIZE);
+
+  u64 total_size = 0, size_read = 0;
+  while (!feof(fp))
+  {
+    size_read = fread(chunk, 1, INPUT_CHUNK_SIZE, fp);
+    if (size_read > 0)
+    {
+      page_rappend(&page, chunk, size_read);
+      total_size += size_read;
+    }
+  }
+
+  input->str = SV((char *)page->data, total_size);
+
+  // Memory cleanup
+  context_reset_scratch(ctx);
+  arena_attach(&ctx->read, page);
+}
+
+bool input_eof(input_t *input)
+{
+  return !input || (input->offset >= input->str.size) ||
+         (input->str.data[input->offset] == '\0');
+}
+
+char input_peek(input_t *input, u64 offset)
+{
+  if (input_eof(input) || input->offset + offset >= input->str.size)
+    return '\0';
+  return input->str.data[input->offset + offset];
+}
+
+char input_next(input_t *input, u64 offset)
+{
+  if (input_eof(input) || input->offset + offset >= input->str.size)
+    return '\0';
+  input->offset += offset;
+  return input->str.data[input->offset];
+}
+
+void input_skip(input_t *inp)
+{
+  while (!input_eof(inp))
+  {
+    sv_t current = sv_cut(inp->str, inp->offset);
+    // Skip any whitespace
+    inp->offset += sv_while(current, is_space);
+    current = sv_cut(inp->str, inp->offset);
+    // Is there a comment to skip?
+    if (current.size && current.data[0] == ';')
+    {
+      // Skip till newline
+      i64 newline = sv_find_subcstr(current, "\n", 1);
+      if (newline < 0)
+        inp->offset = inp->str.size;
+      else
+        inp->offset += newline + 1;
+      // Then skip any whitespace
+    }
+    // Multiline comment to skip?
+    else if (current.size > 2 && strncmp(current.data, "#|", 2) == 0)
+    {
+      i64 offset = sv_find_subcstr(current, "|#", 2);
+      if (offset < 0)
+        inp->offset = inp->str.size;
+      else
+        inp->offset += offset + 2;
+      // Then skip any whitespace
+    }
+    // Nothing to skip, stop.
+    else
+      break;
+  }
+  return;
+}
+
+perr_t parse_int(context_t *ctx, input_t *inp, lisp_t **ret)
+{
+  debug("parse_int[%lu] => ", inp->offset);
+
+  // TODO: Parse arbitrary sized integers
+  (void)ctx;
+
+  bool negative = (input_peek(inp, 0) == '-');
+  sv_t current  = sv_cut(inp->str, inp->offset + (negative ? 1 : 0));
+  sv_t digits   = sv_chop(current, sv_while(current, is_digit));
+
+  debug("`" PR_SV "` => ", SV_FMT(digits));
+  i64 x = (negative ? -1L : 1L) * strtol(digits.data, NULL, 10);
+  debug("%ld\n", x);
+
+  input_next(inp, digits.size + (negative ? 1 : 0));
+
+  *ret = make_int(x);
+  return PERR_OK;
+}
+
+perr_t parse_sym(context_t *ctx, input_t *inp, lisp_t **ret)
+{
+  debug("parse_sym[%lu] => ", inp->offset);
+
+  sv_t current = sv_cut(inp->str, inp->offset);
+  sv_t sym     = sv_chop(current, sv_while(current, is_sym));
+  debug("`" PR_SV "`\n", SV_FMT(sym));
+
+  if (sym.size == 3)
+  {
+    // NOTE: We can't mutate sym directly because it's on `read` space.
+
+    // TODO: Make this beautiful please.
+    char buf[3];
+    for (u64 i = 0; i < 3; ++i)
+      buf[i] = toupper(sym.data[i]);
+
+    // NOTE: NIL symbol to actual NIL
+    if (strncmp(buf, "NIL", 3) == 0)
+    {
+      input_next(inp, 3);
+      return NIL;
+    }
+  }
+
+  lisp_t *lsym = make_sym(ctx, sym.data, sym.size);
+  input_next(inp, sym.size);
+  *ret = lsym;
+
+  return PERR_OK;
+}
+
+perr_t parse_bool(context_t *ctx, input_t *inp, lisp_t **ret)
+{
+  (void)ctx;
+  debug("parse_bool[%lu] => ", inp->offset);
+  char c = input_peek(inp, 1);
+  bool b = -1;
+  if (c == 't')
+    b = true;
+  else if (c == 'f')
+    b = false;
+  else
+    return PERR_EXPECTED_BOOLEAN;
+  *ret = tag_bool(b);
+  input_next(inp, 2);
+  return PERR_OK;
+}
+
+perr_t parse_cons(context_t *ctx, input_t *inp, lisp_t **ret)
+{
+  // TODO: Put this in a symbol table
+  lisp_t *lisp_dot = make_sym(ctx, ".", 1);
+  debug("parse_cons[%lu] => (\n", inp->offset);
+  inp->offset += 1;
+
+  lisp_t *root = NIL;
+  lisp_t **cur = NIL;
+  bool dotted  = false;
+
+  while (!input_eof(inp) && input_peek(inp, 0) != ')')
+  {
+    lisp_t *lisp = NIL;
+    perr_t res   = parse(ctx, inp, &lisp);
+    if (res)
+      return res;
+
+    // This is cheap to do
+    if (lisp == lisp_dot)
+    {
+      dotted = true;
+      continue;
+    }
+
+    if (!root)
+    {
+      root = make_cons(ctx, lisp, NIL);
+      cur  = &root;
+    }
+    else if (!dotted)
+      *cur = make_cons(ctx, lisp, NIL);
+    else
+      *cur = lisp;
+
+    if (cur && !dotted)
+      cur = &as_cons(*cur)->cdr;
+
+    input_skip(inp);
+  }
+
+  if (input_peek(inp, 0) != ')')
+    return PERR_EXPECTED_CLOSE_BRACKET;
+
+  input_next(inp, 1);
+
+  debug(")\n");
+  *ret = root;
+  return PERR_OK;
+}
+
+perr_t parse_vec(context_t *ctx, input_t *inp, lisp_t **ret)
+{
+  debug("parse_vec[%lu] => [\n", inp->offset);
+  input_next(inp, 2);
+
+  lisp_t *lvec = make_vec(ctx, 0);
+  vec_t *vec   = as_vec(lvec);
+
+  while (!input_eof(inp) && input_peek(inp, 0) != ')')
+  {
+    lisp_t *lisp = NIL;
+    perr_t res   = parse(ctx, inp, &lisp);
+    if (res)
+      return res;
+
+    vec_append(&ctx->memory, vec, &lisp, sizeof(lisp));
+    input_skip(inp);
+  }
+
+  if (input_peek(inp, 0) != ')')
+    return PERR_EXPECTED_CLOSE_BRACKET;
+
+  input_next(inp, 1);
+
+  debug("]\n");
+  *ret = lvec;
+  return PERR_OK;
+}
+
+perr_t parse_str(context_t *ctx, input_t *inp, lisp_t **ret)
+{
+  debug("parse_str[%lu] => ", inp->offset);
+  input_next(inp, 1); // 1 for the first speechmark
+  sv_t sv  = sv_cut(inp->str, inp->offset);
+  i64 size = sv_find_subcstr(sv, "\"", 1);
+  if (size < 0)
+    return PERR_EXPECTED_SPEECH_MARK;
+
+  input_next(inp, size + 1); // 1 for that last speechmark
+  sv_t str_content = sv_chop(sv, size);
+  debug("\"" PR_SV "\"\n", SV_FMT(str_content));
+  *ret = make_str(ctx, str_content.data, str_content.size);
+  return PERR_OK;
+}
+
+perr_t parse_quote(context_t *ctx, input_t *inp, lisp_t **ret)
+{
+  char c = input_peek(inp, 0);
+  if (!(c == '\'' || c == '`'))
+    return PERR_UNEXPECTED_CHAR;
+  input_next(inp, 1);
+  sv_t prefix = {0};
+  if (c == '\'')
+    prefix = SV("quote", 5);
+  else if (c == '`')
+    prefix = SV("quasiquote", 10);
+  lisp_t *root = make_cons(ctx, make_sym(ctx, prefix.data, prefix.size), NIL);
+  lisp_t *rest = NIL;
+  perr_t perr  = parse(ctx, inp, &rest);
+  if (perr)
+    return perr;
+  CDR(root) = make_cons(ctx, rest, NIL);
+  *ret      = root;
+  return PERR_OK;
+}
+
+// TODO: Make this interactable with user once we have evaluation
+perr_t parse_reader_macro(context_t *ctx, input_t *inp, lisp_t **ret)
+{
+  char c = input_peek(inp, 1);
+  if (c == '\\')
+  {
+    // character or weird base integer
+    TODO("Not implemented reader macro for characters or weird bases");
+  }
+  else if (c == '(')
+    return parse_vec(ctx, inp, ret);
+  else if (c == 't' || c == 'f')
+    return parse_bool(ctx, inp, ret);
+  else if (c == 'e')
+  {
+    // Scientific notation for floats
+  }
+  return PERR_UNEXPECTED_READER_MACRO_SYMBOL;
+}
+
+static_assert(NUM_TAGS == 9);
+perr_t parse(context_t *ctx, input_t *inp, lisp_t **ret)
+{
+  debug("parse => ");
+  input_skip(inp);
+  if (input_eof(inp))
+    return PERR_EOF;
+
+  char c = input_peek(inp, 0);
+
+  if (is_digit(c) || (c == '-' && is_digit(input_peek(inp, 1))))
+    return parse_int(ctx, inp, ret);
+  else if (c == '#')
+    return parse_reader_macro(ctx, inp, ret);
+  else if (is_sym(c))
+    return parse_sym(ctx, inp, ret);
+  else if (c == '(')
+    return parse_cons(ctx, inp, ret);
+  else if (c == '\'' || c == '`')
+    return parse_quote(ctx, inp, ret);
+  else if (c == '\"')
+    return parse_str(ctx, inp, ret);
+  else
+    return PERR_UNEXPECTED_CHAR;
+}
+
+perr_t parse_all(context_t *ctx, input_t *inp, vec_t *vec)
+{
+  while (!input_eof(inp))
+  {
+    lisp_t *member = NIL;
+    perr_t err     = parse(ctx, inp, &member);
+
+    if (err)
+      return err;
+    else
+      vec_append(&ctx->scratch, vec, &member, sizeof(member));
+
+    input_skip(inp);
+  }
+  return PERR_OK;
+}
+
+int print_perror(FILE *fp, input_t *inp, perr_t error)
+{
+  pos_t pos = input_offset_to_pos(inp);
+  fprintf(fp, "%s:%lu:%lu: %s", inp->name, pos.line, pos.col,
+          perr_to_cstr(error));
+  switch (error)
+  {
+  case PERR_UNEXPECTED_CHAR:
+    fprintf(fp, "(`%c`)", input_peek(inp, 0));
+    break;
+  case PERR_OK:
+  case PERR_EOF:
+  case PERR_EXPECTED_BOOLEAN:
+  case PERR_UNEXPECTED_READER_MACRO_SYMBOL:
+  case PERR_EXPECTED_CLOSE_BRACKET:
+  case PERR_EXPECTED_SPEECH_MARK:
+  default:
+    break;
+  }
+  fprintf(stderr, "\n");
+
+  return error;
+}
+
+pos_t input_offset_to_pos(input_t *inp)
+{
+  pos_t pos = {.col = 1, .line = 1};
+  for (u64 i = 0; i < inp->offset && i < inp->str.size; ++i)
+  {
+    char c = (inp->str.data[i]);
+    if (c == '\n')
+    {
+      ++pos.line;
+      pos.col = 1;
+    }
+    else
+    {
+      ++pos.col;
+    }
+  }
+  return pos;
+}