parser -> lexer

That's the real purpose of this module; it isn't really generating an
AST, since ARL's syntax isn't tree-like at all.

The next stage will produce something closer to an AST, in the sense
that it will introduce:
- Syntactic analysis
- Type checking
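
For orientation, here is a minimal sketch of how a caller might drive the
new lexer. The field names (contents, byte, vec) are taken from this diff,
but the exact shape of sv_t (assumed here to be a plain {data, size} pair)
and whether the structs can be zero-initialised are assumptions, since the
headers aren't part of this change:

#include <stdio.h>
#include <string.h>
#include <arl/lexer/lexer.h>
#include <arl/lexer/token.h>
#include <arl/lib/base.h> /* u64 (assumed home of the integer typedefs) */

int main(void)
{
  // Hypothetical source text: one known symbol and one string literal.
  const char *src = "putstr \"hello, world\"";

  // Assumption: sv_t is a {data, size} view and the cursor starts at byte 0.
  lex_stream_t stream = {0};
  stream.contents.data = (char *)src;
  stream.contents.size = strlen(src);

  token_stream_t tokens = {0};
  lex_err_t err = lex_stream(&tokens, &stream);
  if (err != LEX_ERR_OK)
  {
    // lex_stream_get_line_col accumulates onto its arguments, so seed them.
    u64 line = 1, col = 0;
    lex_stream_get_line_col(&stream, &line, &col);
    fprintf(stderr, "lex error %s at %lu:%lu\n", lex_err_to_string(err),
            line, col);
    return 1;
  }

  token_stream_print(stdout, &tokens);
  fprintf(stdout, "\n");
  token_stream_free(&tokens);
  return 0;
}

On success this should print a KNOWN(putstr) token followed by a
STRING(hello, world) token.
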
commit dc96e12145 (parent 42ac4f6bbb)
2026-01-29 03:43:04 +00:00
8 changed files with 259 additions and 261 deletions

src/lexer/lexer.c (new file, 184 lines)

@@ -0,0 +1,184 @@
/* lexer.c: Implementation of the lexer.
* Created: 2026-01-22
* Author: Aryadev Chavali
* License: See end of file
* Commentary: See /include/arl/lexer/lexer.h
*/
#include <assert.h> /* assert, static_assert */
#include <ctype.h>
#include <string.h>
#include <arl/lexer/lexer.h>
#include <arl/lexer/token.h>
#include <arl/lib/base.h> /* FAIL, u64 */
#include <arl/lib/sv.h>
/// Expected characters in a symbol
static const char *SYMBOL_CHARS =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&'()*+,-./"
":;<=>?@\\^_`{|}~0123456789";
const char *lex_err_to_string(lex_err_t err)
{
switch (err)
{
case LEX_ERR_OK:
return "OK";
case LEX_ERR_EXPECTED_SPEECH_MARKS:
return "EXPECTED_SPEECH_MARKS";
case LEX_ERR_UNKNOWN_CHAR:
return "UNKNOWN_CHAR";
default:
FAIL("Unexpected lex_err_t value: %d\n", err);
}
}
/// Prototypes for streams
bool stream_eos(lex_stream_t *stream);
char stream_peek(lex_stream_t *stream);
void stream_advance(lex_stream_t *stream, u64 size);
u64 stream_size(lex_stream_t *stream);
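/// Compute the (line, column) of the cursor by scanning every byte before
/// it. Note that it accumulates onto *line and *col, so callers should seed
/// them (e.g. line = 1, col = 0) before calling.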
void lex_stream_get_line_col(lex_stream_t *stream, u64 *line, u64 *col)
{
assert(stream && line && col && "Expected valid pointers.");
for (u64 i = 0; i < stream->byte; ++i)
{
char c = stream->contents.data[i];
if (c == '\n')
{
*line += 1;
*col = 0;
}
else
{
*col += 1;
}
}
}
/// Prototypes for lexing subroutines
lex_err_t lex_string(lex_stream_t *stream, token_t *ret);
lex_err_t lex_symbol(lex_stream_t *stream, token_t *ret);
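/// Lex the whole stream into `out`: skip whitespace, lex string literals
/// and symbols (which may not start with a digit), and stop with
/// LEX_ERR_UNKNOWN_CHAR at the first character that fits none of these.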
lex_err_t lex_stream(token_stream_t *out, lex_stream_t *stream)
{
assert(out && stream && "Expected valid pointers");
while (!stream_eos(stream))
{
char cur = stream_peek(stream);
if (isspace(cur))
{
while (isspace(cur) && !stream_eos(stream))
{
stream_advance(stream, 1);
cur = stream_peek(stream);
}
}
else if (cur == '"')
{
// fresh token for lex_string to fill in; copied into the vector below
token_t ret = {0};
lex_err_t perr = lex_string(stream, &ret);
if (perr)
return perr;
vec_append(&out->vec, &ret, sizeof(ret));
}
else if (strchr(SYMBOL_CHARS, cur) && !isdigit(cur))
{
// fresh token for lex_symbol to fill in; copied into the vector below
token_t ret = {0};
lex_err_t perr = lex_symbol(stream, &ret);
if (perr)
return perr;
vec_append(&out->vec, &ret, sizeof(ret));
}
else
{
return LEX_ERR_UNKNOWN_CHAR;
}
}
return LEX_ERR_OK;
}
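/// Lex a string literal. The cursor sits on the opening speech mark; the
/// literal runs up to (but not including) the closing one. If the stream
/// ends before a closing speech mark is found, returns
/// LEX_ERR_EXPECTED_SPEECH_MARKS.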
lex_err_t lex_string(lex_stream_t *stream, token_t *ret)
{
// Advance the cursor just past the opening speech mark
stream_advance(stream, 1);
sv_t string = sv_chop_left(stream->contents, stream->byte);
string.size = sv_till(string, "\"");
// If the scan ran to the end of the stream, there was no closing
// speech mark.
if (string.size + stream->byte == stream_size(stream))
return LEX_ERR_EXPECTED_SPEECH_MARKS;
// `string` is well defined, package and throw it back.
*ret = token_string(stream->byte - 1, string);
stream_advance(stream, string.size + 1);
return LEX_ERR_OK;
}
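/// Lex a symbol: take the longest run of SYMBOL_CHARS at the cursor, then
/// check it against the table of known symbols. Matches become KNOWN
/// tokens; anything else becomes a user-defined SYMBOL token.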
lex_err_t lex_symbol(lex_stream_t *stream, token_t *ret)
{
sv_t symbol = sv_chop_left(stream->contents, stream->byte);
symbol.size = sv_while(symbol, SYMBOL_CHARS);
// see if symbol is one of the already known symbols
static_assert(NUM_TOKEN_KNOWNS == 1, "Expected number of TOKEN_KNOWNs");
for (token_known_t i = 0; i < NUM_TOKEN_KNOWNS; ++i)
{
const char *possible_known = token_known_to_cstr(i);
if (strlen(possible_known) == symbol.size &&
strncmp(possible_known, symbol.data, symbol.size) == 0)
{
// Found a matching known symbol
*ret = token_known(stream->byte, i);
goto end;
}
}
// otherwise, it must be a fresh symbol i.e. user defined
*ret = token_symbol(stream->byte, symbol);
end:
stream_advance(stream, symbol.size);
return LEX_ERR_OK;
}
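/// Stream helpers: cursor-based access over the underlying string view.
/// Peeking past the end yields '\0' and advancing clamps to the end.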
bool stream_eos(lex_stream_t *stream)
{
return stream->byte >= stream->contents.size;
}
char stream_peek(lex_stream_t *stream)
{
if (stream_eos(stream))
return '\0';
else
return stream->contents.data[stream->byte];
}
void stream_advance(lex_stream_t *stream, u64 size)
{
if (stream->byte + size >= stream->contents.size)
stream->byte = stream->contents.size;
else
stream->byte += size;
}
u64 stream_size(lex_stream_t *stream)
{
return stream->contents.size;
}
/* Copyright (C) 2026 Aryadev Chavali
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the MIT License for details.
* You may distribute and modify this code under the terms of the MIT License,
* which you should have received a copy of along with this program. If not,
* please go to <https://opensource.org/license/MIT>.
*/

src/lexer/token.c (new file, 115 lines)

@@ -0,0 +1,115 @@
/* token.c: Implementation of TOKEN constructor/destructor functions
* Created: 2026-01-22
* Author: Aryadev Chavali
* License: See end of file
* Commentary: See /include/arl/lexer/token.h.
*/
#include <stdio.h> /* FILE, fprintf */
#include <arl/lexer/token.h>
#include <arl/lib/base.h>
#include <arl/lib/vec.h>
const char *token_known_to_cstr(token_known_t known)
{
switch (known)
{
case TOKEN_KNOWN_PUTSTR:
return "putstr";
default:
FAIL("Unexpected TOKEN_KNOWN value: %d\n", known);
}
}
token_t token_known(u64 byte, token_known_t known)
{
return (token_t){
.byte_location = byte,
.type = TOKEN_TYPE_KNOWN,
.as_known = known,
};
}
token_t token_string(u64 byte, sv_t string)
{
return (token_t){
.byte_location = byte,
.type = TOKEN_TYPE_STRING,
.as_string = string,
};
}
token_t token_symbol(u64 byte, sv_t symbol)
{
return (token_t){
.byte_location = byte,
.type = TOKEN_TYPE_SYMBOL,
.as_symbol = symbol,
};
}
void token_print(FILE *fp, token_t *token)
{
if (!token)
{
fprintf(fp, "NIL");
return;
}
switch (token->type)
{
case TOKEN_TYPE_KNOWN:
fprintf(fp, "KNOWN(%s)", token_known_to_cstr(token->as_known));
break;
case TOKEN_TYPE_SYMBOL:
fprintf(fp, "SYMBOL(" PR_SV ")", SV_FMT(token->as_symbol));
break;
case TOKEN_TYPE_STRING:
fprintf(fp, "STRING(" PR_SV ")", SV_FMT(token->as_string));
break;
case NUM_TOKEN_TYPES:
default:
FAIL("Unexpected token type: %d\n", token->type);
}
}
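/// Print every token in the stream, one per line, between braces; NULL and
/// empty streams print as {}.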
void token_stream_print(FILE *fp, token_stream_t *token)
{
if (!token)
{
fprintf(fp, "{}");
return;
}
fprintf(fp, "{");
if (token->vec.size == 0)
{
fprintf(fp, "}\n");
return;
}
fprintf(fp, "\n");
for (u64 i = 0; i < token->vec.size / sizeof(token_t); ++i)
{
token_t item = VEC_GET(&token->vec, i, token_t);
fprintf(fp, "\t[%lu]: ", i);
token_print(fp, &item);
fprintf(fp, "\n");
}
fprintf(fp, "}");
}
void token_stream_free(token_stream_t *stream)
{
// we can free the vector itself and we're done
vec_free(&stream->vec);
}
/* Copyright (C) 2026 Aryadev Chavali
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the MIT License for details.
* You may distribute and modify this code under the terms of the MIT License,
* which you should have received a copy of along with this program. If not,
* please go to <https://opensource.org/license/MIT>.
*/