parser -> lexer

That's the real purpose of this module: it isn't really generating an AST, since ARL's syntax isn't tree-like at all. The next stage will be something closer to an AST, in the sense that it will introduce: - Syntactic analysis - Type checking
2026-01-29 03:43:04 +00:00
parent 42ac4f6bbb
commit dc96e12145
8 changed files with 259 additions and 261 deletions
--- a/4
+++ b/4
@@ -3,8 +3,8 @@ CC=cc
 DIST=build
 OUT=$(DIST)/arl.out
-MODULES=. lib parser
+MODULES=. lib lexer
-UNITS=main lib/vec lib/sv parser/ast parser/parser
+UNITS=main lib/vec lib/sv lexer/token lexer/lexer
 OBJECTS:=$(patsubst %,$(DIST)/%.o, $(UNITS))
 LDFLAGS=
--- a/include/arl/parser/parser.h
+++ b/include/arl/parser/parser.h
@@ -1,38 +1,38 @@
-/* parser.h: Parser which takes character buffers and yields an AST
+/* lexer.h: Lexer which takes character buffers and yields a sequence of tokens.
 * Created: 2026-01-22
 * Author: Aryadev Chavali
 * License: See end of file
 * Commentary:
 */
-#ifndef PARSER_H
+#ifndef LEXER_H
-#define PARSER_H
+#define LEXER_H
-#include <arl/parser/ast.h>
+#include <arl/lexer/token.h>
-/// Parser streams, utilised when generating an AST.
+/// Lexer streams: a cursor (BYTE) over the character buffer (CONTENTS) being lexed.
 typedef struct
 {
  u64 byte;
  sv_t contents;
-} parse_stream_t;
+} lex_stream_t;
-/// Types of errors that may occur during parsing
+/// Types of errors that may occur during lexing
 typedef enum
 {
-  PARSE_ERR_OK = 0,
+  LEX_ERR_OK = 0,
-  PARSE_ERR_EXPECTED_SPEECH_MARKS,
+  LEX_ERR_EXPECTED_SPEECH_MARKS,
-  PARSE_ERR_UNKNOWN_CHAR,
+  LEX_ERR_UNKNOWN_CHAR,
-} parse_err_t;
+} lex_err_t;
-const char *parse_err_to_string(parse_err_t err);
+const char *lex_err_to_string(lex_err_t err);
-// Generates an AST from STREAM, storing it in OUT.  Returns any errors it may
+// Generates a token stream from a lex_stream_t, storing it in OUT.  Returns any
-// generate.
+// errors it may generate.
-parse_err_t parse(ast_t *out, parse_stream_t *stream);
+lex_err_t lex_stream(token_stream_t *out, lex_stream_t *stream);
 // Computes the line and column that STREAM is currently pointing at in its
 // buffer, storing it in LINE and COL.
-void parse_stream_get_line_col(parse_stream_t *stream, u64 *line, u64 *col);
+void lex_stream_get_line_col(lex_stream_t *stream, u64 *line, u64 *col);
 #endif
--- a/include/arl/lexer/token.h
+++ b/include/arl/lexer/token.h
@@ -0,0 +1,73 @@
 /* token.h: General definition of tokens, and a sequence of them.
 * Created: 2026-01-22
 * Author: Aryadev Chavali
 * License: See end of file
 * Commentary:
 */
 #ifndef TOKEN_H
 #define TOKEN_H
 #include <arl/lib/base.h>
 #include <arl/lib/sv.h>
 #include <arl/lib/vec.h>
 /// Types of tokens; this is the tag stored in token_t's `type` field.
 typedef enum
 {
  // payload lives in `as_known`
  TOKEN_TYPE_KNOWN = 0,
  // payload lives in `as_symbol`
  TOKEN_TYPE_SYMBOL,
  // payload lives in `as_string`
  TOKEN_TYPE_STRING,
  // number of enumerators above; not a token type in its own right
  NUM_TOKEN_TYPES,
 } token_type_t;
 /// Known symbols which later stages would benefit from.
 typedef enum
 {
  // spelled "putstr" by token_known_to_cstr
  TOKEN_KNOWN_PUTSTR,
  // number of enumerators above; not a known symbol in its own right
  NUM_TOKEN_KNOWNS,
 } token_known_t;
 const char *token_known_to_cstr(token_known_t);
 /// Tokens are a tagged union: `type` selects which member of the anonymous
 /// union is meaningful.
 typedef struct
 {
  // byte offset associated with the token, as passed to the token_*
  // constructors
  u64 byte_location;
  // tag selecting the active union member below
  token_type_t type;
  union
  {
    // active when type == TOKEN_TYPE_KNOWN
    token_known_t as_known;
    // active when type == TOKEN_TYPE_SYMBOL
    sv_t as_symbol;
    // active when type == TOKEN_TYPE_STRING
    sv_t as_string;
  };
 } token_t;
 token_t token_known(u64 byte, token_known_t known);
 token_t token_symbol(u64 byte, sv_t symbol);
 token_t token_string(u64 byte, sv_t string);
 void token_print(FILE *fp, token_t *token);
 /// Sequence of tokens
 typedef struct
 {
  // backing storage; read elsewhere as an array of token_t
  // NOTE(review): vec.size looks like a byte count (it is divided by
  // sizeof(token_t) to get the token count) -- confirm against lib/vec.h
  vec_t vec;
 } token_stream_t;
 void token_stream_free(token_stream_t *token);
 void token_stream_print(FILE *fp, token_stream_t *token);
 #endif
 /* Copyright (C) 2026 Aryadev Chavali
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE.  See the MIT License for details.
 * You may distribute and modify this code under the terms of the MIT License,
 * which you should have received a copy of along with this program.  If not,
 * please go to <https://opensource.org/license/MIT>.
 */
--- a/include/arl/parser/ast.h
+++ b/include/arl/parser/ast.h
@@ -1,74 +0,0 @@
 /* ast.h: General definition of the AST and nodes within it.
 * Created: 2026-01-22
 * Author: Aryadev Chavali
 * License: See end of file
 * Commentary:
 */
 #ifndef AST_H
 #define AST_H
 #include <arl/lib/base.h>
 #include <arl/lib/sv.h>
 #include <arl/lib/vec.h>
 /// Types the AST can encode; this is the tag stored in ast_node_t's `type`
 /// field.
 typedef enum
 {
  // payload lives in `as_known`
  AST_NODE_TYPE_KNOWN = 0,
  // payload lives in `as_symbol`
  AST_NODE_TYPE_SYMBOL,
  // payload lives in `as_string`
  AST_NODE_TYPE_STRING,
  // number of enumerators above; not a node type in its own right
  NUM_AST_NODE_TYPES,
 } ast_node_type_t;
 /// Known symbols - may reference callables or values.
 typedef enum
 {
  // spelled "putstr" by ast_known_to_cstr
  AST_KNOWN_PUTSTR,
  // number of enumerators above; not a known symbol in its own right
  NUM_AST_KNOWNS,
 } ast_known_t;
 const char *ast_known_to_cstr(ast_known_t);
 /// Node of the AST as a tagged union: `type` selects which member of the
 /// anonymous union is meaningful.
 typedef struct
 {
  // byte offset associated with the node, as passed to the ast_node_*
  // constructors
  u64 byte_location;
  // tag selecting the active union member below
  ast_node_type_t type;
  union
  {
    // active when type == AST_NODE_TYPE_KNOWN
    ast_known_t as_known;
    // active when type == AST_NODE_TYPE_SYMBOL
    sv_t as_symbol;
    // active when type == AST_NODE_TYPE_STRING
    sv_t as_string;
  };
 } ast_node_t;
 ast_node_t ast_node_known(u64 byte, ast_known_t known);
 ast_node_t ast_node_symbol(u64 byte, sv_t symbol);
 ast_node_t ast_node_string(u64 byte, sv_t string);
 void ast_node_print(FILE *fp, ast_node_t *node);
 /// The AST as a flat collection of nodes
 typedef struct
 {
  // backing storage; read elsewhere as an array of ast_node_t
  // NOTE(review): nodes.size looks like a byte count (it is divided by
  // sizeof(ast_node_t) to get the node count) -- confirm against lib/vec.h
  vec_t nodes;
 } ast_t;
 void ast_free(ast_t *ast);
 void ast_print(FILE *fp, ast_t *ast);
 #endif
 /* Copyright (C) 2026 Aryadev Chavali
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE.  See the MIT License for details.
 * You may distribute and modify this code under the terms of the MIT License,
 * which you should have received a copy of along with this program.  If not,
 * please go to <https://opensource.org/license/MIT>.
 */
--- a/src/parser/parser.c
+++ b/src/parser/parser.c
@@ -1,44 +1,44 @@
-/* parser.c: Implementation of parser.
+/* lexer.c: Implementation of the lexer.
 * Created: 2026-01-22
 * Author: Aryadev Chavali
 * License: See end of file
- * Commentary: See /include/arl/parser/parser.h
+ * Commentary: See /include/arl/lexer/lexer.h
 */
 #include <ctype.h>
 #include <string.h>
 #include <arl/lexer/lexer.h>
 #include <arl/lexer/token.h>
 #include <arl/lib/sv.h>
 #include <arl/parser/ast.h>
 #include <arl/parser/parser.h>
 /// Expected characters in a symbol
 static const char *SYMBOL_CHARS =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&'()*+,-./"
    ":;<=>?@\\^_`{|}~0123456789";
-const char *parse_err_to_string(parse_err_t err)
+const char *lex_err_to_string(lex_err_t err)
 {
  switch (err)
  {
-  case PARSE_ERR_OK:
+  case LEX_ERR_OK:
    return "OK";
-  case PARSE_ERR_EXPECTED_SPEECH_MARKS:
+  case LEX_ERR_EXPECTED_SPEECH_MARKS:
    return "EXPECTED_SPEECH_MARKS";
-  case PARSE_ERR_UNKNOWN_CHAR:
+  case LEX_ERR_UNKNOWN_CHAR:
    return "UNKNOWN_CHAR";
  default:
-    FAIL("Unexpected parse_err_t value: %d\n", err);
+    FAIL("Unexpected lex_err_t value: %d\n", err);
  }
 }
 /// Prototypes for streams
-bool stream_eos(parse_stream_t *stream);
+bool stream_eos(lex_stream_t *stream);
-char stream_peek(parse_stream_t *stream);
+char stream_peek(lex_stream_t *stream);
-void stream_advance(parse_stream_t *stream, u64 size);
+void stream_advance(lex_stream_t *stream, u64 size);
-u64 stream_size(parse_stream_t *stream);
+u64 stream_size(lex_stream_t *stream);
-void parse_stream_get_line_col(parse_stream_t *stream, u64 *line, u64 *col)
+void lex_stream_get_line_col(lex_stream_t *stream, u64 *line, u64 *col)
 {
  assert(stream && line && col && "Expected valid pointers.");
  for (u64 i = 0; i < stream->byte; ++i)
@@ -56,11 +56,11 @@ void parse_stream_get_line_col(parse_stream_t *stream, u64 *line, u64 *col)
  }
 }
-/// Prototypes for parsing subroutines
+/// Prototypes for lexing subroutines
-parse_err_t parse_string(parse_stream_t *stream, ast_node_t *ret);
+lex_err_t lex_string(lex_stream_t *stream, token_t *ret);
-parse_err_t parse_symbol(parse_stream_t *stream, ast_node_t *ret);
+lex_err_t lex_symbol(lex_stream_t *stream, token_t *ret);
-parse_err_t parse(ast_t *out, parse_stream_t *stream)
+lex_err_t lex_stream(token_stream_t *out, lex_stream_t *stream)
 {
  assert(out && stream && "Expected valid pointers");
  while (!stream_eos(stream))
@@ -76,32 +76,32 @@ parse_err_t parse(ast_t *out, parse_stream_t *stream)
    }
    else if (cur == '"')
    {
-      // we make a copy for parse_string to mess with
+      // lex_string advances the stream itself and fills in RET on success
-      ast_node_t ret   = {0};
+      token_t ret    = {0};
-      parse_err_t perr = parse_string(stream, &ret);
+      lex_err_t perr = lex_string(stream, &ret);
      if (perr)
        return perr;
-      vec_append(&out->nodes, &ret, sizeof(ret));
+      vec_append(&out->vec, &ret, sizeof(ret));
    }
    else if (strchr(SYMBOL_CHARS, cur) && !isdigit(cur))
    {
-      // we make a copy for parse_symbol to mess with
+      // lex_symbol advances the stream itself and fills in RET
-      ast_node_t ret   = {0};
+      token_t ret    = {0};
-      parse_err_t perr = parse_symbol(stream, &ret);
+      lex_err_t perr = lex_symbol(stream, &ret);
      if (perr)
        return perr;
-      vec_append(&out->nodes, &ret, sizeof(ret));
+      vec_append(&out->vec, &ret, sizeof(ret));
    }
    else
    {
-      return PARSE_ERR_UNKNOWN_CHAR;
+      return LEX_ERR_UNKNOWN_CHAR;
    }
  }
-  return PARSE_ERR_OK;
+  return LEX_ERR_OK;
 }
-parse_err_t parse_string(parse_stream_t *stream, ast_node_t *ret)
+lex_err_t lex_string(lex_stream_t *stream, token_t *ret)
 {
  // Increment the cursor just past the first speechmark
  stream_advance(stream, 1);
@@ -111,46 +111,46 @@ parse_err_t parse_string(parse_stream_t *stream, ast_node_t *ret)
  // If we're at the edge of the stream, there must not have been any
  // speechmarks.
  if (string.size + stream->byte == stream_size(stream))
-    return PARSE_ERR_EXPECTED_SPEECH_MARKS;
+    return LEX_ERR_EXPECTED_SPEECH_MARKS;
  // `string` is well defined, package and throw it back.
-  *ret = ast_node_string(stream->byte - 1, string);
+  *ret = token_string(stream->byte - 1, string);
  stream_advance(stream, string.size + 1);
-  return PARSE_ERR_OK;
+  return LEX_ERR_OK;
 }
-parse_err_t parse_symbol(parse_stream_t *stream, ast_node_t *ret)
+lex_err_t lex_symbol(lex_stream_t *stream, token_t *ret)
 {
  sv_t symbol = sv_chop_left(stream->contents, stream->byte);
  symbol.size = sv_while(symbol, SYMBOL_CHARS);
  // see if symbol is one of the already known symbols
-  static_assert(NUM_AST_KNOWNS == 1, "Expected number of AST_KNOWNs");
+  static_assert(NUM_TOKEN_KNOWNS == 1, "Expected number of TOKEN_KNOWNs");
-  for (ast_known_t i = 0; i < NUM_AST_KNOWNS; ++i)
+  for (token_known_t i = 0; i < NUM_TOKEN_KNOWNS; ++i)
  {
-    const char *possible_known = ast_known_to_cstr(i);
+    const char *possible_known = token_known_to_cstr(i);
    if (strlen(possible_known) == symbol.size &&
        strncmp(possible_known, symbol.data, symbol.size) == 0)
    {
      // Found a matching known symbol
-      *ret = ast_node_known(stream->byte, i);
+      *ret = token_known(stream->byte, i);
      goto end;
    }
  }
  // otherwise, it must be a fresh symbol i.e. user defined
-  *ret = ast_node_symbol(stream->byte, symbol);
+  *ret = token_symbol(stream->byte, symbol);
 end:
  stream_advance(stream, symbol.size);
-  return PARSE_ERR_OK;
+  return LEX_ERR_OK;
 }
-bool stream_eos(parse_stream_t *stream)
+bool stream_eos(lex_stream_t *stream)
 {
  return stream->byte >= stream->contents.size;
 }
-char stream_peek(parse_stream_t *stream)
+char stream_peek(lex_stream_t *stream)
 {
  if (stream_eos(stream))
    return '\0';
@@ -158,7 +158,7 @@ char stream_peek(parse_stream_t *stream)
    return stream->contents.data[stream->byte];
 }
-void stream_advance(parse_stream_t *stream, u64 size)
+void stream_advance(lex_stream_t *stream, u64 size)
 {
  if (stream->byte + size >= stream->contents.size)
    stream->byte = stream->contents.size;
@@ -166,7 +166,7 @@ void stream_advance(parse_stream_t *stream, u64 size)
    stream->byte += size;
 }
-u64 stream_size(parse_stream_t *stream)
+u64 stream_size(lex_stream_t *stream)
 {
  return stream->contents.size;
 }
--- a/src/lexer/token.c
+++ b/src/lexer/token.c
@@ -0,0 +1,115 @@
 /* token.c: Implementation of TOKEN constructor/destructor functions
 * Created: 2026-01-22
 * Author: Aryadev Chavali
 * License: See end of file
 * Commentary: See /include/arl/lexer/token.h.
 */
 #include <arl/lexer/token.h>
 #include <arl/lib/base.h>
 #include <arl/lib/vec.h>
 // Maps KNOWN to its source spelling.  TOKEN_KNOWN_PUTSTR is the only known
 // symbol; every other value is reported through FAIL.
 const char *token_known_to_cstr(token_known_t known)
 {
  if (known == TOKEN_KNOWN_PUTSTR)
    return "putstr";
  // Not a recognised token_known_t (this includes NUM_TOKEN_KNOWNS).
  FAIL("Unexpected TOKEN_KNOWN value: %d\n", known);
 }
 // Builds a TOKEN_TYPE_KNOWN token carrying KNOWN, located at offset BYTE.
 token_t token_known(u64 byte, token_known_t known)
 {
  const token_t made = {
      .type          = TOKEN_TYPE_KNOWN,
      .byte_location = byte,
      .as_known      = known,
  };
  return made;
 }
 // Builds a TOKEN_TYPE_STRING token carrying the view STRING, located at
 // offset BYTE.  The view is stored as given; nothing is copied here.
 token_t token_string(u64 byte, sv_t string)
 {
  const token_t made = {
      .type          = TOKEN_TYPE_STRING,
      .byte_location = byte,
      .as_string     = string,
  };
  return made;
 }
 // Builds a TOKEN_TYPE_SYMBOL token carrying the view SYMBOL, located at
 // offset BYTE.  The view is stored as given; nothing is copied here.
 token_t token_symbol(u64 byte, sv_t symbol)
 {
  const token_t made = {
      .type          = TOKEN_TYPE_SYMBOL,
      .byte_location = byte,
      .as_symbol     = symbol,
  };
  return made;
 }
 // Writes a human-readable rendering of TOKEN to FP: "NIL" for a null pointer,
 // otherwise KNOWN(...), SYMBOL(...) or STRING(...) according to the token's
 // type tag.  Any other tag is reported through FAIL.
 void token_print(FILE *fp, token_t *token)
 {
  if (token == NULL)
  {
    fprintf(fp, "NIL");
    return;
  }
  const token_type_t tag = token->type;
  if (tag == TOKEN_TYPE_KNOWN)
  {
    fprintf(fp, "KNOWN(%s)", token_known_to_cstr(token->as_known));
  }
  else if (tag == TOKEN_TYPE_SYMBOL)
  {
    fprintf(fp, "SYMBOL(" PR_SV ")", SV_FMT(token->as_symbol));
  }
  else if (tag == TOKEN_TYPE_STRING)
  {
    fprintf(fp, "STRING(" PR_SV ")", SV_FMT(token->as_string));
  }
  else
  {
    // NUM_TOKEN_TYPES and anything out of range are not real token types
    FAIL("Unexpected token type: %d\n", token->type);
  }
 }
 // Dumps every token of the stream TOKEN to FP, one "\t[index]: token" line
 // per entry, wrapped in braces.  A null stream prints "{}".  Note that only
 // the empty-stream case ends with a newline; the other cases do not.
 void token_stream_print(FILE *fp, token_stream_t *token)
 {
  if (token == NULL)
  {
    fprintf(fp, "{}");
    return;
  }
  fprintf(fp, "{");
  if (token->vec.size == 0)
  {
    fprintf(fp, "}\n");
    return;
  }
  fprintf(fp, "\n");
  // the vector's size is divided by the element size to get the token count
  const u64 count = token->vec.size / sizeof(token_t);
  u64 idx         = 0;
  while (idx < count)
  {
    token_t current = VEC_GET(&token->vec, idx, token_t);
    fprintf(fp, "\t[%lu]: ", idx);
    token_print(fp, &current);
    fprintf(fp, "\n");
    ++idx;
  }
  fprintf(fp, "}");
 }
 // Releases the storage behind STREAM.  The only thing freed here is the
 // backing vector.
 void token_stream_free(token_stream_t *stream)
 {
  vec_t *storage = &stream->vec;
  vec_free(storage);
 }
 /* Copyright (C) 2026 Aryadev Chavali
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE.  See the MIT License for details.
 * You may distribute and modify this code under the terms of the MIT License,
 * which you should have received a copy of along with this program.  If not,
 * please go to <https://opensource.org/license/MIT>.
 */
--- a/src/main.c
+++ b/src/main.c
@@ -12,11 +12,11 @@
 #include <stdlib.h>
 #include <string.h>
 #include <arl/lexer/lexer.h>
 #include <arl/lexer/token.h>
 #include <arl/lib/base.h>
 #include <arl/lib/sv.h>
 #include <arl/lib/vec.h>
 #include <arl/parser/ast.h>
 #include <arl/parser/parser.h>
 int read_file(const char *filename, sv_t *ret)
 {
@@ -114,30 +114,29 @@ int main(int argc, char *argv[])
  LOG("%s => `" PR_SV "`\n", filename, SV_FMT(contents));
-  parse_stream_t stream = {.byte = 0, .contents = contents};
+  lex_stream_t stream   = {.byte = 0, .contents = contents};
-  ast_t ast             = {0};
+  token_stream_t tokens = {0};
-  parse_err_t perr      = parse(&ast, &stream);
+  lex_err_t perr        = lex_stream(&tokens, &stream);
  if (perr)
  {
    u64 line = 1, col = 0;
-    parse_stream_get_line_col(&stream, &line, &col);
+    lex_stream_get_line_col(&stream, &line, &col);
-    LOG_ERR("%s:%lu:%lu: %s\n", filename, line, col, parse_err_to_string(perr));
+    LOG_ERR("%s:%lu:%lu: %s\n", filename, line, col, lex_err_to_string(perr));
    ret = 1;
    goto end;
  }
-  LOG("Parsed %lu nodes\n", ast.nodes.size / sizeof(ast_node_t));
+  LOG("Lexed %lu tokens\n", tokens.vec.size / sizeof(token_t));
 #if VERBOSE_LOGS
-  ast_print(stdout, &ast);
+  token_stream_print(stdout, &tokens);
 #endif
  printf("\n");
 end:
  if (contents.data)
    free(contents.data);
-  if (ast.nodes.capacity > 0)
+  token_stream_free(&tokens);
    ast_free(&ast);
  return ret;
 }
--- a/src/parser/ast.c
+++ b/src/parser/ast.c
@@ -1,115 +0,0 @@
 /* ast.c: Implementation of AST constructor/destructor functions
 * Created: 2026-01-22
 * Author: Aryadev Chavali
 * License: See end of file
 * Commentary: See /include/arl/parser/ast.h.
 */
 #include <arl/lib/base.h>
 #include <arl/lib/vec.h>
 #include <arl/parser/ast.h>
 // Maps KNOWN to its source spelling ("putstr" for AST_KNOWN_PUTSTR); any
 // other value is reported through FAIL.
 const char *ast_known_to_cstr(ast_known_t known)
 {
  switch (known)
  {
  case AST_KNOWN_PUTSTR:
    return "putstr";
  default:
    FAIL("Unexpected AST_KNOWN value: %d\n", known);
  }
 }
 // Builds an AST_NODE_TYPE_KNOWN node carrying KNOWN, located at offset BYTE.
 ast_node_t ast_node_known(u64 byte, ast_known_t known)
 {
  return (ast_node_t){
      .byte_location = byte,
      .type          = AST_NODE_TYPE_KNOWN,
      .as_known      = known,
  };
 }
 // Builds an AST_NODE_TYPE_STRING node carrying the view STRING, located at
 // offset BYTE.  The view is stored as given; nothing is copied here.
 ast_node_t ast_node_string(u64 byte, sv_t string)
 {
  return (ast_node_t){
      .byte_location = byte,
      .type          = AST_NODE_TYPE_STRING,
      .as_string     = string,
  };
 }
 // Builds an AST_NODE_TYPE_SYMBOL node carrying the view SYMBOL, located at
 // offset BYTE.  The view is stored as given; nothing is copied here.
 ast_node_t ast_node_symbol(u64 byte, sv_t symbol)
 {
  return (ast_node_t){
      .byte_location = byte,
      .type          = AST_NODE_TYPE_SYMBOL,
      .as_symbol     = symbol,
  };
 }
 // Writes a human-readable rendering of NODE to FP: "NIL" for a null pointer,
 // otherwise KNOWN(...), SYMBOL(...) or STRING(...) according to the node's
 // type tag.  Any other tag is reported through FAIL.
 void ast_node_print(FILE *fp, ast_node_t *node)
 {
  if (!node)
  {
    fprintf(fp, "NIL");
    return;
  }
  switch (node->type)
  {
  case AST_NODE_TYPE_KNOWN:
    fprintf(fp, "KNOWN(%s)", ast_known_to_cstr(node->as_known));
    break;
  case AST_NODE_TYPE_SYMBOL:
    fprintf(fp, "SYMBOL(" PR_SV ")", SV_FMT(node->as_symbol));
    break;
  case AST_NODE_TYPE_STRING:
    fprintf(fp, "STRING(" PR_SV ")", SV_FMT(node->as_string));
    break;
  // the count sentinel is not a real node type, so it shares the error path
  case NUM_AST_NODE_TYPES:
  default:
    FAIL("Unexpected node type: %d\n", node->type);
  }
 }
 // Dumps every node of AST to FP, one "\t[index]: node" line per entry,
 // wrapped in braces.  A null AST prints "{}".  Note that only the empty-AST
 // case ends with a newline; the other cases do not.
 void ast_print(FILE *fp, ast_t *ast)
 {
  if (!ast)
  {
    fprintf(fp, "{}");
    return;
  }
  fprintf(fp, "{");
  if (ast->nodes.size == 0)
  {
    fprintf(fp, "}\n");
    return;
  }
  fprintf(fp, "\n");
  // the vector's size is divided by the element size to get the node count
  for (u64 i = 0; i < ast->nodes.size / sizeof(ast_node_t); ++i)
  {
    ast_node_t item = VEC_GET(&ast->nodes, i, ast_node_t);
    fprintf(fp, "\t[%lu]: ", i);
    ast_node_print(fp, &item);
    fprintf(fp, "\n");
  }
  fprintf(fp, "}");
 }
 // Releases the storage behind AST by freeing its node vector.
 void ast_free(ast_t *ast)
 {
  // we can free the vector itself and we're done
  vec_free(&ast->nodes);
 }
 /* Copyright (C) 2026 Aryadev Chavali
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE.  See the MIT License for details.
 * You may distribute and modify this code under the terms of the MIT License,
 * which you should have received a copy of along with this program.  If not,
 * please go to <https://opensource.org/license/MIT>.
 */