A reworked preprocessor with a focus on stopping recursive errors

The preprocessor exposes a single entry point, preprocess: it takes
Tokens and gives back Units.
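
In terms of the signature declared in preprocesser.hpp below (a
sketch; lexing is elided and the names are illustrative):

    std::vector<Lexer::Token *> tokens; // from Lexer::tokenise_buffer
    std::vector<Preprocesser::Unit> units;
    std::vector<Lexer::Token *> bag; // tokens allocated while preprocessing
    Preprocesser::Map consts, files;
    Preprocesser::Err *err =
        Preprocesser::preprocess(tokens, units, bag, consts, files);
    // err == nullptr on success; units now holds the expansion tree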

A unit is a tree of tokens, where each unit is a node in that tree.
A unit has a "root" token (the value of the node) and an "expansion"
(the children of the node), where the root is some preprocessor token
(such as a reference or USE call) and the expansion is the tokens it
yields: for a USE call, the tokens of the file it includes; for a
reference, the tokens of the constant it refers to. An ordinary token
becomes a leaf unit with an empty expansion, so the leaves of the tree
are the completely preprocessed/expanded form of the source code.
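
So the fully expanded token stream can be recovered by walking the
leaves; a sketch (not part of this commit, assuming the headers below
are included, and ignoring corner cases such as a USE of an empty
file, whose unit is a leaf with a preprocessor root):

    void flatten(const Preprocesser::Unit &unit,
                 std::vector<Lexer::Token *> &out)
    {
      if (unit.expansion.empty())
        out.push_back(unit.root); // leaf: an ordinary, fully expanded token
      else
        for (const auto &child : unit.expansion)
          flatten(child, out);
    }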

The function has many working components, which may need to be
extracted.  In particular, it uses hash maps to ensure a source file
is never included twice and that constants are not redefined in inner
include scopes when they're already defined in outer scopes (i.e. if
compiling a.asm, which defines constant N, then including b.asm, which
also defines N, N keeps the definition from a.asm rather than from
b.asm).
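
Concretely (a sketch: the %-directive spelling and ;; comment syntax
here are assumed; the real syntax is whatever the lexer emits as
PP_CONST/PP_USE/PP_END tokens):

    ;; a.asm
    %const N 1 %end
    %use "b.asm"

    ;; b.asm -- its %const N is skipped at include depth > 0 because N
    ;; is already defined, so references to N expand to a.asm's body
    %const N 2 %end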

I need to make a spec for this.
2024-07-06 17:38:02 +01:00
parent 1145b97c4c
commit a422c7d1dc
3 changed files with 354 additions and 1 deletion


@@ -15,7 +15,7 @@ TERM_RESET:=$(shell echo -e "\e[0;0m")
# Setup variables for source code, output, etc
## ASSEMBLY setup
SRC=src
-CODE:=$(addprefix $(SRC)/, base.cpp lexer.cpp)
+CODE:=$(addprefix $(SRC)/, base.cpp lexer.cpp preprocesser.cpp)
OBJECTS:=$(CODE:$(SRC)/%.cpp=$(DIST)/%.o)
OUT=$(DIST)/asm.out

src/preprocesser.cpp (new file, +273)

@@ -0,0 +1,273 @@
/* Copyright (C) 2024 Aryadev Chavali
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License Version 2 for
* details.
* You may distribute and modify this code under the terms of the GNU General
* Public License Version 2, which you should have received a copy of along with
* this program. If not, please go to <https://www.gnu.org/licenses/>.
* Created: 2024-07-05
* Author: Aryadev Chavali
* Description:
*/
#include <src/base.hpp>
#include <src/lexer.hpp>
#include <src/preprocesser.hpp>
#include <iostream>
#include <sstream>
namespace Preprocesser
{
using TT = Lexer::Token::Type;
using ET = Err::Type;
using LET = Lexer::Err::Type;
Err *preprocess(std::vector<Lexer::Token *> tokens, std::vector<Unit> &units,
std::vector<Lexer::Token *> &new_token_bag, Map &const_map,
Map &file_map, int depth)
{
// Stop preprocessing if we've smashed the preprocessing call stack
if (depth >= PREPROCESSER_MAX_DEPTH)
return new Err{ET::EXCEEDED_PREPROCESSER_DEPTH, tokens[0]};
for (size_t i = 0; i < tokens.size(); ++i)
{
const auto token = tokens[i];
if (token->type == TT::PP_CONST)
{
if (i == tokens.size() - 1 || tokens[i + 1]->type != TT::SYMBOL)
return new Err{ET::EXPECTED_SYMBOL_FOR_NAME, token};
const auto const_name = tokens[i + 1]->content;
size_t end = 0;
for (end = i + 2;
end < tokens.size() && tokens[end]->type != TT::PP_END; ++end)
{
// TODO: Is there a better way to deal with preprocesser calls inside
// of a constant?
if (tokens[end]->type == TT::PP_CONST ||
tokens[end]->type == TT::PP_USE)
return new Err{ET::DIRECTIVES_IN_CONST_BODY, tokens[end]};
}
if (end == tokens.size())
return new Err{ET::EXPECTED_END, token};
else if (end - i == 2)
return new Err{ET::EMPTY_CONST, token};
// If this content is actually being included (depth > 0) by another
// file, check if the constant is already defined. If so, stop what
// we're doing and continue because user has technically redefined the
// constant. Implements a #ifndef guard automatically on included
// constants.
if (depth > 0 && const_map.find(const_name) != const_map.end())
{
i = end;
#if VERBOSE >= 2
std::cout << "[" TERM_YELLOW "PREPROCESSER" TERM_RESET "]: <" << depth
<< "> [" << i << "]:\n\tPreserving definition of `"
<< const_name << "` from outer scope\n";
#endif
continue;
}
std::vector<Lexer::Token *> body{end - i - 2};
std::copy(std::begin(tokens) + i + 2, std::begin(tokens) + end,
std::begin(body));
const_map[const_name] = {token, body};
i = end;
#if VERBOSE >= 2
std::cout << "[" TERM_YELLOW "PREPROCESSER" TERM_RESET "]: <" << depth
<< "> [" << i << "]:\n\tConstant `" << const_name << "` {\n";
for (size_t j = 0; j < body.size(); ++j)
{
std::cout << "\t\t[" << j << "]: ";
if (body[j])
std::cout << *body[j];
else
std::cout << "[NULL]";
std::cout << "\n";
}
std::cout << "\t}\n";
#endif
}
else if (token->type == TT::PP_USE)
{
// Ensure string in next token
if (i == tokens.size() - 1 || tokens[i + 1]->type != TT::LITERAL_STRING)
return new Err{ET::EXPECTED_FILE_NAME_AS_STRING, token};
// Stops recursive calls on the file currently being preprocessed
if (file_map.find(token->source_name) == file_map.end())
file_map[token->source_name] = {};
const auto name = tokens[i + 1]->content;
#if VERBOSE >= 2
std::cout << "[" TERM_YELLOW "PREPROCESSER" TERM_RESET "]: <" << depth
<< "> [" << i << "]: (" << *tokens[i] << "): FILENAME=`"
<< name << "`\n";
#endif
// If file has never been encountered, let's tokenise, preprocess then
// cache the result
if (file_map.find(name) == file_map.end())
{
auto content = read_file(tokens[i + 1]->content.c_str());
if (!content.has_value())
return new Err{ET::FILE_NON_EXISTENT, token};
std::vector<Lexer::Token *> body;
Lexer::Err lexer_err = Lexer::tokenise_buffer(tokens[i + 1]->content,
content.value(), body);
if (lexer_err.type != LET::OK)
return new Err{ET::IN_FILE_LEXING, token, nullptr, lexer_err};
// Here we add the tokens, freshly allocated, to the bag so we can
// free it later
new_token_bag.insert(std::end(new_token_bag), std::begin(body),
std::end(body));
file_map[name].body = body;
std::vector<Unit> body_units;
Err *err = preprocess(body, body_units, new_token_bag, const_map,
file_map, depth + 1);
// TODO: Introduce stack traces for this error (this error occurs in
// outside file that has use site in current file).
if (err)
return new Err{ET::IN_ERROR, token, err};
units.push_back(Unit{token, body_units});
++i;
}
// Otherwise file must be part of the source tree already, so skip this
// call
else
i += 1;
}
else if (token->type == TT::PP_REFERENCE)
{
// Reference expansion based on latest constant
const auto found = const_map.find(token->content);
if (found == const_map.end())
return new Err{ET::UNKNOWN_NAME_IN_REFERENCE, token};
std::vector<Unit> preprocessed;
Err *err = preprocess(found->second.body, preprocessed, new_token_bag,
const_map, file_map, depth + 1);
if (err)
return new Err{ET::IN_ERROR, token, err};
units.push_back(Unit{token, preprocessed});
}
else if (token->type == TT::PP_END)
return new Err{ET::NO_CONST_AROUND, token};
else
units.push_back(Unit{token, {}});
}
return nullptr;
}
std::string to_string(const Unit &unit, int depth)
{
std::stringstream ss;
for (int i = 0; i < depth; ++i)
ss << "\t";
ss << Lexer::to_string(*unit.root) << " => {";
if (unit.expansion.size() != 0)
{
ss << "\n";
for (auto child : unit.expansion)
ss << to_string(child, depth + 1) << "\n";
for (int i = 0; i < depth; ++i)
ss << "\t";
}
ss << "}";
return ss.str();
}
std::string to_string(const Err::Type &type)
{
switch (type)
{
case ET::EXPECTED_END:
return "EXPECTED_END";
case ET::EMPTY_CONST:
return "EMPTY_CONST";
case ET::NO_CONST_AROUND:
return "NO_CONST_AROUND";
case ET::EXPECTED_SYMBOL_FOR_NAME:
return "EXPECTED_SYMBOL_FOR_NAME";
case ET::DIRECTIVES_IN_CONST_BODY:
return "DIRECTIVES_IN_CONST_BODY";
case ET::UNKNOWN_NAME_IN_REFERENCE:
return "UNKNOWN_NAME_IN_REFERENCE";
case ET::EXPECTED_FILE_NAME_AS_STRING:
return "EXPECTED_FILE_NAME_AS_STRING";
case ET::FILE_NON_EXISTENT:
return "FILE_NON_EXISTENT";
case ET::IN_FILE_LEXING:
return "IN_FILE_LEXING";
case ET::SELF_RECURSIVE_USE_CALL:
return "SELF_RECURSIVE_USE_CALL";
case ET::IN_ERROR:
return "IN_ERROR";
case ET::EXCEEDED_PREPROCESSER_DEPTH:
return "EXCEEDED_PREPROCESSER_DEPTH";
default:
return "";
}
}
std::string to_string(const Err &err)
{
std::stringstream ss;
// Reverse traversal of err linked list
std::vector<Err *> errors;
errors.push_back((Err *)&err);
for (Err *e = err.child_error; e; e = e->child_error)
errors.insert(errors.begin(), e);
for (size_t depth = 0; depth < errors.size(); ++depth)
{
// for (size_t i = 0; i < depth; ++i)
// ss << " ";
const Err &e = *errors[depth];
ss << e.token->source_name << ":" << e.token->line << ":"
<< e.token->column << ": " << to_string(e.type);
if (depth != errors.size() - 1)
ss << "\n";
}
return ss.str();
}
std::ostream &operator<<(std::ostream &stream, const Unit &unit)
{
return stream << to_string(unit, 1);
}
std::ostream &operator<<(std::ostream &stream, const Err &err)
{
return stream << to_string(err);
}
// Null the pointers so ~Err is safe on a default-constructed Err
Err::Err() : token{nullptr}, child_error{nullptr}, lexer_error{}, type{}
{
}
Err::Err(Err::Type type, Lexer::Token *root, Err *child, Lexer::Err err)
: token{root}, child_error{child}, lexer_error{err}, type{type}
{
}
Err::~Err(void)
{
if (this->child_error)
delete this->child_error;
}
} // namespace Preprocesser

src/preprocesser.hpp (new file, +80)

@@ -0,0 +1,80 @@
/* Copyright (C) 2024 Aryadev Chavali
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License Version 2 for
* details.
* You may distribute and modify this code under the terms of the GNU General
* Public License Version 2, which you should have received a copy of along with
* this program. If not, please go to <https://www.gnu.org/licenses/>.
* Created: 2024-07-03
* Author: Aryadev Chavali
* Description:
*/
#ifndef PREPROCESSER_HPP
#define PREPROCESSER_HPP
#include <ostream>
#include <unordered_map>
#include <src/lexer.hpp>
namespace Preprocesser
{
#define PREPROCESSER_MAX_DEPTH 16
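// NOTE: both USE expansion and reference expansion recurse with depth + 1, so
// runaway expansion (e.g. a constant whose body references itself) terminates
// with EXCEEDED_PREPROCESSER_DEPTH instead of smashing the call stack.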
struct Block
{
Lexer::Token *root;
std::vector<Lexer::Token *> body;
};
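// Map serves double duty in preprocess: as const_map (constant name -> its
// body tokens) and as file_map (file name -> tokens lexed from that file).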
typedef std::unordered_map<std::string, Block> Map;
struct Unit
{
Lexer::Token *const root;
std::vector<Unit> expansion;
};
struct Err
{
Lexer::Token *token;
Err *child_error;
Lexer::Err lexer_error;
enum class Type
{
EXPECTED_END,
NO_CONST_AROUND,
EMPTY_CONST,
EXPECTED_SYMBOL_FOR_NAME,
DIRECTIVES_IN_CONST_BODY,
UNKNOWN_NAME_IN_REFERENCE,
EXPECTED_FILE_NAME_AS_STRING,
FILE_NON_EXISTENT,
IN_FILE_LEXING,
SELF_RECURSIVE_USE_CALL,
IN_ERROR,
EXCEEDED_PREPROCESSER_DEPTH,
} type;
Err();
Err(Err::Type, Lexer::Token *, Err *child = nullptr, Lexer::Err err = {});
~Err(void);
};
std::string to_string(const Unit &, int depth = 0);
std::string to_string(const Err::Type &);
std::string to_string(const Err &);
std::ostream &operator<<(std::ostream &, const Unit &);
std::ostream &operator<<(std::ostream &, const Err &);
Err *preprocess(std::vector<Lexer::Token *> tokens, std::vector<Unit> &units,
std::vector<Lexer::Token *> &new_token_bag, Map &const_map,
Map &file_map, int depth = 0);
}; // namespace Preprocesser
#endif
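
For reference, a hedged sketch of error handling and cleanup
(ownership is inferred from the comments in preprocesser.cpp: the
caller owns the tokens accumulated in new_token_bag, assumed here to
be new-allocated, and deleting an Err frees its whole child chain):

    if (err) // as returned by the preprocess call sketched above
    {
      std::cerr << *err << "\n"; // innermost error printed first
      delete err;                // ~Err recursively deletes child_error
    }
    for (auto *token : bag) // tokens lexed for included files
      delete token;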