From a422c7d1dccc2f162d19cc76f439c096f9502ba8 Mon Sep 17 00:00:00 2001
From: Aryadev Chavali
Date: Sat, 6 Jul 2024 17:38:02 +0100
Subject: [PATCH] A reworked preprocesser with focus on stopping recursive errors

Preprocesser requires one function to use: preprocess.  It takes Tokens
and gives back Units.

A unit is a tree of tokens, where each unit is a node in that tree.  A
unit has a "root" token (value of node) and an "expansion" (children of
node), where the root is some preprocesser token (such as a reference or
USE call) and the expansion is the tokens it yields.  In the case of a
USE call this is the tokens of the file it includes; in the case of a
reference, it's the tokens of the constant it refers to.  This means
that the leaves of the tree of units are the completely
preprocessed/expanded form of the source code.

The function has many working components, which may need to be
extracted.  In particular, the function ensures we don't include a
source twice through a hash map, and that constants are not redefined in
inner include scopes if they're already defined in outer scopes (i.e. if
compiling a.asm which defines constant N, then including b.asm which
defines constant N, then N uses the definition of a.asm rather than
b.asm).

I need to make a spec for this.
--- Makefile | 2 +- src/preprocesser.cpp | 273 +++++++++++++++++++++++++++++++++++++++++++ src/preprocesser.hpp | 80 +++++++++++++ 3 files changed, 354 insertions(+), 1 deletion(-) create mode 100644 src/preprocesser.cpp create mode 100644 src/preprocesser.hpp diff --git a/Makefile b/Makefile index 7578e4b..e95ea69 100644 --- a/Makefile +++ b/Makefile @@ -15,7 +15,7 @@ TERM_RESET:=$(shell echo -e "\e[0;0m") # Setup variables for source code, output, etc ## ASSEMBLY setup SRC=src -CODE:=$(addprefix $(SRC)/, base.cpp lexer.cpp) +CODE:=$(addprefix $(SRC)/, base.cpp lexer.cpp preprocesser.cpp) OBJECTS:=$(CODE:$(SRC)/%.cpp=$(DIST)/%.o) OUT=$(DIST)/asm.out diff --git a/src/preprocesser.cpp b/src/preprocesser.cpp new file mode 100644 index 0000000..1de1ba4 --- /dev/null +++ b/src/preprocesser.cpp @@ -0,0 +1,273 @@ +/* Copyright (C) 2024 Aryadev Chavali + + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License Version 2 for + * details. + + * You may distribute and modify this code under the terms of the GNU General + * Public License Version 2, which you should have received a copy of along with + * this program. If not, please go to . 
+
+ * Created: 2024-07-05
+ * Author: Aryadev Chavali
+ * Description: Expands preprocesser directives (constants, references and
+ * USE calls) in a token stream into a tree of units.
+ */
+
+// NOTE(review): the include targets were lost when this patch was extracted;
+// the list below is reconstructed from the names this file uses.  base.hpp
+// presumably provides read_file and the TERM_* macros - confirm against the
+// repository.
+#include <algorithm>
+#include <iostream>
+#include <sstream>
+
+#include "base.hpp"
+#include "preprocesser.hpp"
+
+namespace Preprocesser
+{
+  using TT  = Lexer::Token::Type;
+  using ET  = Err::Type;
+  using LET = Lexer::Err::Type;
+
+  /* Expand the preprocesser directives found in `tokens`, appending the
+   * resulting units to `units`.
+   *
+   * tokens:        tokens to process (taken by value).
+   * units:         output; receives one Unit per ordinary token, per expanded
+   *                reference and per file included for the first time.
+   * new_token_bag: receives every token lexed from an included file so the
+   *                caller can free them later.
+   * const_map:     constant name -> {defining token, body tokens}; updated by
+   *                each constant definition encountered.
+   * file_map:      file name -> block; a file already present is never
+   *                included again.
+   * depth:         current recursion depth (0 for the top level call).
+   *
+   * Returns nullptr on success, otherwise a heap allocated Err which the
+   * caller must delete (an Err deletes its own child_error).
+   */
+  Err *preprocess(std::vector<Lexer::Token *> tokens, std::vector<Unit> &units,
+                  std::vector<Lexer::Token *> &new_token_bag, Map &const_map,
+                  Map &file_map, int depth)
+  {
+    // Nothing to expand.  Also guarantees tokens[0] exists for the depth
+    // error below.
+    if (tokens.empty())
+      return nullptr;
+
+    // Stop preprocessing if we've smashed the preprocessing call stack
+    if (depth >= PREPROCESSER_MAX_DEPTH)
+      return new Err{ET::EXCEEDED_PREPROCESSER_DEPTH, tokens[0]};
+
+    for (size_t i = 0; i < tokens.size(); ++i)
+    {
+      const auto token = tokens[i];
+      if (token->type == TT::PP_CONST)
+      {
+        if (i == tokens.size() - 1 || tokens[i + 1]->type != TT::SYMBOL)
+          return new Err{ET::EXPECTED_SYMBOL_FOR_NAME, token};
+        const auto const_name = tokens[i + 1]->content;
+
+        // Scan forward for the PP_END that closes this constant
+        size_t end = 0;
+        for (end = i + 2;
+             end < tokens.size() && tokens[end]->type != TT::PP_END; ++end)
+        {
+          // TODO: Is there a better way to deal with preprocesser calls inside
+          // of a constant?
+          if (tokens[end]->type == TT::PP_CONST ||
+              tokens[end]->type == TT::PP_USE)
+            return new Err{ET::DIRECTIVES_IN_CONST_BODY, tokens[end]};
+        }
+
+        if (end == tokens.size())
+          return new Err{ET::EXPECTED_END, token};
+        else if (end - i == 2)
+          return new Err{ET::EMPTY_CONST, token};
+
+        // If this content is actually being included (depth > 0) by another
+        // file, check if the constant is already defined.  If so, stop what
+        // we're doing and continue because user has technically redefined the
+        // constant.  Implements a #ifndef guard automatically on included
+        // constants.
+        if (depth > 0 && const_map.find(const_name) != const_map.end())
+        {
+          i = end;
+#if VERBOSE >= 2
+          std::cout << "[" TERM_YELLOW "PREPROCESSER" TERM_RESET "]: <" << depth
+                    << "> [" << i << "]:\n\tPreserving definition of `"
+                    << const_name << "` from outer scope\n";
+#endif
+          continue;
+        }
+
+        // Body is every token strictly between the name and the PP_END
+        std::vector<Lexer::Token *> body(std::begin(tokens) + i + 2,
+                                         std::begin(tokens) + end);
+
+        const_map[const_name] = {token, body};
+        i                     = end;
+
+#if VERBOSE >= 2
+        std::cout << "[" TERM_YELLOW "PREPROCESSER" TERM_RESET "]: <" << depth
+                  << "> [" << i << "]:\n\tConstant `" << const_name << "` {\n";
+
+        for (size_t j = 0; j < body.size(); ++j)
+        {
+          std::cout << "\t\t[" << j << "]: ";
+          if (body[j])
+            std::cout << *body[j];
+          else
+            std::cout << "[NULL]";
+          std::cout << "\n";
+        }
+        std::cout << "\t}\n";
+#endif
+      }
+      else if (token->type == TT::PP_USE)
+      {
+        // Ensure string in next token
+        if (i == tokens.size() - 1 || tokens[i + 1]->type != TT::LITERAL_STRING)
+          return new Err{ET::EXPECTED_FILE_NAME_AS_STRING, token};
+        // Stops recursive calls on the file currently being preprocessed
+        if (file_map.find(token->source_name) == file_map.end())
+          file_map[token->source_name] = {};
+
+        const auto name = tokens[i + 1]->content;
+#if VERBOSE >= 2
+        std::cout << "[" TERM_YELLOW "PREPROCESSER" TERM_RESET "]: <" << depth
+                  << "> [" << i << "]: (" << *tokens[i] << "): FILENAME=`"
+                  << name << "`\n";
+#endif
+        // If file has never been encountered, let's tokenise, preprocess then
+        // cache the result
+        if (file_map.find(name) == file_map.end())
+        {
+          auto content = read_file(tokens[i + 1]->content.c_str());
+
+          if (!content.has_value())
+            return new Err{ET::FILE_NON_EXISTENT, token};
+
+          std::vector<Lexer::Token *> body;
+          Lexer::Err lexer_err = Lexer::tokenise_buffer(tokens[i + 1]->content,
+                                                        content.value(), body);
+
+          // NOTE(review): tokens already placed in `body` are not added to the
+          // bag on this path - confirm tokenise_buffer cleans up on failure.
+          if (lexer_err.type != LET::OK)
+            return new Err{ET::IN_FILE_LEXING, token, nullptr, lexer_err};
+
+          // Here we add the tokens, freshly allocated, to the bag so we can
+          // free it later
+          new_token_bag.insert(std::end(new_token_bag), std::begin(body),
+                               std::end(body));
+
+          file_map[name].body = body;
+          std::vector<Unit> body_units;
+          Err *err = preprocess(body, body_units, new_token_bag, const_map,
+                                file_map, depth + 1);
+          // TODO: Introduce stack traces for this error (this error occurs in
+          // outside file that has use site in current file).
+          if (err)
+            return new Err{ET::IN_ERROR, token, err};
+          units.push_back(Unit{token, body_units});
+          ++i;
+        }
+        // Otherwise file must be part of the source tree already, so skip this
+        // call
+        else
+          i += 1;
+      }
+      else if (token->type == TT::PP_REFERENCE)
+      {
+        // Reference expansion based on latest constant
+        const auto found = const_map.find(token->content);
+        if (found == const_map.end())
+          return new Err{ET::UNKNOWN_NAME_IN_REFERENCE, token};
+
+        std::vector<Unit> preprocessed;
+        Err *err = preprocess(found->second.body, preprocessed, new_token_bag,
+                              const_map, file_map, depth + 1);
+        if (err)
+          return new Err{ET::IN_ERROR, token, err};
+        units.push_back(Unit{token, preprocessed});
+      }
+      else if (token->type == TT::PP_END)
+        return new Err{ET::NO_CONST_AROUND, token};
+      else
+        units.push_back(Unit{token, {}});
+    }
+    return nullptr;
+  }
+
+  /* Render `unit` and its expansion as an indented tree, one node per line,
+   * with `depth` leading tabs on the root line. */
+  std::string to_string(const Unit &unit, int depth)
+  {
+    std::stringstream ss;
+    for (int i = 0; i < depth; ++i)
+      ss << "\t";
+    ss << Lexer::to_string(*unit.root) << " => {";
+    if (unit.expansion.size() != 0)
+    {
+      ss << "\n";
+      // By reference: copying a Unit copies its whole subtree
+      for (const auto &child : unit.expansion)
+        ss << to_string(child, depth + 1) << "\n";
+      for (int i = 0; i < depth; ++i)
+        ss << "\t";
+    }
+    ss << "}";
+    return ss.str();
+  }
+
+  /* Name of an error type; empty string for a value outside the enum. */
+  std::string to_string(const Err::Type &type)
+  {
+    switch (type)
+    {
+    case ET::EXPECTED_END:
+      return "EXPECTED_END";
+    case ET::EMPTY_CONST:
+      return "EMPTY_CONST";
+    case ET::NO_CONST_AROUND:
+      return "NO_CONST_AROUND";
+    case ET::EXPECTED_SYMBOL_FOR_NAME:
+      return "EXPECTED_SYMBOL_FOR_NAME";
+    case ET::DIRECTIVES_IN_CONST_BODY:
+      return "DIRECTIVES_IN_CONST_BODY";
+    case ET::UNKNOWN_NAME_IN_REFERENCE:
+      return "UNKNOWN_NAME_IN_REFERENCE";
+    case ET::EXPECTED_FILE_NAME_AS_STRING:
+      return "EXPECTED_FILE_NAME_AS_STRING";
+    case ET::FILE_NON_EXISTENT:
+      return "FILE_NON_EXISTENT";
+    case ET::IN_FILE_LEXING:
+      return "IN_FILE_LEXING";
+    case ET::SELF_RECURSIVE_USE_CALL:
+      return "SELF_RECURSIVE_USE_CALL";
+    case ET::IN_ERROR:
+      return "IN_ERROR";
+    case ET::EXCEEDED_PREPROCESSER_DEPTH:
+      return "EXCEEDED_PREPROCESSER_DEPTH";
+    default:
+      return "";
+    }
+  }
+
+  /* Render an error chain as "source:line:column: TYPE" lines, innermost
+   * child error first and `err` itself last.  The location prefix is omitted
+   * for an error that carries no token. */
+  std::string to_string(const Err &err)
+  {
+    // Collect the chain outermost -> innermost...
+    std::vector<const Err *> chain;
+    for (const Err *e = &err; e; e = e->child_error)
+      chain.push_back(e);
+
+    // ...then print it back to front
+    std::stringstream ss;
+    for (auto it = chain.rbegin(); it != chain.rend(); ++it)
+    {
+      const Err &e = **it;
+      if (e.token)
+        ss << e.token->source_name << ":" << e.token->line << ":"
+           << e.token->column << ": ";
+      ss << to_string(e.type);
+      if (it + 1 != chain.rend())
+        ss << "\n";
+    }
+    return ss.str();
+  }
+
+  std::ostream &operator<<(std::ostream &stream, const Unit &unit)
+  {
+    return stream << to_string(unit, 1);
+  }
+
+  std::ostream &operator<<(std::ostream &stream, const Err &err)
+  {
+    return stream << to_string(err);
+  }
+
+  // Every member is initialised so that the destructor never deletes an
+  // indeterminate child_error; `type` value-initialises to the first
+  // enumerator.
+  Err::Err() : token{nullptr}, child_error{nullptr}, lexer_error{}, type{}
+  {
+  }
+
+  Err::Err(Err::Type type, Lexer::Token *root, Err *child, Lexer::Err err)
+      : token{root}, child_error{child}, lexer_error{err}, type{type}
+  {
+  }
+
+  // An error owns its child; deleting nullptr is a no-op
+  Err::~Err(void)
+  {
+    delete this->child_error;
+  }
+
+} // namespace Preprocesser
diff --git a/src/preprocesser.hpp b/src/preprocesser.hpp
new file mode 100644
index 0000000..3378428
--- /dev/null
+++ b/src/preprocesser.hpp
@@ -0,0 +1,80 @@
+/* Copyright (C) 2024 Aryadev Chavali
+
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * 
FOR A PARTICULAR PURPOSE. See the GNU General Public License Version 2 for
+ * details.
+
+ * You may distribute and modify this code under the terms of the GNU General
+ * Public License Version 2, which you should have received a copy of along with
+ * this program. If not, please go to .
+
+ * Created: 2024-07-03
+ * Author: Aryadev Chavali
+ * Description: Interface of the preprocesser: turns tokens into a tree of
+ * units.
+ */
+
+#ifndef PREPROCESSER_HPP
+#define PREPROCESSER_HPP
+
+// NOTE(review): the include targets were lost when this patch was extracted;
+// the list below is reconstructed from the names this header uses - confirm
+// against the repository.
+#include <ostream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "lexer.hpp"
+
+namespace Preprocesser
+{
+// Maximum nesting of preprocess calls before expansion is abandoned
+#define PREPROCESSER_MAX_DEPTH 16
+
+  // A named run of tokens: the body of a constant or of an included file
+  struct Block
+  {
+    Lexer::Token *root;
+    std::vector<Lexer::Token *> body;
+  };
+
+  // Lookup from a constant name or a file name to its block
+  using Map = std::unordered_map<std::string, Block>;
+
+  // A node in the expansion tree: `root` is the token itself and `expansion`
+  // holds the units it yields (empty for an ordinary token)
+  struct Unit
+  {
+    Lexer::Token *const root;
+    std::vector<Unit> expansion;
+  };
+
+  // A preprocessing error.  child_error (if any) is the error that caused
+  // this one and is deleted by the destructor.
+  // NOTE(review): the implicit copy operations copy child_error shallowly, so
+  // copying an Err with a child would delete that child twice - consider
+  // deleting or defining them.
+  struct Err
+  {
+    // Default member initialisers keep a default constructed Err safe to
+    // destroy and to print
+    Lexer::Token *token = nullptr;
+    Err *child_error    = nullptr;
+    Lexer::Err lexer_error{};
+    enum class Type
+    {
+      EXPECTED_END,
+      NO_CONST_AROUND,
+      EMPTY_CONST,
+      EXPECTED_SYMBOL_FOR_NAME,
+      DIRECTIVES_IN_CONST_BODY,
+      UNKNOWN_NAME_IN_REFERENCE,
+
+      EXPECTED_FILE_NAME_AS_STRING,
+      FILE_NON_EXISTENT,
+      IN_FILE_LEXING,
+      SELF_RECURSIVE_USE_CALL,
+
+      IN_ERROR,
+      EXCEEDED_PREPROCESSER_DEPTH,
+    } type = Type::EXPECTED_END;
+
+    Err();
+    Err(Err::Type, Lexer::Token *, Err *child = nullptr, Lexer::Err err = {});
+    ~Err(void);
+  };
+
+  std::string to_string(const Unit &, int depth = 0);
+  std::string to_string(const Err::Type &);
+  std::string to_string(const Err &);
+  std::ostream &operator<<(std::ostream &, const Unit &);
+  std::ostream &operator<<(std::ostream &, const Err &);
+
+  // Expands the directives in `tokens` into `units`.  Returns nullptr on
+  // success, otherwise a heap allocated error owned by the caller.
+  Err *preprocess(std::vector<Lexer::Token *> tokens, std::vector<Unit> &units,
+                  std::vector<Lexer::Token *> &new_token_bag, Map &const_map,
+                  Map &file_map, int depth = 0);
+} // namespace Preprocesser
+#endif