/* Copyright (C) 2024 Aryadev Chavali * You may distribute and modify this code under the terms of the * GPLv2 license. You should have received a copy of the GPLv2 * license with this file. If not, please write to: * aryadev@aryadevchavali.com. * Created: 2024-04-14 * Author: Aryadev Chavali * Description: Lexer for assembly language */ extern "C" { #include } #include #include #include "./lexer.hpp" static_assert(NUMBER_OF_OPCODES == 98, "ERROR: Lexer is out of date"); using std::string, std::string_view, std::pair, std::make_pair; const auto VALID_SYMBOL = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUV" "WXYZ0123456789-_.:%#$", VALID_DIGIT = "0123456789", VALID_HEX = "0123456789abcdefABCDEF"; bool is_char_in_s(char c, const char *s) { return string_view(s).find(c) != string::npos; } bool initial_match(string_view src, string_view match) { return (src.size() > match.size() && src.substr(0, match.size()) == match); } pair tokenise_symbol(string_view &source, size_t &column, size_t line) { auto end = source.find_first_not_of(VALID_SYMBOL); if (end == string::npos) end = source.size() - 1; string sym{source.substr(0, end)}; source.remove_prefix(end); std::transform(sym.begin(), sym.end(), sym.begin(), ::toupper); token_t t{}; if (sym == "%CONST") { t.type = token_type_t::PP_CONST; } else if (sym == "%USE") { t.type = token_type_t::PP_USE; } else if (sym == "%END") { t.type = token_type_t::PP_END; } else if (sym[0] == '%') { return make_pair( t, lerr_t(lerr_type_t::INVALID_PREPROCESSOR_DIRECTIVE, column, line)); } else if (sym.size() > 1 && sym[0] == '$') { t = token_t(token_type_t::PP_REFERENCE, sym.substr(1)); } else if (sym == "NOOP") { t.type = token_type_t::NOOP; } else if (sym == "HALT") { t.type = token_type_t::HALT; } else if (initial_match(sym, "PUSH.REG.")) { t = token_t(token_type_t::PUSH_REG, sym.substr(9)); } else if (initial_match(sym, "PUSH.")) { t = token_t(token_type_t::PUSH, sym.substr(5)); } else if (initial_match(sym, "POP.")) { t = token_t(token_type_t::POP, sym.substr(4)); } else if (initial_match(sym, "MOV.")) { t = token_t(token_type_t::MOV, sym.substr(4)); } else if (initial_match(sym, "DUP.")) { t = token_t(token_type_t::DUP, sym.substr(4)); } else if (initial_match(sym, "MALLOC.STACK.")) { t = token_t(token_type_t::MALLOC_STACK, sym.substr(13)); } else if (initial_match(sym, "MALLOC.")) { t = token_t(token_type_t::MALLOC, sym.substr(7)); } else if (initial_match(sym, "MSET.STACK.")) { t = token_t(token_type_t::MSET_STACK, sym.substr(11)); } else if (initial_match(sym, "MSET.")) { t = token_t(token_type_t::MSET, sym.substr(5)); } else if (initial_match(sym, "MGET.STACK.")) { t = token_t(token_type_t::MGET_STACK, sym.substr(11)); } else if (initial_match(sym, "MGET.")) { t = token_t(token_type_t::MGET, sym.substr(5)); } else if (sym == "MDELETE") { t.type = token_type_t::MDELETE; } else if (sym == "MSIZE") { t.type = token_type_t::MSIZE; } else if (initial_match(sym, "NOT.")) { t = token_t(token_type_t::NOT, sym.substr(4)); } else if (initial_match(sym, "OR.")) { t = token_t(token_type_t::OR, sym.substr(3)); } else if (initial_match(sym, "AND.")) { t = token_t(token_type_t::AND, sym.substr(4)); } else if (initial_match(sym, "XOR.")) { t = token_t(token_type_t::XOR, sym.substr(4)); } else if (initial_match(sym, "EQ.")) { t = token_t(token_type_t::EQ, sym.substr(3)); } else if (initial_match(sym, "LTE.")) { t = token_t(token_type_t::LTE, sym.substr(4)); } else if (initial_match(sym, "LT.")) { t = token_t(token_type_t::LT, sym.substr(3)); } else if (initial_match(sym, "GTE.")) { t = token_t(token_type_t::GTE, sym.substr(4)); } else if (initial_match(sym, "GT.")) { t = token_t(token_type_t::GT, sym.substr(3)); } else if (initial_match(sym, "SUB.")) { t = token_t(token_type_t::SUB, sym.substr(4)); } else if (initial_match(sym, "PLUS.")) { t = token_t(token_type_t::PLUS, sym.substr(5)); } else if (initial_match(sym, "MULT.")) { t = token_t(token_type_t::MULT, sym.substr(5)); } else if (initial_match(sym, "PRINT.")) { t = token_t(token_type_t::PRINT, sym.substr(6)); } else if (sym == "JUMP.ABS") { t.type = token_type_t::JUMP_ABS; } else if (sym == "JUMP.STACK") { t.type = token_type_t::JUMP_STACK; } else if (initial_match(sym, "JUMP.IF.")) { t = token_t(token_type_t::JUMP_IF, sym.substr(8)); } else if (sym == "CALL.STACK") { t.type = token_type_t::CALL_STACK; } else if (sym == "CALL") { t.type = token_type_t::CALL; } else if (sym == "RET") { t.type = token_type_t::RET; } else if (sym == "GLOBAL") { t.type = token_type_t::GLOBAL; } else { t.type = token_type_t::SYMBOL; } if (t.content == "") t.content = sym; t.column = column; column += sym.size() - 1; return make_pair(t, lerr_t()); } token_t tokenise_literal_number(string_view &source, size_t &column) { bool is_negative = false; if (source[0] == '-') { is_negative = true; source.remove_prefix(1); } auto end = source.find_first_not_of(VALID_DIGIT); if (end == string::npos) end = source.size() - 1; string digits{source.substr(0, end)}; source.remove_prefix(end); token_t t{token_type_t::LITERAL_NUMBER, (is_negative ? "-" : "") + digits, column}; column += digits.size() + (is_negative ? 1 : 0); return t; } token_t tokenise_literal_hex(string_view &source, size_t &column) { // Remove x char from source source.remove_prefix(1); auto end = source.find_first_not_of(VALID_HEX); if (end == string::npos) end = source.size() - 1; string digits{source.substr(0, end)}; source.remove_prefix(end); token_t t = {token_type_t::LITERAL_NUMBER, "0x" + digits, column}; column += digits.size() + 1; return t; } pair tokenise_literal_char(string_view &source, size_t &column, size_t &line) { token_t t{}; auto end = source.find('\'', 1); if (source.size() < 3 || end == 1 || end > 3) return make_pair(t, lerr_t(lerr_type_t::INVALID_CHAR_LITERAL, column, line)); else if (source[1] == '\\') { // Escape sequence char escape = '\0'; if (source.size() < 4 || source[3] != '\'') return make_pair(t, lerr_t(lerr_type_t::INVALID_CHAR_LITERAL_ESCAPE_SEQUENCE, column, line)); switch (source[2]) { case 'n': escape = '\n'; break; case 't': escape = '\t'; break; case 'r': escape = '\r'; break; case '\\': escape = '\\'; break; default: column += 2; return make_pair(t, lerr_t(lerr_type_t::INVALID_CHAR_LITERAL_ESCAPE_SEQUENCE, column, line)); break; } t = token_t{token_type_t::LITERAL_CHAR, std::to_string(escape), column}; column += 4; source.remove_prefix(4); } else { t = token_t(token_type_t::LITERAL_CHAR, std::to_string(source[1])); column += 3; source.remove_prefix(3); } return make_pair(t, lerr_t()); } token_t tokenise_literal_string(string_view &source, size_t &column, size_t end) { source.remove_prefix(1); token_t token{token_type_t::LITERAL_STRING, string(source.substr(0, end - 1)), column}; source.remove_prefix(end); column += end + 1; return token; } lerr_t tokenise_buffer(string_view source, std::vector &tokens) { size_t column = 0, line = 1; while (source.size() > 0) { bool is_token = true; char first = source[0]; token_t t{}; if (isspace(first) || first == '\0') { size_t i; for (i = 0; i < source.size() && (isspace(source[i]) || source[i] == '\0'); ++i) { ++column; if (source[i] == '\n') { column = 0; ++line; } } ++column; source.remove_prefix(i); is_token = false; } else if (first == ';') { size_t i; for (i = 0; i < source.size() && source[i] != '\n'; ++i) continue; column = 0; ++line; source.remove_prefix(i + 1); is_token = false; } else if (first == '*') { t = token_t(token_type_t::STAR, "", column); source.remove_prefix(1); } else if (first == '\"') { auto end = source.find('\"', 1); if (end == string::npos) return lerr_t(lerr_type_t::INVALID_STRING_LITERAL, column, line); t = tokenise_literal_string(source, column, end); } else if (first == '\'') { lerr_t lerr; std::tie(t, lerr) = tokenise_literal_char(source, column, line); if (lerr.type != lerr_type_t::OK) return lerr; } else if (isdigit(first) || (source.size() > 1 && first == '-' && isdigit(source[1]))) { auto end = source.find_first_not_of(VALID_DIGIT, first == '-' ? 1 : 0); if (end == string::npos) end = source.size() - 1; else if (end != string::npos && !(isspace(source[end]))) return lerr_t(lerr_type_t::INVALID_NUMBER_LITERAL, column, line); t = tokenise_literal_number(source, column); } else if (first == '0' && source.size() > 2 && source[1] == 'x' && is_char_in_s(source[2], VALID_HEX)) { auto end = source.find_first_not_of(VALID_HEX); if (end == string::npos) end = source.size() - 1; else if (end != string::npos && !(isspace(source[end]))) return lerr_t(lerr_type_t::INVALID_NUMBER_LITERAL, column, line); t = tokenise_literal_hex(source, column); } else if (is_char_in_s(first, VALID_SYMBOL)) { lerr_t lerr; std::tie(t, lerr) = tokenise_symbol(source, column, line); if (lerr.type != lerr_type_t::OK) return lerr; } else { ++column; return lerr_t{lerr_type_t::UNKNOWN_LEXEME, column, line}; } if (is_token) { t.line = line; token_t *acc = new token_t(t); tokens.push_back(acc); } } return lerr_t{}; } std::ostream &operator<<(std::ostream &os, token_t &t) { return os << token_type_as_cstr(t.type) << "(`" << t.content << "`)@" << t.line << ", " << t.column; } token_t::token_t() {} token_t::token_t(token_type_t type, string content, size_t col, size_t line) : type{type}, column{col}, line{line}, content{content} {} const char *token_type_as_cstr(token_type_t type) { switch (type) { case token_type_t::PP_USE: return "PP_USE"; case token_type_t::PP_CONST: return "PP_CONST"; case token_type_t::PP_END: return "PP_END"; case token_type_t::PP_REFERENCE: return "PP_REFERENCE"; case token_type_t::GLOBAL: return "GLOBAL"; case token_type_t::STAR: return "STAR"; case token_type_t::LITERAL_STRING: return "LITERAL_STRING"; case token_type_t::LITERAL_NUMBER: return "LITERAL_NUMBER"; case token_type_t::LITERAL_CHAR: return "LITERAL_CHAR"; case token_type_t::NOOP: return "NOOP"; case token_type_t::HALT: return "HALT"; case token_type_t::PUSH: return "PUSH"; case token_type_t::POP: return "POP"; case token_type_t::PUSH_REG: return "PUSH_REG"; case token_type_t::MOV: return "MOV"; case token_type_t::DUP: return "DUP"; case token_type_t::MALLOC: return "MALLOC"; case token_type_t::MALLOC_STACK: return "MALLOC_STACK"; case token_type_t::MSET: return "MSET"; case token_type_t::MSET_STACK: return "MSET_STACK"; case token_type_t::MGET: return "MGET"; case token_type_t::MGET_STACK: return "MGET_STACK"; case token_type_t::MDELETE: return "MDELETE"; case token_type_t::MSIZE: return "MSIZE"; case token_type_t::NOT: return "NOT"; case token_type_t::OR: return "OR"; case token_type_t::AND: return "AND"; case token_type_t::XOR: return "XOR"; case token_type_t::EQ: return "EQ"; case token_type_t::LT: return "LT"; case token_type_t::LTE: return "LTE"; case token_type_t::GT: return "GT"; case token_type_t::GTE: return "GTE"; case token_type_t::PLUS: return "PLUS"; case token_type_t::SUB: return "SUB"; case token_type_t::MULT: return "MULT"; case token_type_t::PRINT: return "PRINT"; case token_type_t::JUMP_ABS: return "JUMP_ABS"; case token_type_t::JUMP_STACK: return "JUMP_STACK"; case token_type_t::JUMP_IF: return "JUMP_IF"; case token_type_t::CALL: return "CALL"; case token_type_t::CALL_STACK: return "CALL_STACK"; case token_type_t::RET: return "RET"; case token_type_t::SYMBOL: return "SYMBOL"; } return ""; } std::ostream &operator<<(std::ostream &os, lerr_t &lerr) { os << lerr.line << ":" << lerr.col << ": "; switch (lerr.type) { case lerr_type_t::OK: os << "OK"; break; case lerr_type_t::INVALID_CHAR_LITERAL: os << "INVALID_CHAR_LITERAL"; break; case lerr_type_t::INVALID_CHAR_LITERAL_ESCAPE_SEQUENCE: os << "INVALID_CHAR_LITERAL_ESCAPE_SEQUENCE"; break; case lerr_type_t::INVALID_STRING_LITERAL: os << "INVALID_STRING_LITERAL"; break; case lerr_type_t::INVALID_NUMBER_LITERAL: os << "INVALID_NUMBER_LITERAL"; break; case lerr_type_t::INVALID_PREPROCESSOR_DIRECTIVE: os << "INVALID_PREPROCESSOR_DIRECTIVE"; break; case lerr_type_t::UNKNOWN_LEXEME: os << "UNKNOWN_LEXEME"; break; default: break; } return os; } lerr_t::lerr_t(lerr_type_t type, size_t col, size_t line) : col{col}, line{line}, type{type} {}