From 252c6e7b2d93d7c04bffeb4b41f4a2554829670e Mon Sep 17 00:00:00 2001 From: Daniel Kroening Date: Wed, 6 May 2026 17:00:57 -0700 Subject: [PATCH] SMT2 tokenizer now has full tokent class The SMT2 tokenizer now returns a full token class, including the token text and the line number of the token. This enables storing streams of SMT2 tokens. --- src/solvers/smt2/smt2_parser.cpp | 246 ++++++++++++++++----------- src/solvers/smt2/smt2_solver.cpp | 9 +- src/solvers/smt2/smt2_tokenizer.cpp | 111 +++++++----- src/solvers/smt2/smt2_tokenizer.h | 58 ++++--- src/solvers/smt2/smt2irep.cpp | 8 +- unit/Makefile | 1 + unit/solvers/smt2/smt2_tokenizer.cpp | 194 +++++++++++++++++++++ 7 files changed, 451 insertions(+), 176 deletions(-) create mode 100644 unit/solvers/smt2/smt2_tokenizer.cpp diff --git a/src/solvers/smt2/smt2_parser.cpp b/src/solvers/smt2/smt2_parser.cpp index f233905dd20..2274dd5c04e 100644 --- a/src/solvers/smt2/smt2_parser.cpp +++ b/src/solvers/smt2/smt2_parser.cpp @@ -56,13 +56,14 @@ void smt2_parsert::command_sequence() if(next_token() != smt2_tokenizert::OPEN) throw error("command must start with '('"); - if(next_token() != smt2_tokenizert::SYMBOL) + auto cmd_token = next_token(); + if(cmd_token != smt2_tokenizert::SYMBOL) { ignore_command(); throw error("expected symbol as command"); } - command(smt2_tokenizer.get_buffer()); + command(cmd_token.text); switch(next_token()) { @@ -156,10 +157,11 @@ exprt smt2_parsert::let_expression() { next_token(); - if(next_token() != smt2_tokenizert::SYMBOL) + auto binding_token = next_token(); + if(binding_token != smt2_tokenizert::SYMBOL) throw error("expected symbol in binding"); - irep_idt identifier = smt2_tokenizer.get_buffer(); + irep_idt identifier = binding_token.text; // note that the previous bindings are _not_ visible yet exprt value=expression(); @@ -226,10 +228,11 @@ std::pair smt2_parsert::binding(irep_idt id) { next_token(); - if(next_token() != smt2_tokenizert::SYMBOL) + auto binding_token = next_token(); + if(binding_token != smt2_tokenizert::SYMBOL) throw error("expected symbol in binding"); - irep_idt identifier = smt2_tokenizer.get_buffer(); + irep_idt identifier = binding_token.text; typet type=sort(); @@ -487,27 +490,29 @@ exprt smt2_parsert::function_application_fp(const exprt::operandst &op) exprt smt2_parsert::function_application() { - switch(next_token()) + auto token = next_token(); + + switch(token) { case smt2_tokenizert::SYMBOL: - if(smt2_tokenizer.get_buffer() == "_") // indexed identifier + if(token.text == "_") // indexed identifier { // indexed identifier - if(next_token() != smt2_tokenizert::SYMBOL) + auto id_token = next_token(); + if(id_token != smt2_tokenizert::SYMBOL) throw error("expected symbol after '_'"); - // copy, the reference won't be stable - const auto id = smt2_tokenizer.get_buffer(); + const auto id = id_token.text; if(has_prefix(id, "bv")) { - mp_integer i = string2integer( - std::string(smt2_tokenizer.get_buffer(), 2, std::string::npos)); + mp_integer i = string2integer(std::string(id, 2, std::string::npos)); - if(next_token() != smt2_tokenizert::NUMERAL) + auto width_token = next_token(); + if(width_token != smt2_tokenizert::NUMERAL) throw error("expected numeral as bitvector literal width"); - auto width = std::stoll(smt2_tokenizer.get_buffer()); + auto width = std::stoll(width_token.text); if(next_token() != smt2_tokenizert::CLOSE) throw error("expected ')' after bitvector literal"); @@ -518,15 +523,17 @@ exprt smt2_parsert::function_application() { // These are the "plus infinity", "minus infinity" and NaN // floating-point literals. - if(next_token() != smt2_tokenizert::NUMERAL) + auto e_token = next_token(); + if(e_token != smt2_tokenizert::NUMERAL) throw error() << "expected number after " << id; - auto width_e = std::stoll(smt2_tokenizer.get_buffer()); + auto width_e = std::stoll(e_token.text); - if(next_token() != smt2_tokenizert::NUMERAL) + auto f_token = next_token(); + if(f_token != smt2_tokenizert::NUMERAL) throw error() << "expected second number after " << id; - auto width_f = std::stoll(smt2_tokenizer.get_buffer()); + auto width_f = std::stoll(f_token.text); if(next_token() != smt2_tokenizert::CLOSE) throw error() << "expected ')' after " << id; @@ -546,24 +553,24 @@ exprt smt2_parsert::function_application() throw error() << "unknown indexed identifier " << id; } } - else if(smt2_tokenizer.get_buffer() == "!") + else if(token.text == "!") { // these are "term attributes" const auto term = expression(); while(smt2_tokenizer.peek() == smt2_tokenizert::KEYWORD) { - next_token(); // eat the keyword - if(smt2_tokenizer.get_buffer() == "named") + auto kw_token = next_token(); // eat the keyword + if(kw_token.text == "named") { // 'named terms' must be Boolean if(!term.is_boolean()) throw error("named terms must be Boolean"); - if(next_token() == smt2_tokenizert::SYMBOL) + auto name_token = next_token(); + if(name_token == smt2_tokenizert::SYMBOL) { - const symbol_exprt symbol_expr( - smt2_tokenizer.get_buffer(), bool_typet()); + const symbol_exprt symbol_expr(name_token.text, bool_typet()); named_terms.emplace( symbol_expr.identifier(), named_termt(term, symbol_expr)); } @@ -582,7 +589,7 @@ exprt smt2_parsert::function_application() else { // non-indexed symbol, look up in expression table - const auto id = smt2_tokenizer.get_buffer(); + const auto &id = token.text; const auto e_it = expressions.find(id); if(e_it != expressions.end()) return e_it->second(); @@ -609,26 +616,29 @@ exprt smt2_parsert::function_application() case smt2_tokenizert::OPEN: // likely indexed identifier if(smt2_tokenizer.peek() == smt2_tokenizert::SYMBOL) { - next_token(); // eat symbol - if(smt2_tokenizer.get_buffer() == "_") + auto sym_token = next_token(); // eat symbol + if(sym_token.text == "_") { // indexed identifier - if(next_token() != smt2_tokenizert::SYMBOL) + auto id_token = next_token(); + if(id_token != smt2_tokenizert::SYMBOL) throw error("expected symbol after '_'"); - irep_idt id = smt2_tokenizer.get_buffer(); // hash it + irep_idt id = id_token.text; // hash it if(id=="extract") { - if(next_token() != smt2_tokenizert::NUMERAL) + auto upper_token = next_token(); + if(upper_token != smt2_tokenizert::NUMERAL) throw error("expected numeral after extract"); - auto upper = std::stoll(smt2_tokenizer.get_buffer()); + auto upper = std::stoll(upper_token.text); - if(next_token() != smt2_tokenizert::NUMERAL) + auto lower_token = next_token(); + if(lower_token != smt2_tokenizert::NUMERAL) throw error("expected two numerals after extract"); - auto lower = std::stoll(smt2_tokenizer.get_buffer()); + auto lower = std::stoll(lower_token.text); if(next_token() != smt2_tokenizert::CLOSE) throw error("expected ')' after extract"); @@ -653,10 +663,11 @@ exprt smt2_parsert::function_application() id=="sign_extend" || id=="zero_extend") { - if(next_token() != smt2_tokenizert::NUMERAL) + auto index_token = next_token(); + if(index_token != smt2_tokenizert::NUMERAL) throw error() << "expected numeral after " << id; - auto index = string2integer(smt2_tokenizer.get_buffer()); + auto index = string2integer(index_token.text); if(next_token() != smt2_tokenizert::CLOSE) throw error() << "expected ')' after " << id << " index"; @@ -708,15 +719,17 @@ exprt smt2_parsert::function_application() } else if(id == "to_fp") { - if(next_token() != smt2_tokenizert::NUMERAL) + auto e_token = next_token(); + if(e_token != smt2_tokenizert::NUMERAL) throw error("expected number after to_fp"); - auto width_e = std::stoll(smt2_tokenizer.get_buffer()); + auto width_e = std::stoll(e_token.text); - if(next_token() != smt2_tokenizert::NUMERAL) + auto f_token = next_token(); + if(f_token != smt2_tokenizert::NUMERAL) throw error("expected second number after to_fp"); - auto width_f = std::stoll(smt2_tokenizer.get_buffer()); + auto width_f = std::stoll(f_token.text); if(next_token() != smt2_tokenizert::CLOSE) throw error("expected ')' after to_fp"); @@ -801,15 +814,17 @@ exprt smt2_parsert::function_application() } else if(id == "to_fp_unsigned") { - if(next_token() != smt2_tokenizert::NUMERAL) + auto e_token = next_token(); + if(e_token != smt2_tokenizert::NUMERAL) throw error("expected number after to_fp_unsigned"); - auto width_e = std::stoll(smt2_tokenizer.get_buffer()); + auto width_e = std::stoll(e_token.text); - if(next_token() != smt2_tokenizert::NUMERAL) + auto f_token = next_token(); + if(f_token != smt2_tokenizert::NUMERAL) throw error("expected second number after to_fp_unsigned"); - auto width_f = std::stoll(smt2_tokenizer.get_buffer()); + auto width_f = std::stoll(f_token.text); if(next_token() != smt2_tokenizert::CLOSE) throw error("expected ')' after to_fp_unsigned"); @@ -838,10 +853,11 @@ exprt smt2_parsert::function_application() else if(id == "fp.to_sbv" || id == "fp.to_ubv") { // These are indexed by the number of bits of the result. - if(next_token() != smt2_tokenizert::NUMERAL) + auto width_token = next_token(); + if(width_token != smt2_tokenizert::NUMERAL) throw error() << "expected number after " << id; - auto width = std::stoll(smt2_tokenizer.get_buffer()); + auto width = std::stoll(width_token.text); if(next_token() != smt2_tokenizert::CLOSE) throw error() << "expected ')' after " << id; @@ -864,16 +880,15 @@ exprt smt2_parsert::function_application() } else { - throw error() << "unknown indexed identifier '" - << smt2_tokenizer.get_buffer() << '\''; + throw error() << "unknown indexed identifier '" << id << '\''; } } - else if(smt2_tokenizer.get_buffer() == "as") + else if(sym_token.text == "as") { // This is an extension understood by Z3 and CVC4. if( smt2_tokenizer.peek() == smt2_tokenizert::SYMBOL && - smt2_tokenizer.get_buffer() == "const") + smt2_tokenizer.peek().text == "const") { next_token(); // eat the "const" auto sort = this->sort(); @@ -886,7 +901,7 @@ exprt smt2_parsert::function_application() const auto &array_sort = to_array_type(sort); - if(smt2_tokenizer.next_token() != smt2_tokenizert::CLOSE) + if(next_token() != smt2_tokenizert::CLOSE) throw error() << "expecting ')' after sort in 'as const'"; auto value = expression(); @@ -894,7 +909,7 @@ exprt smt2_parsert::function_application() if(value.type() != array_sort.element_type()) throw error() << "unexpected 'as const' with wrong element type"; - if(smt2_tokenizer.next_token() != smt2_tokenizert::CLOSE) + if(next_token() != smt2_tokenizert::CLOSE) throw error() << "expecting ')' at the end of 'as const'"; return array_of_exprt(value, array_sort); @@ -1010,34 +1025,36 @@ exprt smt2_parsert::bv_mod(const exprt::operandst &operands, bool is_signed) exprt smt2_parsert::expression() { - switch(next_token()) + auto token = next_token(); + + switch(token) { case smt2_tokenizert::SYMBOL: - { - const auto &identifier = smt2_tokenizer.get_buffer(); - - // in the expression table? - const auto e_it = expressions.find(identifier); - if(e_it != expressions.end()) - return e_it->second(); + { + const auto &identifier = token.text; - // rummage through id_map - auto id_it = id_map.find(identifier); - if(id_it != id_map.end()) - { - symbol_exprt symbol_expr(identifier, id_it->second.type); - if(smt2_tokenizer.token_is_quoted_symbol()) - symbol_expr.set(ID_C_quoted, true); - return std::move(symbol_expr); - } + // in the expression table? + const auto e_it = expressions.find(identifier); + if(e_it != expressions.end()) + return e_it->second(); - // don't know, give up - throw error() << "unknown expression '" << identifier << '\''; + // rummage through id_map + auto id_it = id_map.find(identifier); + if(id_it != id_map.end()) + { + symbol_exprt symbol_expr(identifier, id_it->second.type); + if(token.quoted_symbol) + symbol_expr.set(ID_C_quoted, true); + return std::move(symbol_expr); } + // don't know, give up + throw error() << "unknown expression '" << identifier << '\''; + } + case smt2_tokenizert::NUMERAL: { - const std::string &buffer = smt2_tokenizer.get_buffer(); + const std::string &buffer = token.text; if(buffer.size() >= 2 && buffer[0] == '#' && buffer[1] == 'x') { mp_integer value = @@ -1491,36 +1508,45 @@ typet smt2_parsert::sort() // SYMBOL // ( _ SYMBOL ... // ( SYMBOL ... - switch(next_token()) + auto sort_token = next_token(); + + switch(sort_token) { case smt2_tokenizert::SYMBOL: break; case smt2_tokenizert::OPEN: - if(smt2_tokenizer.next_token() != smt2_tokenizert::SYMBOL) + { + auto inner_token = next_token(); + if(inner_token != smt2_tokenizert::SYMBOL) throw error("expected symbol after '(' in a sort "); - if(smt2_tokenizer.get_buffer() == "_") + if(inner_token.text == "_") { - if(next_token() != smt2_tokenizert::SYMBOL) + sort_token = next_token(); + if(sort_token != smt2_tokenizert::SYMBOL) throw error("expected symbol after '_' in a sort"); } + else + { + sort_token = std::move(inner_token); + } break; + } case smt2_tokenizert::CLOSE: case smt2_tokenizert::NUMERAL: case smt2_tokenizert::STRING_LITERAL: case smt2_tokenizert::NONE: case smt2_tokenizert::KEYWORD: - throw error() << "unexpected token in a sort: '" - << smt2_tokenizer.get_buffer() << '\''; + throw error() << "unexpected token in a sort: '" << sort_token.text << '\''; case smt2_tokenizert::END_OF_FILE: throw error() << "unexpected end-of-file in a sort"; } // now we have a SYMBOL - const auto &token = smt2_tokenizer.get_buffer(); + const auto &token = sort_token.text; const auto s_it = sorts.find(token); @@ -1549,11 +1575,13 @@ void smt2_parsert::setup_sorts() return ieee_float_spect::quadruple_precision().to_type(); }; - sorts["BitVec"] = [this] { - if(next_token() != smt2_tokenizert::NUMERAL) + sorts["BitVec"] = [this] + { + auto width_token = next_token(); + if(width_token != smt2_tokenizert::NUMERAL) throw error("expected numeral as bit-width"); - auto width = std::stoll(smt2_tokenizer.get_buffer()); + auto width = std::stoll(width_token.text); // eat the ')' if(next_token() != smt2_tokenizert::CLOSE) @@ -1562,16 +1590,19 @@ void smt2_parsert::setup_sorts() return unsignedbv_typet(width); }; - sorts["FloatingPoint"] = [this] { - if(next_token() != smt2_tokenizert::NUMERAL) + sorts["FloatingPoint"] = [this] + { + auto e_token = next_token(); + if(e_token != smt2_tokenizert::NUMERAL) throw error("expected numeral as bit-width"); - const auto width_e = std::stoll(smt2_tokenizer.get_buffer()); + const auto width_e = std::stoll(e_token.text); - if(next_token() != smt2_tokenizert::NUMERAL) + auto f_token = next_token(); + if(f_token != smt2_tokenizert::NUMERAL) throw error("expected numeral as bit-width"); - const auto width_f = std::stoll(smt2_tokenizer.get_buffer()); + const auto width_f = std::stoll(f_token.text); // consume the ')' if(next_token() != smt2_tokenizert::CLOSE) @@ -1621,10 +1652,11 @@ smt2_parsert::function_signature_definition() if(next_token() != smt2_tokenizert::OPEN) throw error("expected '(' at beginning of parameter"); - if(next_token() != smt2_tokenizert::SYMBOL) + auto param_token = next_token(); + if(param_token != smt2_tokenizert::SYMBOL) throw error("expected symbol in parameter"); - irep_idt id = smt2_tokenizer.get_buffer(); + irep_idt id = param_token.text; domain.push_back(sort()); parameters.push_back(id); @@ -1677,13 +1709,13 @@ void smt2_parsert::command(const std::string &c) void smt2_parsert::setup_commands() { - commands["declare-const"] = [this]() { - const auto s = smt2_tokenizer.get_buffer(); - - if(next_token() != smt2_tokenizert::SYMBOL) - throw error() << "expected a symbol after " << s; + commands["declare-const"] = [this]() + { + auto id_token = next_token(); + if(id_token != smt2_tokenizert::SYMBOL) + throw error("expected a symbol after declare-const"); - irep_idt id = smt2_tokenizer.get_buffer(); + irep_idt id = id_token.text; auto type = sort(); add_unique_id(id, exprt(ID_nil, type)); @@ -1693,21 +1725,25 @@ void smt2_parsert::setup_commands() // accepted by Z3 and CVC4 commands["declare-var"] = commands["declare-const"]; - commands["declare-fun"] = [this]() { - if(next_token() != smt2_tokenizert::SYMBOL) + commands["declare-fun"] = [this]() + { + auto id_token = next_token(); + if(id_token != smt2_tokenizert::SYMBOL) throw error("expected a symbol after declare-fun"); - irep_idt id = smt2_tokenizer.get_buffer(); + irep_idt id = id_token.text; auto type = function_signature_declaration(); add_unique_id(id, exprt(ID_nil, type)); }; - commands["define-const"] = [this]() { - if(next_token() != smt2_tokenizert::SYMBOL) + commands["define-const"] = [this]() + { + auto id_token = next_token(); + if(id_token != smt2_tokenizert::SYMBOL) throw error("expected a symbol after define-const"); - const irep_idt id = smt2_tokenizer.get_buffer(); + const irep_idt id = id_token.text; const auto type = sort(); const auto value = expression(); @@ -1724,11 +1760,13 @@ void smt2_parsert::setup_commands() add_unique_id(id, value); }; - commands["define-fun"] = [this]() { - if(next_token() != smt2_tokenizert::SYMBOL) + commands["define-fun"] = [this]() + { + auto id_token = next_token(); + if(id_token != smt2_tokenizert::SYMBOL) throw error("expected a symbol after define-fun"); - const irep_idt id = smt2_tokenizer.get_buffer(); + const irep_idt id = id_token.text; const auto signature = function_signature_definition(); diff --git a/src/solvers/smt2/smt2_solver.cpp b/src/solvers/smt2/smt2_solver.cpp index 9d0652f7989..29aae16e2a6 100644 --- a/src/solvers/smt2/smt2_solver.cpp +++ b/src/solvers/smt2/smt2_solver.cpp @@ -260,12 +260,13 @@ void smt2_solvert::setup_commands() std::cout << ")\n"; }; - commands["echo"] = [this]() { - if(next_token() != smt2_tokenizert::STRING_LITERAL) + commands["echo"] = [this]() + { + auto str_token = next_token(); + if(str_token != smt2_tokenizert::STRING_LITERAL) throw error("expected string literal"); - std::cout << smt2_format(constant_exprt( - smt2_tokenizer.get_buffer(), string_typet())) + std::cout << smt2_format(constant_exprt(str_token.text, string_typet())) << '\n'; }; diff --git a/src/solvers/smt2/smt2_tokenizer.cpp b/src/solvers/smt2/smt2_tokenizer.cpp index 6272bb8d5a8..7c02269714a 100644 --- a/src/solvers/smt2/smt2_tokenizer.cpp +++ b/src/solvers/smt2/smt2_tokenizer.cpp @@ -27,30 +27,35 @@ smt2_tokenizert::tokent smt2_tokenizert::get_simple_symbol() // ~ ! @ $ % ^ & * _ - + = < > . ? / // that does not start with a digit and is not a reserved word. - buffer.clear(); + token.text.clear(); char ch; while(in->get(ch)) { if(is_smt2_simple_symbol_character(ch)) { - buffer+=ch; + token.text += ch; } else { in->unget(); // put back - quoted_symbol = false; - return SYMBOL; + token.quoted_symbol = false; + token.kind = SYMBOL; + return token; } } // eof -- this is ok here - if(buffer.empty()) - return END_OF_FILE; + if(token.text.empty()) + { + token.kind = END_OF_FILE; + return token; + } else { - quoted_symbol = false; - return SYMBOL; + token.quoted_symbol = false; + token.kind = SYMBOL; + return token; } } @@ -58,85 +63,94 @@ smt2_tokenizert::tokent smt2_tokenizert::get_decimal_numeral() { // we accept any sequence of digits and dots - buffer.clear(); + token.text.clear(); char ch; while(in->get(ch)) { if(isdigit(ch) || ch=='.') { - buffer+=ch; + token.text += ch; } else { in->unget(); // put back - return NUMERAL; + token.kind = NUMERAL; + return token; } } // eof -- this is ok here - if(buffer.empty()) - return END_OF_FILE; + if(token.text.empty()) + token.kind = END_OF_FILE; else - return NUMERAL; + token.kind = NUMERAL; + + return token; } smt2_tokenizert::tokent smt2_tokenizert::get_bin_numeral() { // we accept any sequence of '0' or '1' - buffer.clear(); - buffer+='#'; - buffer+='b'; + token.text.clear(); + token.text += '#'; + token.text += 'b'; char ch; while(in->get(ch)) { if(ch=='0' || ch=='1') { - buffer+=ch; + token.text += ch; } else { in->unget(); // put back - return NUMERAL; + token.kind = NUMERAL; + return token; } } // eof -- this is ok here - if(buffer.empty()) - return END_OF_FILE; + if(token.text.empty()) + token.kind = END_OF_FILE; else - return NUMERAL; + token.kind = NUMERAL; + + return token; } smt2_tokenizert::tokent smt2_tokenizert::get_hex_numeral() { // we accept any sequence of '0'-'9', 'a'-'f', 'A'-'F' - buffer.clear(); - buffer+='#'; - buffer+='x'; + token.text.clear(); + token.text += '#'; + token.text += 'x'; char ch; while(in->get(ch)) { if(isxdigit(ch)) { - buffer+=ch; + token.text += ch; } else { in->unget(); // put back - return NUMERAL; + token.kind = NUMERAL; + return token; } } // eof -- this is ok here - if(buffer.empty()) - return END_OF_FILE; + if(token.text.empty()) + token.kind = END_OF_FILE; else - return NUMERAL; + token.kind = NUMERAL; + + return token; } smt2_tokenizert::tokent smt2_tokenizert::get_quoted_symbol() @@ -146,21 +160,22 @@ smt2_tokenizert::tokent smt2_tokenizert::get_quoted_symbol() // character \, that starts and ends with | and does not otherwise // contain | - buffer.clear(); + token.text.clear(); char ch; while(in->get(ch)) { if(ch=='|') { - quoted_symbol = true; - return SYMBOL; // done + token.quoted_symbol = true; + token.kind = SYMBOL; // done + return token; } - buffer+=ch; + token.text += ch; if(ch=='\n') - line_no++; + token.line_no++; } // Hmpf. Eof before end of quoted symbol. This is an error. @@ -169,7 +184,7 @@ smt2_tokenizert::tokent smt2_tokenizert::get_quoted_symbol() smt2_tokenizert::tokent smt2_tokenizert::get_string_literal() { - buffer.clear(); + token.text.clear(); char ch; while(in->get(ch)) @@ -185,13 +200,17 @@ smt2_tokenizert::tokent smt2_tokenizert::get_string_literal() else { in->unget(); - return STRING_LITERAL; // done + token.kind = STRING_LITERAL; + return token; // done } } else - return STRING_LITERAL; // done + { + token.kind = STRING_LITERAL; + return token; // done + } } - buffer+=ch; + token.text += ch; } // Hmpf. Eof before end of string literal. This is an error. @@ -205,7 +224,7 @@ smt2_tokenizert::tokent smt2_tokenizert::next_token() else get_token_from_stream(); - return token; + return token.move_me(); } void smt2_tokenizert::get_token_from_stream() @@ -217,7 +236,7 @@ void smt2_tokenizert::get_token_from_stream() switch(ch) { case '\n': - line_no++; + token.line_no++; break; case ' ': @@ -233,7 +252,7 @@ void smt2_tokenizert::get_token_from_stream() { if(ch=='\n') { - line_no++; + token.line_no++; break; } } @@ -241,12 +260,12 @@ void smt2_tokenizert::get_token_from_stream() case '(': // produce sub-expression - token = OPEN; + token.kind = OPEN; return; case ')': // done with sub-expression - token = CLOSE; + token.kind = CLOSE; return; case '|': // quoted symbol @@ -261,7 +280,7 @@ void smt2_tokenizert::get_token_from_stream() token = get_simple_symbol(); if(token == SYMBOL) { - token = KEYWORD; + token.kind = KEYWORD; return; } else @@ -308,5 +327,5 @@ void smt2_tokenizert::get_token_from_stream() } } - token = END_OF_FILE; + token.kind = END_OF_FILE; } diff --git a/src/solvers/smt2/smt2_tokenizer.h b/src/solvers/smt2/smt2_tokenizer.h index a95fdcfb361..5790e2d3708 100644 --- a/src/solvers/smt2/smt2_tokenizer.h +++ b/src/solvers/smt2/smt2_tokenizer.h @@ -15,10 +15,10 @@ Author: Daniel Kroening, kroening@kroening.com class smt2_tokenizert { public: - explicit smt2_tokenizert(std::istream &_in) : peeked(false), token(NONE) + explicit smt2_tokenizert(std::istream &_in) : peeked(false) { in=&_in; - line_no=1; + token.line_no = 1; } class smt2_errort @@ -63,7 +63,7 @@ class smt2_tokenizert unsigned line_no; }; - using tokent = enum { + using token_kindt = enum { NONE, END_OF_FILE, STRING_LITERAL, @@ -74,9 +74,42 @@ class smt2_tokenizert CLOSE }; + class tokent + { + public: + tokent() = default; + explicit tokent(token_kindt _kind) : kind(_kind) + { + } + + token_kindt kind = NONE; + std::string text; + unsigned line_no = 0; + bool quoted_symbol = false; + + // conversion + operator token_kindt() const + { + return kind; + } + + // Return move-copy of this token, but preserve + // the line number. + tokent move_me() + { + tokent result; + result.kind = kind; + result.line_no = line_no; + result.text = std::move(text); + result.quoted_symbol = quoted_symbol; + kind = NONE; // the token is now destroyed + return result; + } + }; + tokent next_token(); - tokent peek() + const tokent &peek() { if(peeked) return token; @@ -88,33 +121,20 @@ class smt2_tokenizert } } - const std::string &get_buffer() const - { - return buffer; - } - - bool token_is_quoted_symbol() const - { - return quoted_symbol; - } - /// generate an error exception, pre-filled with a message smt2_errort error(const std::string &message) const { - return smt2_errort(message, line_no); + return smt2_errort(message, token.line_no); } /// generate an error exception smt2_errort error() const { - return smt2_errort(line_no); + return smt2_errort(token.line_no); } protected: std::istream *in; - unsigned line_no; - std::string buffer; - bool quoted_symbol = false; bool peeked; tokent token; diff --git a/src/solvers/smt2/smt2irep.cpp b/src/solvers/smt2/smt2irep.cpp index 9eeeb58921a..d8a5d6ec8c5 100644 --- a/src/solvers/smt2/smt2irep.cpp +++ b/src/solvers/smt2/smt2irep.cpp @@ -36,7 +36,9 @@ std::optional smt2irept::operator()() while(true) { - switch(next_token()) + auto token = next_token(); + + switch(token) { case END_OF_FILE: if(stack.empty()) @@ -48,9 +50,9 @@ std::optional smt2irept::operator()() case NUMERAL: case SYMBOL: if(stack.empty()) - return irept(buffer); // all done! + return irept{token.text}; // all done! else - stack.top().get_sub().push_back(irept(buffer)); + stack.top().get_sub().push_back(irept{token.text}); break; case OPEN: // '(' diff --git a/unit/Makefile b/unit/Makefile index fa46a11d30c..d26c3cfc10e 100644 --- a/unit/Makefile +++ b/unit/Makefile @@ -112,6 +112,7 @@ SRC += analyses/ai/ai.cpp \ solvers/sat/satcheck_minisat2.cpp \ solvers/smt2/smt2_conv.cpp \ solvers/smt2/smt2irep.cpp \ + solvers/smt2/smt2_tokenizer.cpp \ solvers/smt2_incremental/ast/smt_commands.cpp \ solvers/smt2_incremental/ast/smt_index.cpp \ solvers/smt2_incremental/ast/smt_responses.cpp \ diff --git a/unit/solvers/smt2/smt2_tokenizer.cpp b/unit/solvers/smt2/smt2_tokenizer.cpp new file mode 100644 index 00000000000..d0635722802 --- /dev/null +++ b/unit/solvers/smt2/smt2_tokenizer.cpp @@ -0,0 +1,194 @@ +// Author: Daniel Kroening + +/// \file +/// Unit tests for smt2_tokenizert + +#include +#include + +#include + +TEST_CASE("smt2 tokenizer end of file", "[core][solvers][smt2]") +{ + std::istringstream in{""}; + smt2_tokenizert tokenizer{in}; + auto tok = tokenizer.next_token(); + REQUIRE(tok.kind == smt2_tokenizert::END_OF_FILE); +} + +TEST_CASE("smt2 tokenizer open/close parentheses", "[core][solvers][smt2]") +{ + std::istringstream in{"()"}; + smt2_tokenizert tokenizer{in}; + auto tok1 = tokenizer.next_token(); + REQUIRE(tok1.kind == smt2_tokenizert::OPEN); + auto tok2 = tokenizer.next_token(); + REQUIRE(tok2.kind == smt2_tokenizert::CLOSE); + auto tok3 = tokenizer.next_token(); + REQUIRE(tok3.kind == smt2_tokenizert::END_OF_FILE); +} + +TEST_CASE("smt2 tokenizer simple symbol", "[core][solvers][smt2]") +{ + std::istringstream in{"abc"}; + smt2_tokenizert tokenizer{in}; + auto tok = tokenizer.next_token(); + REQUIRE(tok.kind == smt2_tokenizert::SYMBOL); + REQUIRE(tok.text == "abc"); + REQUIRE_FALSE(tok.quoted_symbol); +} + +TEST_CASE("smt2 tokenizer quoted symbol", "[core][solvers][smt2]") +{ + std::istringstream in{"|hello world|"}; + smt2_tokenizert tokenizer{in}; + auto tok = tokenizer.next_token(); + REQUIRE(tok.kind == smt2_tokenizert::SYMBOL); + REQUIRE(tok.text == "hello world"); + REQUIRE(tok.quoted_symbol); +} + +TEST_CASE("smt2 tokenizer string literal", "[core][solvers][smt2]") +{ + std::istringstream in{"\"hello\""}; + smt2_tokenizert tokenizer{in}; + auto tok = tokenizer.next_token(); + REQUIRE(tok.kind == smt2_tokenizert::STRING_LITERAL); + REQUIRE(tok.text == "hello"); +} + +TEST_CASE("smt2 tokenizer escaped quotes in string", "[core][solvers][smt2]") +{ + std::istringstream in{"\"say \"\"hi\"\"\""}; + smt2_tokenizert tokenizer{in}; + auto tok = tokenizer.next_token(); + REQUIRE(tok.kind == smt2_tokenizert::STRING_LITERAL); + REQUIRE(tok.text == "say \"hi\""); +} + +TEST_CASE("smt2 tokenizer decimal numeral", "[core][solvers][smt2]") +{ + std::istringstream in{"42"}; + smt2_tokenizert tokenizer{in}; + auto tok = tokenizer.next_token(); + REQUIRE(tok.kind == smt2_tokenizert::NUMERAL); + REQUIRE(tok.text == "42"); +} + +TEST_CASE("smt2 tokenizer binary numeral", "[core][solvers][smt2]") +{ + std::istringstream in{"#b1010"}; + smt2_tokenizert tokenizer{in}; + auto tok = tokenizer.next_token(); + REQUIRE(tok.kind == smt2_tokenizert::NUMERAL); + REQUIRE(tok.text == "#b1010"); +} + +TEST_CASE("smt2 tokenizer hex numeral", "[core][solvers][smt2]") +{ + std::istringstream in{"#xFF"}; + smt2_tokenizert tokenizer{in}; + auto tok = tokenizer.next_token(); + REQUIRE(tok.kind == smt2_tokenizert::NUMERAL); + REQUIRE(tok.text == "#xFF"); +} + +TEST_CASE("smt2 tokenizer keyword", "[core][solvers][smt2]") +{ + std::istringstream in{":status"}; + smt2_tokenizert tokenizer{in}; + auto tok = tokenizer.next_token(); + REQUIRE(tok.kind == smt2_tokenizert::KEYWORD); + REQUIRE(tok.text == "status"); +} + +TEST_CASE("smt2 tokenizer skips whitespace", "[core][solvers][smt2]") +{ + std::istringstream in{" \t\r\n abc"}; + smt2_tokenizert tokenizer{in}; + auto tok = tokenizer.next_token(); + REQUIRE(tok.kind == smt2_tokenizert::SYMBOL); + REQUIRE(tok.text == "abc"); +} + +TEST_CASE("smt2 tokenizer skips comments", "[core][solvers][smt2]") +{ + std::istringstream in{"; this is a comment\nabc"}; + smt2_tokenizert tokenizer{in}; + auto tok = tokenizer.next_token(); + REQUIRE(tok.kind == smt2_tokenizert::SYMBOL); + REQUIRE(tok.text == "abc"); +} + +TEST_CASE("smt2 tokenizer peek", "[core][solvers][smt2]") +{ + std::istringstream in{"abc def"}; + smt2_tokenizert tokenizer{in}; + const auto &peeked = tokenizer.peek(); + REQUIRE(peeked.kind == smt2_tokenizert::SYMBOL); + REQUIRE(peeked.text == "abc"); + // peek again returns same token + const auto &peeked2 = tokenizer.peek(); + REQUIRE(peeked2.text == "abc"); + // next_token consumes the peeked token + auto tok = tokenizer.next_token(); + REQUIRE(tok.text == "abc"); + auto tok2 = tokenizer.next_token(); + REQUIRE(tok2.text == "def"); +} + +TEST_CASE("smt2 tokenizer line number tracking", "[core][solvers][smt2]") +{ + std::istringstream in{"abc\ndef"}; + smt2_tokenizert tokenizer{in}; + auto tok1 = tokenizer.next_token(); + REQUIRE(tok1.line_no == 1); + auto tok2 = tokenizer.next_token(); + REQUIRE(tok2.line_no == 2); +} + +TEST_CASE( + "smt2 tokenizer error on EOF in quoted symbol", + "[core][solvers][smt2]") +{ + std::istringstream in{"|unterminated"}; + smt2_tokenizert tokenizer{in}; + REQUIRE_THROWS_AS(tokenizer.next_token(), smt2_tokenizert::smt2_errort); +} + +TEST_CASE( + "smt2 tokenizer error on EOF in string literal", + "[core][solvers][smt2]") +{ + std::istringstream in{"\"unterminated"}; + smt2_tokenizert tokenizer{in}; + REQUIRE_THROWS_AS(tokenizer.next_token(), smt2_tokenizert::smt2_errort); +} + +TEST_CASE("smt2 tokenizer error on unknown numeral", "[core][solvers][smt2]") +{ + std::istringstream in{"#z"}; + smt2_tokenizert tokenizer{in}; + REQUIRE_THROWS_AS(tokenizer.next_token(), smt2_tokenizert::smt2_errort); +} + +TEST_CASE("smt2 tokenizer error on illegal character", "[core][solvers][smt2]") +{ + std::istringstream in{"\\"}; + smt2_tokenizert tokenizer{in}; + REQUIRE_THROWS_AS(tokenizer.next_token(), smt2_tokenizert::smt2_errort); +} + +TEST_CASE("is_smt2_simple_symbol_character", "[core][solvers][smt2]") +{ + REQUIRE(is_smt2_simple_symbol_character('a')); + REQUIRE(is_smt2_simple_symbol_character('Z')); + REQUIRE(is_smt2_simple_symbol_character('0')); + REQUIRE(is_smt2_simple_symbol_character('_')); + REQUIRE(is_smt2_simple_symbol_character('+')); + REQUIRE(is_smt2_simple_symbol_character('~')); + REQUIRE_FALSE(is_smt2_simple_symbol_character(' ')); + REQUIRE_FALSE(is_smt2_simple_symbol_character('(')); + REQUIRE_FALSE(is_smt2_simple_symbol_character(')')); + REQUIRE_FALSE(is_smt2_simple_symbol_character('"')); +}