From 252c6e7b2d93d7c04bffeb4b41f4a2554829670e Mon Sep 17 00:00:00 2001
From: Daniel Kroening <daniel.kroening@magd.ox.ac.uk>
Date: Wed, 6 May 2026 17:00:57 -0700
Subject: [PATCH] SMT2 tokenizer now has full tokent class

The SMT2 tokenizer now returns a full token class, including the token text
and the line number of the token.

This enables storing streams of SMT2 tokens.
---
 src/solvers/smt2/smt2_parser.cpp     | 246 ++++++++++++++++-----------
 src/solvers/smt2/smt2_solver.cpp     |   9 +-
 src/solvers/smt2/smt2_tokenizer.cpp  | 111 +++++++-----
 src/solvers/smt2/smt2_tokenizer.h    |  58 ++++---
 src/solvers/smt2/smt2irep.cpp        |   8 +-
 unit/Makefile                        |   1 +
 unit/solvers/smt2/smt2_tokenizer.cpp | 194 +++++++++++++++++++++
 7 files changed, 451 insertions(+), 176 deletions(-)
 create mode 100644 unit/solvers/smt2/smt2_tokenizer.cpp

diff --git a/src/solvers/smt2/smt2_parser.cpp b/src/solvers/smt2/smt2_parser.cpp
index f233905dd20..2274dd5c04e 100644
--- a/src/solvers/smt2/smt2_parser.cpp
+++ b/src/solvers/smt2/smt2_parser.cpp
@@ -56,13 +56,14 @@ void smt2_parsert::command_sequence()
     if(next_token() != smt2_tokenizert::OPEN)
       throw error("command must start with '('");
 
-    if(next_token() != smt2_tokenizert::SYMBOL)
+    auto cmd_token = next_token();
+    if(cmd_token != smt2_tokenizert::SYMBOL)
     {
       ignore_command();
       throw error("expected symbol as command");
     }
 
-    command(smt2_tokenizer.get_buffer());
+    command(cmd_token.text);
 
     switch(next_token())
     {
@@ -156,10 +157,11 @@ exprt smt2_parsert::let_expression()
   {
     next_token();
 
-    if(next_token() != smt2_tokenizert::SYMBOL)
+    auto binding_token = next_token();
+    if(binding_token != smt2_tokenizert::SYMBOL)
       throw error("expected symbol in binding");
 
-    irep_idt identifier = smt2_tokenizer.get_buffer();
+    irep_idt identifier = binding_token.text;
 
     // note that the previous bindings are _not_ visible yet
     exprt value=expression();
@@ -226,10 +228,11 @@ std::pair<binding_exprt::variablest, exprt> smt2_parsert::binding(irep_idt id)
   {
     next_token();
 
-    if(next_token() != smt2_tokenizert::SYMBOL)
+    auto binding_token = next_token();
+    if(binding_token != smt2_tokenizert::SYMBOL)
       throw error("expected symbol in binding");
 
-    irep_idt identifier = smt2_tokenizer.get_buffer();
+    irep_idt identifier = binding_token.text;
 
     typet type=sort();
 
@@ -487,27 +490,29 @@ exprt smt2_parsert::function_application_fp(const exprt::operandst &op)
 
 exprt smt2_parsert::function_application()
 {
-  switch(next_token())
+  auto token = next_token();
+
+  switch(token)
   {
   case smt2_tokenizert::SYMBOL:
-    if(smt2_tokenizer.get_buffer() == "_") // indexed identifier
+    if(token.text == "_") // indexed identifier
     {
       // indexed identifier
-      if(next_token() != smt2_tokenizert::SYMBOL)
+      auto id_token = next_token();
+      if(id_token != smt2_tokenizert::SYMBOL)
         throw error("expected symbol after '_'");
 
-      // copy, the reference won't be stable
-      const auto id = smt2_tokenizer.get_buffer();
+      const auto id = id_token.text;
 
       if(has_prefix(id, "bv"))
       {
-        mp_integer i = string2integer(
-          std::string(smt2_tokenizer.get_buffer(), 2, std::string::npos));
+        mp_integer i = string2integer(std::string(id, 2, std::string::npos));
 
-        if(next_token() != smt2_tokenizert::NUMERAL)
+        auto width_token = next_token();
+        if(width_token != smt2_tokenizert::NUMERAL)
           throw error("expected numeral as bitvector literal width");
 
-        auto width = std::stoll(smt2_tokenizer.get_buffer());
+        auto width = std::stoll(width_token.text);
 
         if(next_token() != smt2_tokenizert::CLOSE)
           throw error("expected ')' after bitvector literal");
@@ -518,15 +523,17 @@ exprt smt2_parsert::function_application()
       {
         // These are the "plus infinity", "minus infinity" and NaN
         // floating-point literals.
-        if(next_token() != smt2_tokenizert::NUMERAL)
+        auto e_token = next_token();
+        if(e_token != smt2_tokenizert::NUMERAL)
           throw error() << "expected number after " << id;
 
-        auto width_e = std::stoll(smt2_tokenizer.get_buffer());
+        auto width_e = std::stoll(e_token.text);
 
-        if(next_token() != smt2_tokenizert::NUMERAL)
+        auto f_token = next_token();
+        if(f_token != smt2_tokenizert::NUMERAL)
           throw error() << "expected second number after " << id;
 
-        auto width_f = std::stoll(smt2_tokenizer.get_buffer());
+        auto width_f = std::stoll(f_token.text);
 
         if(next_token() != smt2_tokenizert::CLOSE)
           throw error() << "expected ')' after " << id;
@@ -546,24 +553,24 @@ exprt smt2_parsert::function_application()
         throw error() << "unknown indexed identifier " << id;
       }
     }
-    else if(smt2_tokenizer.get_buffer() == "!")
+    else if(token.text == "!")
     {
       // these are "term attributes"
       const auto term = expression();
 
       while(smt2_tokenizer.peek() == smt2_tokenizert::KEYWORD)
       {
-        next_token(); // eat the keyword
-        if(smt2_tokenizer.get_buffer() == "named")
+        auto kw_token = next_token(); // eat the keyword
+        if(kw_token.text == "named")
         {
           // 'named terms' must be Boolean
           if(!term.is_boolean())
             throw error("named terms must be Boolean");
 
-          if(next_token() == smt2_tokenizert::SYMBOL)
+          auto name_token = next_token();
+          if(name_token == smt2_tokenizert::SYMBOL)
           {
-            const symbol_exprt symbol_expr(
-              smt2_tokenizer.get_buffer(), bool_typet());
+            const symbol_exprt symbol_expr(name_token.text, bool_typet());
             named_terms.emplace(
               symbol_expr.identifier(), named_termt(term, symbol_expr));
           }
@@ -582,7 +589,7 @@ exprt smt2_parsert::function_application()
     else
     {
       // non-indexed symbol, look up in expression table
-      const auto id = smt2_tokenizer.get_buffer();
+      const auto &id = token.text;
       const auto e_it = expressions.find(id);
       if(e_it != expressions.end())
         return e_it->second();
@@ -609,26 +616,29 @@ exprt smt2_parsert::function_application()
   case smt2_tokenizert::OPEN: // likely indexed identifier
     if(smt2_tokenizer.peek() == smt2_tokenizert::SYMBOL)
     {
-      next_token(); // eat symbol
-      if(smt2_tokenizer.get_buffer() == "_")
+      auto sym_token = next_token(); // eat symbol
+      if(sym_token.text == "_")
       {
         // indexed identifier
-        if(next_token() != smt2_tokenizert::SYMBOL)
+        auto id_token = next_token();
+        if(id_token != smt2_tokenizert::SYMBOL)
           throw error("expected symbol after '_'");
 
-        irep_idt id = smt2_tokenizer.get_buffer(); // hash it
+        irep_idt id = id_token.text; // hash it
 
         if(id=="extract")
         {
-          if(next_token() != smt2_tokenizert::NUMERAL)
+          auto upper_token = next_token();
+          if(upper_token != smt2_tokenizert::NUMERAL)
             throw error("expected numeral after extract");
 
-          auto upper = std::stoll(smt2_tokenizer.get_buffer());
+          auto upper = std::stoll(upper_token.text);
 
-          if(next_token() != smt2_tokenizert::NUMERAL)
+          auto lower_token = next_token();
+          if(lower_token != smt2_tokenizert::NUMERAL)
             throw error("expected two numerals after extract");
 
-          auto lower = std::stoll(smt2_tokenizer.get_buffer());
+          auto lower = std::stoll(lower_token.text);
 
           if(next_token() != smt2_tokenizert::CLOSE)
             throw error("expected ')' after extract");
@@ -653,10 +663,11 @@ exprt smt2_parsert::function_application()
                 id=="sign_extend" ||
                 id=="zero_extend")
         {
-          if(next_token() != smt2_tokenizert::NUMERAL)
+          auto index_token = next_token();
+          if(index_token != smt2_tokenizert::NUMERAL)
             throw error() << "expected numeral after " << id;
 
-          auto index = string2integer(smt2_tokenizer.get_buffer());
+          auto index = string2integer(index_token.text);
 
           if(next_token() != smt2_tokenizert::CLOSE)
             throw error() << "expected ')' after " << id << " index";
@@ -708,15 +719,17 @@ exprt smt2_parsert::function_application()
         }
         else if(id == "to_fp")
         {
-          if(next_token() != smt2_tokenizert::NUMERAL)
+          auto e_token = next_token();
+          if(e_token != smt2_tokenizert::NUMERAL)
             throw error("expected number after to_fp");
 
-          auto width_e = std::stoll(smt2_tokenizer.get_buffer());
+          auto width_e = std::stoll(e_token.text);
 
-          if(next_token() != smt2_tokenizert::NUMERAL)
+          auto f_token = next_token();
+          if(f_token != smt2_tokenizert::NUMERAL)
             throw error("expected second number after to_fp");
 
-          auto width_f = std::stoll(smt2_tokenizer.get_buffer());
+          auto width_f = std::stoll(f_token.text);
 
           if(next_token() != smt2_tokenizert::CLOSE)
             throw error("expected ')' after to_fp");
@@ -801,15 +814,17 @@ exprt smt2_parsert::function_application()
         }
         else if(id == "to_fp_unsigned")
         {
-          if(next_token() != smt2_tokenizert::NUMERAL)
+          auto e_token = next_token();
+          if(e_token != smt2_tokenizert::NUMERAL)
             throw error("expected number after to_fp_unsigned");
 
-          auto width_e = std::stoll(smt2_tokenizer.get_buffer());
+          auto width_e = std::stoll(e_token.text);
 
-          if(next_token() != smt2_tokenizert::NUMERAL)
+          auto f_token = next_token();
+          if(f_token != smt2_tokenizert::NUMERAL)
             throw error("expected second number after to_fp_unsigned");
 
-          auto width_f = std::stoll(smt2_tokenizer.get_buffer());
+          auto width_f = std::stoll(f_token.text);
 
           if(next_token() != smt2_tokenizert::CLOSE)
             throw error("expected ')' after to_fp_unsigned");
@@ -838,10 +853,11 @@ exprt smt2_parsert::function_application()
         else if(id == "fp.to_sbv" || id == "fp.to_ubv")
         {
           // These are indexed by the number of bits of the result.
-          if(next_token() != smt2_tokenizert::NUMERAL)
+          auto width_token = next_token();
+          if(width_token != smt2_tokenizert::NUMERAL)
             throw error() << "expected number after " << id;
 
-          auto width = std::stoll(smt2_tokenizer.get_buffer());
+          auto width = std::stoll(width_token.text);
 
           if(next_token() != smt2_tokenizert::CLOSE)
             throw error() << "expected ')' after " << id;
@@ -864,16 +880,15 @@ exprt smt2_parsert::function_application()
         }
         else
         {
-          throw error() << "unknown indexed identifier '"
-                        << smt2_tokenizer.get_buffer() << '\'';
+          throw error() << "unknown indexed identifier '" << id << '\'';
         }
       }
-      else if(smt2_tokenizer.get_buffer() == "as")
+      else if(sym_token.text == "as")
       {
         // This is an extension understood by Z3 and CVC4.
         if(
           smt2_tokenizer.peek() == smt2_tokenizert::SYMBOL &&
-          smt2_tokenizer.get_buffer() == "const")
+          smt2_tokenizer.peek().text == "const")
         {
           next_token(); // eat the "const"
           auto sort = this->sort();
@@ -886,7 +901,7 @@ exprt smt2_parsert::function_application()
 
           const auto &array_sort = to_array_type(sort);
 
-          if(smt2_tokenizer.next_token() != smt2_tokenizert::CLOSE)
+          if(next_token() != smt2_tokenizert::CLOSE)
             throw error() << "expecting ')' after sort in 'as const'";
 
           auto value = expression();
@@ -894,7 +909,7 @@ exprt smt2_parsert::function_application()
           if(value.type() != array_sort.element_type())
             throw error() << "unexpected 'as const' with wrong element type";
 
-          if(smt2_tokenizer.next_token() != smt2_tokenizert::CLOSE)
+          if(next_token() != smt2_tokenizert::CLOSE)
             throw error() << "expecting ')' at the end of 'as const'";
 
           return array_of_exprt(value, array_sort);
@@ -1010,34 +1025,36 @@ exprt smt2_parsert::bv_mod(const exprt::operandst &operands, bool is_signed)
 
 exprt smt2_parsert::expression()
 {
-  switch(next_token())
+  auto token = next_token();
+
+  switch(token)
   {
   case smt2_tokenizert::SYMBOL:
-    {
-      const auto &identifier = smt2_tokenizer.get_buffer();
-
-      // in the expression table?
-      const auto e_it = expressions.find(identifier);
-      if(e_it != expressions.end())
-        return e_it->second();
+  {
+    const auto &identifier = token.text;
 
-      // rummage through id_map
-      auto id_it = id_map.find(identifier);
-      if(id_it != id_map.end())
-      {
-        symbol_exprt symbol_expr(identifier, id_it->second.type);
-        if(smt2_tokenizer.token_is_quoted_symbol())
-          symbol_expr.set(ID_C_quoted, true);
-        return std::move(symbol_expr);
-      }
+    // in the expression table?
+    const auto e_it = expressions.find(identifier);
+    if(e_it != expressions.end())
+      return e_it->second();
 
-      // don't know, give up
-      throw error() << "unknown expression '" << identifier << '\'';
+    // rummage through id_map
+    auto id_it = id_map.find(identifier);
+    if(id_it != id_map.end())
+    {
+      symbol_exprt symbol_expr(identifier, id_it->second.type);
+      if(token.quoted_symbol)
+        symbol_expr.set(ID_C_quoted, true);
+      return std::move(symbol_expr);
     }
 
+    // don't know, give up
+    throw error() << "unknown expression '" << identifier << '\'';
+  }
+
   case smt2_tokenizert::NUMERAL:
   {
-    const std::string &buffer = smt2_tokenizer.get_buffer();
+    const std::string &buffer = token.text;
     if(buffer.size() >= 2 && buffer[0] == '#' && buffer[1] == 'x')
     {
       mp_integer value =
@@ -1491,36 +1508,45 @@ typet smt2_parsert::sort()
   // SYMBOL
   // ( _ SYMBOL ...
   // ( SYMBOL ...
-  switch(next_token())
+  auto sort_token = next_token();
+
+  switch(sort_token)
   {
   case smt2_tokenizert::SYMBOL:
     break;
 
   case smt2_tokenizert::OPEN:
-    if(smt2_tokenizer.next_token() != smt2_tokenizert::SYMBOL)
+  {
+    auto inner_token = next_token();
+    if(inner_token != smt2_tokenizert::SYMBOL)
       throw error("expected symbol after '(' in a sort ");
 
-    if(smt2_tokenizer.get_buffer() == "_")
+    if(inner_token.text == "_")
     {
-      if(next_token() != smt2_tokenizert::SYMBOL)
+      sort_token = next_token();
+      if(sort_token != smt2_tokenizert::SYMBOL)
         throw error("expected symbol after '_' in a sort");
     }
+    else
+    {
+      sort_token = std::move(inner_token);
+    }
     break;
+  }
 
   case smt2_tokenizert::CLOSE:
   case smt2_tokenizert::NUMERAL:
   case smt2_tokenizert::STRING_LITERAL:
   case smt2_tokenizert::NONE:
   case smt2_tokenizert::KEYWORD:
-    throw error() << "unexpected token in a sort: '"
-                  << smt2_tokenizer.get_buffer() << '\'';
+    throw error() << "unexpected token in a sort: '" << sort_token.text << '\'';
 
   case smt2_tokenizert::END_OF_FILE:
     throw error() << "unexpected end-of-file in a sort";
   }
 
   // now we have a SYMBOL
-  const auto &token = smt2_tokenizer.get_buffer();
+  const auto &token = sort_token.text;
 
   const auto s_it = sorts.find(token);
 
@@ -1549,11 +1575,13 @@ void smt2_parsert::setup_sorts()
     return ieee_float_spect::quadruple_precision().to_type();
   };
 
-  sorts["BitVec"] = [this] {
-    if(next_token() != smt2_tokenizert::NUMERAL)
+  sorts["BitVec"] = [this]
+  {
+    auto width_token = next_token();
+    if(width_token != smt2_tokenizert::NUMERAL)
       throw error("expected numeral as bit-width");
 
-    auto width = std::stoll(smt2_tokenizer.get_buffer());
+    auto width = std::stoll(width_token.text);
 
     // eat the ')'
     if(next_token() != smt2_tokenizert::CLOSE)
@@ -1562,16 +1590,19 @@ void smt2_parsert::setup_sorts()
     return unsignedbv_typet(width);
   };
 
-  sorts["FloatingPoint"] = [this] {
-    if(next_token() != smt2_tokenizert::NUMERAL)
+  sorts["FloatingPoint"] = [this]
+  {
+    auto e_token = next_token();
+    if(e_token != smt2_tokenizert::NUMERAL)
       throw error("expected numeral as bit-width");
 
-    const auto width_e = std::stoll(smt2_tokenizer.get_buffer());
+    const auto width_e = std::stoll(e_token.text);
 
-    if(next_token() != smt2_tokenizert::NUMERAL)
+    auto f_token = next_token();
+    if(f_token != smt2_tokenizert::NUMERAL)
       throw error("expected numeral as bit-width");
 
-    const auto width_f = std::stoll(smt2_tokenizer.get_buffer());
+    const auto width_f = std::stoll(f_token.text);
 
     // consume the ')'
     if(next_token() != smt2_tokenizert::CLOSE)
@@ -1621,10 +1652,11 @@ smt2_parsert::function_signature_definition()
     if(next_token() != smt2_tokenizert::OPEN)
       throw error("expected '(' at beginning of parameter");
 
-    if(next_token() != smt2_tokenizert::SYMBOL)
+    auto param_token = next_token();
+    if(param_token != smt2_tokenizert::SYMBOL)
       throw error("expected symbol in parameter");
 
-    irep_idt id = smt2_tokenizer.get_buffer();
+    irep_idt id = param_token.text;
     domain.push_back(sort());
     parameters.push_back(id);
 
@@ -1677,13 +1709,13 @@ void smt2_parsert::command(const std::string &c)
 
 void smt2_parsert::setup_commands()
 {
-  commands["declare-const"] = [this]() {
-    const auto s = smt2_tokenizer.get_buffer();
-
-    if(next_token() != smt2_tokenizert::SYMBOL)
-      throw error() << "expected a symbol after " << s;
+  commands["declare-const"] = [this]()
+  {
+    auto id_token = next_token();
+    if(id_token != smt2_tokenizert::SYMBOL)
+      throw error("expected a symbol after declare-const");
 
-    irep_idt id = smt2_tokenizer.get_buffer();
+    irep_idt id = id_token.text;
     auto type = sort();
 
     add_unique_id(id, exprt(ID_nil, type));
@@ -1693,21 +1725,25 @@ void smt2_parsert::setup_commands()
   // accepted by Z3 and CVC4
   commands["declare-var"] = commands["declare-const"];
 
-  commands["declare-fun"] = [this]() {
-    if(next_token() != smt2_tokenizert::SYMBOL)
+  commands["declare-fun"] = [this]()
+  {
+    auto id_token = next_token();
+    if(id_token != smt2_tokenizert::SYMBOL)
       throw error("expected a symbol after declare-fun");
 
-    irep_idt id = smt2_tokenizer.get_buffer();
+    irep_idt id = id_token.text;
     auto type = function_signature_declaration();
 
     add_unique_id(id, exprt(ID_nil, type));
   };
 
-  commands["define-const"] = [this]() {
-    if(next_token() != smt2_tokenizert::SYMBOL)
+  commands["define-const"] = [this]()
+  {
+    auto id_token = next_token();
+    if(id_token != smt2_tokenizert::SYMBOL)
       throw error("expected a symbol after define-const");
 
-    const irep_idt id = smt2_tokenizer.get_buffer();
+    const irep_idt id = id_token.text;
 
     const auto type = sort();
     const auto value = expression();
@@ -1724,11 +1760,13 @@ void smt2_parsert::setup_commands()
     add_unique_id(id, value);
   };
 
-  commands["define-fun"] = [this]() {
-    if(next_token() != smt2_tokenizert::SYMBOL)
+  commands["define-fun"] = [this]()
+  {
+    auto id_token = next_token();
+    if(id_token != smt2_tokenizert::SYMBOL)
       throw error("expected a symbol after define-fun");
 
-    const irep_idt id = smt2_tokenizer.get_buffer();
+    const irep_idt id = id_token.text;
 
     const auto signature = function_signature_definition();
 
diff --git a/src/solvers/smt2/smt2_solver.cpp b/src/solvers/smt2/smt2_solver.cpp
index 9d0652f7989..29aae16e2a6 100644
--- a/src/solvers/smt2/smt2_solver.cpp
+++ b/src/solvers/smt2/smt2_solver.cpp
@@ -260,12 +260,13 @@ void smt2_solvert::setup_commands()
       std::cout << ")\n";
     };
 
-    commands["echo"] = [this]() {
-      if(next_token() != smt2_tokenizert::STRING_LITERAL)
+    commands["echo"] = [this]()
+    {
+      auto str_token = next_token();
+      if(str_token != smt2_tokenizert::STRING_LITERAL)
         throw error("expected string literal");
 
-      std::cout << smt2_format(constant_exprt(
-                     smt2_tokenizer.get_buffer(), string_typet()))
+      std::cout << smt2_format(constant_exprt(str_token.text, string_typet()))
                 << '\n';
     };
 
diff --git a/src/solvers/smt2/smt2_tokenizer.cpp b/src/solvers/smt2/smt2_tokenizer.cpp
index 6272bb8d5a8..7c02269714a 100644
--- a/src/solvers/smt2/smt2_tokenizer.cpp
+++ b/src/solvers/smt2/smt2_tokenizer.cpp
@@ -27,30 +27,35 @@ smt2_tokenizert::tokent smt2_tokenizert::get_simple_symbol()
   // ~ ! @ $ % ^ & * _ - + = < > . ? /
   // that does not start with a digit and is not a reserved word.
 
-  buffer.clear();
+  token.text.clear();
 
   char ch;
   while(in->get(ch))
   {
     if(is_smt2_simple_symbol_character(ch))
     {
-      buffer+=ch;
+      token.text += ch;
     }
     else
     {
       in->unget(); // put back
-      quoted_symbol = false;
-      return SYMBOL;
+      token.quoted_symbol = false;
+      token.kind = SYMBOL;
+      return token;
     }
   }
 
   // eof -- this is ok here
-  if(buffer.empty())
-    return END_OF_FILE;
+  if(token.text.empty())
+  {
+    token.kind = END_OF_FILE;
+    return token;
+  }
   else
   {
-    quoted_symbol = false;
-    return SYMBOL;
+    token.quoted_symbol = false;
+    token.kind = SYMBOL;
+    return token;
   }
 }
 
@@ -58,85 +63,94 @@ smt2_tokenizert::tokent smt2_tokenizert::get_decimal_numeral()
 {
   // we accept any sequence of digits and dots
 
-  buffer.clear();
+  token.text.clear();
 
   char ch;
   while(in->get(ch))
   {
     if(isdigit(ch) || ch=='.')
     {
-      buffer+=ch;
+      token.text += ch;
     }
     else
     {
       in->unget(); // put back
-      return NUMERAL;
+      token.kind = NUMERAL;
+      return token;
     }
   }
 
   // eof -- this is ok here
-  if(buffer.empty())
-    return END_OF_FILE;
+  if(token.text.empty())
+    token.kind = END_OF_FILE;
   else
-    return NUMERAL;
+    token.kind = NUMERAL;
+
+  return token;
 }
 
 smt2_tokenizert::tokent smt2_tokenizert::get_bin_numeral()
 {
   // we accept any sequence of '0' or '1'
 
-  buffer.clear();
-  buffer+='#';
-  buffer+='b';
+  token.text.clear();
+  token.text += '#';
+  token.text += 'b';
 
   char ch;
   while(in->get(ch))
   {
     if(ch=='0' || ch=='1')
     {
-      buffer+=ch;
+      token.text += ch;
     }
     else
     {
       in->unget(); // put back
-      return NUMERAL;
+      token.kind = NUMERAL;
+      return token;
     }
   }
 
   // eof -- this is ok here
-  if(buffer.empty())
-    return END_OF_FILE;
+  if(token.text.empty())
+    token.kind = END_OF_FILE;
   else
-    return NUMERAL;
+    token.kind = NUMERAL;
+
+  return token;
 }
 
 smt2_tokenizert::tokent smt2_tokenizert::get_hex_numeral()
 {
   // we accept any sequence of '0'-'9', 'a'-'f', 'A'-'F'
 
-  buffer.clear();
-  buffer+='#';
-  buffer+='x';
+  token.text.clear();
+  token.text += '#';
+  token.text += 'x';
 
   char ch;
   while(in->get(ch))
   {
     if(isxdigit(ch))
     {
-      buffer+=ch;
+      token.text += ch;
     }
     else
     {
       in->unget(); // put back
-      return NUMERAL;
+      token.kind = NUMERAL;
+      return token;
     }
   }
 
   // eof -- this is ok here
-  if(buffer.empty())
-    return END_OF_FILE;
+  if(token.text.empty())
+    token.kind = END_OF_FILE;
   else
-    return NUMERAL;
+    token.kind = NUMERAL;
+
+  return token;
 }
 
 smt2_tokenizert::tokent smt2_tokenizert::get_quoted_symbol()
@@ -146,21 +160,22 @@ smt2_tokenizert::tokent smt2_tokenizert::get_quoted_symbol()
   // character \, that starts and ends with | and does not otherwise
   // contain |
 
-  buffer.clear();
+  token.text.clear();
 
   char ch;
   while(in->get(ch))
   {
     if(ch=='|')
     {
-      quoted_symbol = true;
-      return SYMBOL; // done
+      token.quoted_symbol = true;
+      token.kind = SYMBOL; // done
+      return token;
     }
 
-    buffer+=ch;
+    token.text += ch;
 
     if(ch=='\n')
-      line_no++;
+      token.line_no++;
   }
 
   // Hmpf. Eof before end of quoted symbol. This is an error.
@@ -169,7 +184,7 @@ smt2_tokenizert::tokent smt2_tokenizert::get_quoted_symbol()
 
 smt2_tokenizert::tokent smt2_tokenizert::get_string_literal()
 {
-  buffer.clear();
+  token.text.clear();
 
   char ch;
   while(in->get(ch))
@@ -185,13 +200,17 @@ smt2_tokenizert::tokent smt2_tokenizert::get_string_literal()
         else
         {
           in->unget();
-          return STRING_LITERAL; // done
+          token.kind = STRING_LITERAL;
+          return token; // done
         }
       }
       else
-        return STRING_LITERAL; // done
+      {
+        token.kind = STRING_LITERAL;
+        return token; // done
+      }
     }
-    buffer+=ch;
+    token.text += ch;
   }
 
   // Hmpf. Eof before end of string literal. This is an error.
@@ -205,7 +224,7 @@ smt2_tokenizert::tokent smt2_tokenizert::next_token()
   else
     get_token_from_stream();
 
-  return token;
+  return token.move_me();
 }
 
 void smt2_tokenizert::get_token_from_stream()
@@ -217,7 +236,7 @@ void smt2_tokenizert::get_token_from_stream()
     switch(ch)
     {
     case '\n':
-      line_no++;
+      token.line_no++;
       break;
 
     case ' ':
@@ -233,7 +252,7 @@ void smt2_tokenizert::get_token_from_stream()
       {
         if(ch=='\n')
         {
-          line_no++;
+          token.line_no++;
           break;
         }
       }
@@ -241,12 +260,12 @@ void smt2_tokenizert::get_token_from_stream()
 
     case '(':
       // produce sub-expression
-      token = OPEN;
+      token.kind = OPEN;
       return;
 
     case ')':
       // done with sub-expression
-      token = CLOSE;
+      token.kind = CLOSE;
       return;
 
     case '|': // quoted symbol
@@ -261,7 +280,7 @@ void smt2_tokenizert::get_token_from_stream()
       token = get_simple_symbol();
       if(token == SYMBOL)
       {
-        token = KEYWORD;
+        token.kind = KEYWORD;
         return;
       }
       else
@@ -308,5 +327,5 @@ void smt2_tokenizert::get_token_from_stream()
     }
   }
 
-  token = END_OF_FILE;
+  token.kind = END_OF_FILE;
 }
diff --git a/src/solvers/smt2/smt2_tokenizer.h b/src/solvers/smt2/smt2_tokenizer.h
index a95fdcfb361..5790e2d3708 100644
--- a/src/solvers/smt2/smt2_tokenizer.h
+++ b/src/solvers/smt2/smt2_tokenizer.h
@@ -15,10 +15,10 @@ Author: Daniel Kroening, kroening@kroening.com
 class smt2_tokenizert
 {
 public:
-  explicit smt2_tokenizert(std::istream &_in) : peeked(false), token(NONE)
+  explicit smt2_tokenizert(std::istream &_in) : peeked(false)
   {
     in=&_in;
-    line_no=1;
+    token.line_no = 1;
   }
 
   class smt2_errort
@@ -63,7 +63,7 @@ class smt2_tokenizert
     unsigned line_no;
   };
 
-  using tokent = enum {
+  using token_kindt = enum {
     NONE,
     END_OF_FILE,
     STRING_LITERAL,
@@ -74,9 +74,42 @@ class smt2_tokenizert
     CLOSE
   };
 
+  class tokent
+  {
+  public:
+    tokent() = default;
+    explicit tokent(token_kindt _kind) : kind(_kind)
+    {
+    }
+
+    token_kindt kind = NONE;
+    std::string text;
+    unsigned line_no = 0;
+    bool quoted_symbol = false;
+
+    // conversion
+    operator token_kindt() const
+    {
+      return kind;
+    }
+
+    // Return move-copy of this token, but preserve
+    // the line number.
+    tokent move_me()
+    {
+      tokent result;
+      result.kind = kind;
+      result.line_no = line_no;
+      result.text = std::move(text);
+      result.quoted_symbol = quoted_symbol;
+      kind = NONE; // the token is now destroyed
+      return result;
+    }
+  };
+
   tokent next_token();
 
-  tokent peek()
+  const tokent &peek()
   {
     if(peeked)
       return token;
@@ -88,33 +121,20 @@ class smt2_tokenizert
     }
   }
 
-  const std::string &get_buffer() const
-  {
-    return buffer;
-  }
-
-  bool token_is_quoted_symbol() const
-  {
-    return quoted_symbol;
-  }
-
   /// generate an error exception, pre-filled with a message
   smt2_errort error(const std::string &message) const
   {
-    return smt2_errort(message, line_no);
+    return smt2_errort(message, token.line_no);
   }
 
   /// generate an error exception
   smt2_errort error() const
   {
-    return smt2_errort(line_no);
+    return smt2_errort(token.line_no);
   }
 
 protected:
   std::istream *in;
-  unsigned line_no;
-  std::string buffer;
-  bool quoted_symbol = false;
   bool peeked;
   tokent token;
 
diff --git a/src/solvers/smt2/smt2irep.cpp b/src/solvers/smt2/smt2irep.cpp
index 9eeeb58921a..d8a5d6ec8c5 100644
--- a/src/solvers/smt2/smt2irep.cpp
+++ b/src/solvers/smt2/smt2irep.cpp
@@ -36,7 +36,9 @@ std::optional<irept> smt2irept::operator()()
 
     while(true)
     {
-      switch(next_token())
+      auto token = next_token();
+
+      switch(token)
       {
       case END_OF_FILE:
         if(stack.empty())
@@ -48,9 +50,9 @@ std::optional<irept> smt2irept::operator()()
       case NUMERAL:
       case SYMBOL:
         if(stack.empty())
-          return irept(buffer); // all done!
+          return irept{token.text}; // all done!
         else
-          stack.top().get_sub().push_back(irept(buffer));
+          stack.top().get_sub().push_back(irept{token.text});
         break;
 
       case OPEN: // '('
diff --git a/unit/Makefile b/unit/Makefile
index fa46a11d30c..d26c3cfc10e 100644
--- a/unit/Makefile
+++ b/unit/Makefile
@@ -112,6 +112,7 @@ SRC += analyses/ai/ai.cpp \
        solvers/sat/satcheck_minisat2.cpp \
        solvers/smt2/smt2_conv.cpp \
        solvers/smt2/smt2irep.cpp \
+       solvers/smt2/smt2_tokenizer.cpp \
        solvers/smt2_incremental/ast/smt_commands.cpp \
        solvers/smt2_incremental/ast/smt_index.cpp \
        solvers/smt2_incremental/ast/smt_responses.cpp \
diff --git a/unit/solvers/smt2/smt2_tokenizer.cpp b/unit/solvers/smt2/smt2_tokenizer.cpp
new file mode 100644
index 00000000000..d0635722802
--- /dev/null
+++ b/unit/solvers/smt2/smt2_tokenizer.cpp
@@ -0,0 +1,194 @@
+// Author: Daniel Kroening
+
+/// \file
+/// Unit tests for smt2_tokenizert
+
+#include <solvers/smt2/smt2_tokenizer.h>
+#include <testing-utils/use_catch.h>
+
+#include <sstream>
+
+TEST_CASE("smt2 tokenizer end of file", "[core][solvers][smt2]")
+{
+  std::istringstream in{""};
+  smt2_tokenizert tokenizer{in};
+  auto tok = tokenizer.next_token();
+  REQUIRE(tok.kind == smt2_tokenizert::END_OF_FILE);
+}
+
+TEST_CASE("smt2 tokenizer open/close parentheses", "[core][solvers][smt2]")
+{
+  std::istringstream in{"()"};
+  smt2_tokenizert tokenizer{in};
+  auto tok1 = tokenizer.next_token();
+  REQUIRE(tok1.kind == smt2_tokenizert::OPEN);
+  auto tok2 = tokenizer.next_token();
+  REQUIRE(tok2.kind == smt2_tokenizert::CLOSE);
+  auto tok3 = tokenizer.next_token();
+  REQUIRE(tok3.kind == smt2_tokenizert::END_OF_FILE);
+}
+
+TEST_CASE("smt2 tokenizer simple symbol", "[core][solvers][smt2]")
+{
+  std::istringstream in{"abc"};
+  smt2_tokenizert tokenizer{in};
+  auto tok = tokenizer.next_token();
+  REQUIRE(tok.kind == smt2_tokenizert::SYMBOL);
+  REQUIRE(tok.text == "abc");
+  REQUIRE_FALSE(tok.quoted_symbol);
+}
+
+TEST_CASE("smt2 tokenizer quoted symbol", "[core][solvers][smt2]")
+{
+  std::istringstream in{"|hello world|"};
+  smt2_tokenizert tokenizer{in};
+  auto tok = tokenizer.next_token();
+  REQUIRE(tok.kind == smt2_tokenizert::SYMBOL);
+  REQUIRE(tok.text == "hello world");
+  REQUIRE(tok.quoted_symbol);
+}
+
+TEST_CASE("smt2 tokenizer string literal", "[core][solvers][smt2]")
+{
+  std::istringstream in{"\"hello\""};
+  smt2_tokenizert tokenizer{in};
+  auto tok = tokenizer.next_token();
+  REQUIRE(tok.kind == smt2_tokenizert::STRING_LITERAL);
+  REQUIRE(tok.text == "hello");
+}
+
+TEST_CASE("smt2 tokenizer escaped quotes in string", "[core][solvers][smt2]")
+{
+  std::istringstream in{"\"say \"\"hi\"\"\""};
+  smt2_tokenizert tokenizer{in};
+  auto tok = tokenizer.next_token();
+  REQUIRE(tok.kind == smt2_tokenizert::STRING_LITERAL);
+  REQUIRE(tok.text == "say \"hi\"");
+}
+
+TEST_CASE("smt2 tokenizer decimal numeral", "[core][solvers][smt2]")
+{
+  std::istringstream in{"42"};
+  smt2_tokenizert tokenizer{in};
+  auto tok = tokenizer.next_token();
+  REQUIRE(tok.kind == smt2_tokenizert::NUMERAL);
+  REQUIRE(tok.text == "42");
+}
+
+TEST_CASE("smt2 tokenizer binary numeral", "[core][solvers][smt2]")
+{
+  std::istringstream in{"#b1010"};
+  smt2_tokenizert tokenizer{in};
+  auto tok = tokenizer.next_token();
+  REQUIRE(tok.kind == smt2_tokenizert::NUMERAL);
+  REQUIRE(tok.text == "#b1010");
+}
+
+TEST_CASE("smt2 tokenizer hex numeral", "[core][solvers][smt2]")
+{
+  std::istringstream in{"#xFF"};
+  smt2_tokenizert tokenizer{in};
+  auto tok = tokenizer.next_token();
+  REQUIRE(tok.kind == smt2_tokenizert::NUMERAL);
+  REQUIRE(tok.text == "#xFF");
+}
+
+TEST_CASE("smt2 tokenizer keyword", "[core][solvers][smt2]")
+{
+  std::istringstream in{":status"};
+  smt2_tokenizert tokenizer{in};
+  auto tok = tokenizer.next_token();
+  REQUIRE(tok.kind == smt2_tokenizert::KEYWORD);
+  REQUIRE(tok.text == "status");
+}
+
+TEST_CASE("smt2 tokenizer skips whitespace", "[core][solvers][smt2]")
+{
+  std::istringstream in{"  \t\r\n abc"};
+  smt2_tokenizert tokenizer{in};
+  auto tok = tokenizer.next_token();
+  REQUIRE(tok.kind == smt2_tokenizert::SYMBOL);
+  REQUIRE(tok.text == "abc");
+}
+
+TEST_CASE("smt2 tokenizer skips comments", "[core][solvers][smt2]")
+{
+  std::istringstream in{"; this is a comment\nabc"};
+  smt2_tokenizert tokenizer{in};
+  auto tok = tokenizer.next_token();
+  REQUIRE(tok.kind == smt2_tokenizert::SYMBOL);
+  REQUIRE(tok.text == "abc");
+}
+
+TEST_CASE("smt2 tokenizer peek", "[core][solvers][smt2]")
+{
+  std::istringstream in{"abc def"};
+  smt2_tokenizert tokenizer{in};
+  const auto &peeked = tokenizer.peek();
+  REQUIRE(peeked.kind == smt2_tokenizert::SYMBOL);
+  REQUIRE(peeked.text == "abc");
+  // peek again returns same token
+  const auto &peeked2 = tokenizer.peek();
+  REQUIRE(peeked2.text == "abc");
+  // next_token consumes the peeked token
+  auto tok = tokenizer.next_token();
+  REQUIRE(tok.text == "abc");
+  auto tok2 = tokenizer.next_token();
+  REQUIRE(tok2.text == "def");
+}
+
+TEST_CASE("smt2 tokenizer line number tracking", "[core][solvers][smt2]")
+{
+  std::istringstream in{"abc\ndef"};
+  smt2_tokenizert tokenizer{in};
+  auto tok1 = tokenizer.next_token();
+  REQUIRE(tok1.line_no == 1);
+  auto tok2 = tokenizer.next_token();
+  REQUIRE(tok2.line_no == 2);
+}
+
+TEST_CASE(
+  "smt2 tokenizer error on EOF in quoted symbol",
+  "[core][solvers][smt2]")
+{
+  std::istringstream in{"|unterminated"};
+  smt2_tokenizert tokenizer{in};
+  REQUIRE_THROWS_AS(tokenizer.next_token(), smt2_tokenizert::smt2_errort);
+}
+
+TEST_CASE(
+  "smt2 tokenizer error on EOF in string literal",
+  "[core][solvers][smt2]")
+{
+  std::istringstream in{"\"unterminated"};
+  smt2_tokenizert tokenizer{in};
+  REQUIRE_THROWS_AS(tokenizer.next_token(), smt2_tokenizert::smt2_errort);
+}
+
+TEST_CASE("smt2 tokenizer error on unknown numeral", "[core][solvers][smt2]")
+{
+  std::istringstream in{"#z"};
+  smt2_tokenizert tokenizer{in};
+  REQUIRE_THROWS_AS(tokenizer.next_token(), smt2_tokenizert::smt2_errort);
+}
+
+TEST_CASE("smt2 tokenizer error on illegal character", "[core][solvers][smt2]")
+{
+  std::istringstream in{"\\"};
+  smt2_tokenizert tokenizer{in};
+  REQUIRE_THROWS_AS(tokenizer.next_token(), smt2_tokenizert::smt2_errort);
+}
+
+TEST_CASE("is_smt2_simple_symbol_character", "[core][solvers][smt2]")
+{
+  REQUIRE(is_smt2_simple_symbol_character('a'));
+  REQUIRE(is_smt2_simple_symbol_character('Z'));
+  REQUIRE(is_smt2_simple_symbol_character('0'));
+  REQUIRE(is_smt2_simple_symbol_character('_'));
+  REQUIRE(is_smt2_simple_symbol_character('+'));
+  REQUIRE(is_smt2_simple_symbol_character('~'));
+  REQUIRE_FALSE(is_smt2_simple_symbol_character(' '));
+  REQUIRE_FALSE(is_smt2_simple_symbol_character('('));
+  REQUIRE_FALSE(is_smt2_simple_symbol_character(')'));
+  REQUIRE_FALSE(is_smt2_simple_symbol_character('"'));
+}