From baecdf7d1e4d404d30f80ae5b26f486ca833ae03 Mon Sep 17 00:00:00 2001
From: Raphael Bernardo
Date: Mon, 23 Mar 2026 04:37:13 -0300
Subject: [PATCH] fix: add missing pre-tokenizer type for GPT-2 BPE models

The GGUF conversion scripts do not write the `tokenizer.ggml.pre` metadata
key for BitNet models, causing llama.cpp to fall back to the default
pre-tokenizer. This produces degraded or incoherent output with the warning:

  "missing pre-tokenizer type, using: 'default'"
  "GENERATION QUALITY WILL BE DEGRADED!"

Root cause:
- convert-hf-to-gguf-bitnet.py: BitnetModel.set_vocab() calls
  _set_vocab_sentencepiece(), which hardcodes pre="default", instead of
  _set_vocab_gpt2(), which correctly detects and writes the pre-tokenizer
  type.
- convert-ms-to-gguf-bitnet.py: add_meta_vocab() writes the tokenizer model
  but never writes the pre-tokenizer type.

Fix:
- Change BitnetModel.set_vocab() to call _set_vocab_gpt2()
- Add add_token_pre_type("gpt-2") in add_meta_vocab() for GPT-2 models

Tested on Mac Mini M4 (ARM64) with BitNet-b1.58-2B-4T: the reconverted model
produces coherent output at ~41 tokens/sec via bitnet.cpp, matching the
quality seen through HuggingFace transformers.
---
 utils/convert-hf-to-gguf-bitnet.py | 2 +-
 utils/convert-ms-to-gguf-bitnet.py | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/utils/convert-hf-to-gguf-bitnet.py b/utils/convert-hf-to-gguf-bitnet.py
index 23e84384c..0a6b8c560 100644
--- a/utils/convert-hf-to-gguf-bitnet.py
+++ b/utils/convert-hf-to-gguf-bitnet.py
@@ -957,7 +957,7 @@ class BitnetModel(Model):
     model_arch = gguf.MODEL_ARCH.BITNET
 
     def set_vocab(self):
-        self._set_vocab_sentencepiece()
+        self._set_vocab_gpt2()
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
diff --git a/utils/convert-ms-to-gguf-bitnet.py b/utils/convert-ms-to-gguf-bitnet.py
index edf702788..1b9b8d901 100644
--- a/utils/convert-ms-to-gguf-bitnet.py
+++ b/utils/convert-ms-to-gguf-bitnet.py
@@ -1208,6 +1208,9 @@ def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list
     def add_meta_vocab(self, vocab: Vocab) -> None:
         # Ensure that tokenizer_model is added to the GGUF model
         self.gguf.add_tokenizer_model(vocab.tokenizer_model)
+        # Add pre-tokenizer type for BPE models (required for correct tokenization)
+        if vocab.tokenizer_model == "gpt2":
+            self.gguf.add_token_pre_type("gpt-2")
 
         # Extract model vocabulary for model conversion
         tokens, scores, toktypes = self.extract_vocabulary_from_model(vocab)
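
For verification, a minimal sketch of checking that a reconverted GGUF actually
carries the pre-tokenizer key. This assumes the `gguf` Python package's
GGUFReader API and a hypothetical output filename; the field-decoding details
may vary between gguf-py versions.

    from gguf import GGUFReader

    reader = GGUFReader("ggml-model-i2_s.gguf")  # hypothetical output path

    field = reader.fields.get("tokenizer.ggml.pre")
    if field is None:
        print("tokenizer.ggml.pre missing -- llama.cpp falls back to 'default'")
    else:
        # String fields keep their bytes in the part indexed by field.data[0].
        value = field.parts[field.data[0]].tobytes().decode("utf-8")
        print(f"pre-tokenizer type: {value}")  # expected: gpt-2 after this patch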