From d519daf9b4647c76a061b436317340c260437bbb Mon Sep 17 00:00:00 2001
From: Richard Liu <2645345468@qq.com>
Date: Thu, 19 Mar 2026 23:35:44 +0800
Subject: [PATCH 1/5] feat: add mimo tts provider support
---
astrbot/core/config/default.py | 42 +++++
astrbot/core/provider/manager.py | 4 +
.../provider/sources/mimo_tts_api_source.py | 149 ++++++++++++++++++
.../src/composables/useProviderSources.ts | 1 +
.../en-US/features/config-metadata.json | 22 ++-
.../ru-RU/features/config-metadata.json | 22 ++-
.../zh-CN/features/config-metadata.json | 22 ++-
dashboard/src/utils/providerUtils.js | 1 +
8 files changed, 260 insertions(+), 3 deletions(-)
create mode 100644 astrbot/core/provider/sources/mimo_tts_api_source.py
diff --git a/astrbot/core/config/default.py b/astrbot/core/config/default.py
index 58c1726814..afba850193 100644
--- a/astrbot/core/config/default.py
+++ b/astrbot/core/config/default.py
@@ -1461,6 +1461,23 @@ class ChatProviderTemplate(TypedDict):
"timeout": "20",
"proxy": "",
},
+ "MiMo TTS(API)": {
+ "id": "mimo_tts",
+ "type": "mimo_tts_api",
+ "provider": "mimo",
+ "provider_type": "text_to_speech",
+ "enable": False,
+ "api_key": "",
+ "api_base": "https://api.xiaomimimo.com/v1",
+ "model": "mimo-v2-tts",
+ "mimo-tts-voice": "mimo_default",
+ "mimo-tts-format": "wav",
+ "mimo-tts-style-prompt": "",
+ "mimo-tts-dialect": "",
+ "mimo-tts-seed-text": "Hello, MiMo, have you had lunch?",
+ "timeout": "20",
+ "proxy": "",
+ },
"Genie TTS": {
"id": "genie_tts",
"provider": "genie_tts",
@@ -2314,6 +2331,31 @@ class ChatProviderTemplate(TypedDict):
"type": "string",
"hint": "OpenAI TTS 的声音。OpenAI 默认支持:'alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer'",
},
+ "mimo-tts-voice": {
+ "description": "音色",
+ "type": "string",
+ "hint": "MiMo TTS 的音色名称。默认值为 'mimo_default'。",
+ },
+ "mimo-tts-format": {
+ "description": "输出格式",
+ "type": "string",
+ "hint": "MiMo TTS 生成音频的格式,例如 'wav'。",
+ },
+ "mimo-tts-style-prompt": {
+ "description": "风格提示词",
+ "type": "string",
+ "hint": "用于控制生成语音的说话风格、语气或情绪,例如温柔、活泼、沉稳等。可留空。",
+ },
+ "mimo-tts-dialect": {
+ "description": "方言",
+ "type": "string",
+ "hint": "指定生成语音时使用的方言或口音,例如四川话、粤语口音等。可留空。",
+ },
+ "mimo-tts-seed-text": {
+ "description": "种子文本",
+ "type": "string",
+ "hint": "用于引导音色和说话方式的参考文本,会影响生成语音的表达风格。",
+ },
"fishaudio-tts-character": {
"description": "character",
"type": "string",
diff --git a/astrbot/core/provider/manager.py b/astrbot/core/provider/manager.py
index 0df9f791ae..62e1fcd80f 100644
--- a/astrbot/core/provider/manager.py
+++ b/astrbot/core/provider/manager.py
@@ -399,6 +399,10 @@ def dynamic_import_provider(self, type: str) -> None:
from .sources.openai_tts_api_source import (
ProviderOpenAITTSAPI as ProviderOpenAITTSAPI,
)
+ case "mimo_tts_api":
+ from .sources.mimo_tts_api_source import (
+ ProviderMiMoTTSAPI as ProviderMiMoTTSAPI,
+ )
case "genie_tts":
from .sources.genie_tts import (
GenieTTSProvider as GenieTTSProvider,
diff --git a/astrbot/core/provider/sources/mimo_tts_api_source.py b/astrbot/core/provider/sources/mimo_tts_api_source.py
new file mode 100644
index 0000000000..a9240203dc
--- /dev/null
+++ b/astrbot/core/provider/sources/mimo_tts_api_source.py
@@ -0,0 +1,149 @@
+import base64
+import uuid
+from pathlib import Path
+
+import httpx
+
+from ..entities import ProviderType
+from ..provider import TTSProvider
+from ..register import register_provider_adapter
+from astrbot import logger
+from astrbot.core.utils.astrbot_path import get_astrbot_temp_path
+
+
+def normalize_timeout(timeout: int | str | None) -> int | None:
+ if timeout in (None, ""):
+ return None
+ if isinstance(timeout, str):
+ return int(timeout)
+ return timeout
+
+
+def build_headers(api_key: str) -> dict[str, str]:
+ headers = {"Content-Type": "application/json"}
+ if api_key:
+ headers["api-key"] = api_key
+ headers["Authorization"] = f"Bearer {api_key}"
+ return headers
+
+
+def get_temp_dir() -> Path:
+ temp_dir = Path(get_astrbot_temp_path())
+ temp_dir.mkdir(parents=True, exist_ok=True)
+ return temp_dir
+
+
+def create_http_client(timeout: int | None, proxy: str) -> httpx.AsyncClient:
+ client_kwargs: dict[str, object] = {
+ "timeout": timeout,
+ "follow_redirects": True,
+ }
+ if proxy:
+ logger.info("[MiMo API] Using proxy: %s", proxy)
+ client_kwargs["proxy"] = proxy
+ return httpx.AsyncClient(**client_kwargs)
+
+
+def build_api_url(api_base: str) -> str:
+ normalized_api_base = api_base.rstrip("/")
+ if normalized_api_base.endswith("/chat/completions"):
+ return normalized_api_base
+ return normalized_api_base + "/chat/completions"
+
+
+@register_provider_adapter(
+ "mimo_tts_api",
+ "MiMo TTS API",
+ provider_type=ProviderType.TEXT_TO_SPEECH,
+)
+class ProviderMiMoTTSAPI(TTSProvider):
+ def __init__(
+ self,
+ provider_config: dict,
+ provider_settings: dict,
+ ) -> None:
+ super().__init__(provider_config, provider_settings)
+ self.chosen_api_key = provider_config.get("api_key", "")
+ self.api_base = provider_config.get(
+ "api_base",
+ "https://api.xiaomimimo.com/v1",
+ )
+ self.proxy = provider_config.get("proxy", "")
+ self.timeout = normalize_timeout(provider_config.get("timeout", 20))
+ self.voice = provider_config.get("mimo-tts-voice", "mimo_default")
+ self.audio_format = provider_config.get("mimo-tts-format", "wav")
+ self.style_prompt = provider_config.get("mimo-tts-style-prompt", "")
+ self.dialect = provider_config.get("mimo-tts-dialect", "")
+ self.seed_text = provider_config.get(
+ "mimo-tts-seed-text",
+ "Hello, MiMo, have you had lunch?",
+ )
+ self.set_model(provider_config.get("model", "mimo-v2-tts"))
+ self.client = create_http_client(self.timeout, self.proxy)
+
+ def _build_user_prompt(self) -> str:
+ prompt_parts: list[str] = []
+
+ if self.style_prompt.strip():
+ prompt_parts.append(self.style_prompt.strip())
+ if self.dialect.strip():
+ prompt_parts.append(f"Please use {self.dialect.strip()} when speaking.")
+
+ if not prompt_parts:
+ return self.seed_text
+
+ if self.seed_text.strip():
+ prompt_parts.append(self.seed_text.strip())
+
+ return " ".join(prompt_parts)
+
+ def _build_payload(self, text: str) -> dict:
+ return {
+ "model": self.model_name,
+ "messages": [
+ {
+ "role": "user",
+ "content": self._build_user_prompt(),
+ },
+ {
+ "role": "assistant",
+ "content": text,
+ },
+ ],
+ "audio": {
+ "format": self.audio_format,
+ "voice": self.voice,
+ },
+ }
+
+ async def get_audio(self, text: str) -> str:
+ response = await self.client.post(
+ build_api_url(self.api_base),
+ headers=build_headers(self.chosen_api_key),
+ json=self._build_payload(text),
+ )
+
+ try:
+ response.raise_for_status()
+ except Exception as exc:
+ error_text = response.text[:1024]
+ raise Exception(
+ f"MiMo TTS API request failed: HTTP {response.status_code}, response: {error_text}"
+ ) from exc
+
+ data = response.json()
+ audio_data = (
+ data.get("choices", [{}])[0].get("message", {}).get("audio", {}).get("data")
+ )
+ if not audio_data:
+ raise Exception(f"MiMo TTS API returned no audio payload: {data}")
+
+ output_path = (
+ get_temp_dir() / f"mimo_tts_api_{uuid.uuid4()}.{self.audio_format}"
+ )
+ output_path.write_bytes(base64.b64decode(audio_data))
+ return str(output_path)
+
+ async def terminate(self):
+ if self.client:
+ await self.client.aclose()
diff --git a/dashboard/src/composables/useProviderSources.ts b/dashboard/src/composables/useProviderSources.ts
index 53959ca3e8..7ff12190e3 100644
--- a/dashboard/src/composables/useProviderSources.ts
+++ b/dashboard/src/composables/useProviderSources.ts
@@ -325,6 +325,7 @@ export function useProviderSources(options: UseProviderSourcesOptions) {
openai_whisper_selfhost: 'speech_to_text',
sensevoice_stt_selfhost: 'speech_to_text',
openai_tts_api: 'text_to_speech',
+ mimo_tts_api: 'text_to_speech',
edge_tts: 'text_to_speech',
gsvi_tts_api: 'text_to_speech',
fishaudio_tts_api: 'text_to_speech',
diff --git a/dashboard/src/i18n/locales/en-US/features/config-metadata.json b/dashboard/src/i18n/locales/en-US/features/config-metadata.json
index a51dfefbe4..005559a5a4 100644
--- a/dashboard/src/i18n/locales/en-US/features/config-metadata.json
+++ b/dashboard/src/i18n/locales/en-US/features/config-metadata.json
@@ -1391,6 +1391,26 @@
"description": "voice",
"hint": "OpenAI TTS voice. OpenAI defaults: 'alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer'."
},
+ "mimo-tts-voice": {
+ "description": "Voice",
+ "hint": "MiMo TTS voice name. Default is 'mimo_default'."
+ },
+ "mimo-tts-format": {
+ "description": "Output format",
+ "hint": "Audio format generated by MiMo TTS, for example 'wav'."
+ },
+ "mimo-tts-style-prompt": {
+ "description": "Style prompt",
+ "hint": "Guides speaking style, tone, or emotion such as gentle, lively, or calm. Optional."
+ },
+ "mimo-tts-dialect": {
+ "description": "Dialect",
+ "hint": "Target dialect or accent for generated speech, such as Sichuan dialect. Optional."
+ },
+ "mimo-tts-seed-text": {
+ "description": "Seed text",
+ "hint": "Reference text used to guide voice characteristics and speaking style."
+ },
"fishaudio-tts-character": {
"description": "character",
"hint": "Fishaudio TTS character. Default is Klee. More roles: https://fish.audio/zh-CN/discovery"
@@ -1518,4 +1538,4 @@
"helpMiddle": "or",
"helpSuffix": "."
}
-}
\ No newline at end of file
+}
diff --git a/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json b/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json
index 56d12c9838..cf0e97acd8 100644
--- a/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json
+++ b/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json
@@ -1396,6 +1396,26 @@
"description": "API Base URL",
"hint": "Голоса OpenAI TTS: alloy, echo и др."
},
+ "mimo-tts-voice": {
+ "description": "Голос",
+ "hint": "Имя голоса MiMo TTS. По умолчанию 'mimo_default'."
+ },
+ "mimo-tts-format": {
+ "description": "Формат вывода",
+ "hint": "Формат аудио, создаваемого MiMo TTS, например 'wav'."
+ },
+ "mimo-tts-style-prompt": {
+ "description": "Подсказка стиля",
+ "hint": "Задает стиль речи, тон или эмоцию, например мягкий, живой или спокойный. Необязательно."
+ },
+ "mimo-tts-dialect": {
+ "description": "Диалект",
+ "hint": "Диалект или акцент для синтезируемой речи, например сычуаньский диалект. Необязательно."
+ },
+ "mimo-tts-seed-text": {
+ "description": "Начальный текст",
+ "hint": "Эталонный текст, который помогает задать особенности голоса и манеру речи."
+ },
"fishaudio-tts-character": {
"description": "Персонаж",
"hint": "Персонаж Fishaudio. По умолчанию Klee."
@@ -1523,4 +1543,4 @@
"helpMiddle": "или",
"helpSuffix": "."
}
-}
\ No newline at end of file
+}
diff --git a/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json b/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json
index 69c1746c6e..640bf27569 100644
--- a/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json
+++ b/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json
@@ -1393,6 +1393,26 @@
"description": "voice",
"hint": "OpenAI TTS 的声音。OpenAI 默认支持:'alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer'"
},
+ "mimo-tts-voice": {
+ "description": "音色",
+ "hint": "MiMo TTS 的音色名称。默认值为 'mimo_default'。"
+ },
+ "mimo-tts-format": {
+ "description": "输出格式",
+ "hint": "MiMo TTS 生成音频的格式,例如 'wav'。"
+ },
+ "mimo-tts-style-prompt": {
+ "description": "风格提示词",
+ "hint": "用于控制生成语音的说话风格、语气或情绪,例如温柔、活泼、沉稳等。可留空。"
+ },
+ "mimo-tts-dialect": {
+ "description": "方言",
+ "hint": "指定生成语音时使用的方言或口音,例如四川话、粤语口音等。可留空。"
+ },
+ "mimo-tts-seed-text": {
+ "description": "种子文本",
+ "hint": "用于引导音色和说话方式的参考文本,会影响生成语音的表达风格。"
+ },
"fishaudio-tts-character": {
"description": "character",
"hint": "fishaudio TTS 的角色。默认为可莉。更多角色请访问:https://fish.audio/zh-CN/discovery"
@@ -1520,4 +1540,4 @@
"helpMiddle": "或",
"helpSuffix": "。"
}
-}
\ No newline at end of file
+}
diff --git a/dashboard/src/utils/providerUtils.js b/dashboard/src/utils/providerUtils.js
index 4bfe3ea6e9..273b921445 100644
--- a/dashboard/src/utils/providerUtils.js
+++ b/dashboard/src/utils/providerUtils.js
@@ -30,6 +30,7 @@ export function getProviderIcon(type) {
'lm_studio': 'https://cdn.jsdelivr.net/npm/@lobehub/icons-static-svg@latest/icons/lmstudio.svg',
'fishaudio': 'https://cdn.jsdelivr.net/npm/@lobehub/icons-static-svg@latest/icons/fishaudio.svg',
'minimax': 'https://cdn.jsdelivr.net/npm/@lobehub/icons-static-svg@latest/icons/minimax.svg',
+ 'mimo': 'https://platform.xiaomimimo.com/favicon.874c9507.png',
'302ai': 'https://cdn.jsdelivr.net/npm/@lobehub/icons-static-svg@1.53.0/icons/ai302-color.svg',
'microsoft': 'https://cdn.jsdelivr.net/npm/@lobehub/icons-static-svg@latest/icons/microsoft.svg',
'vllm': 'https://cdn.jsdelivr.net/npm/@lobehub/icons-static-svg@latest/icons/vllm.svg',
From a8236bae1a6e02cc9bd2509f48223b1d2b5b6877 Mon Sep 17 00:00:00 2001
From: Richard Liu <2645345468@qq.com>
Date: Thu, 19 Mar 2026 23:59:30 +0800
Subject: [PATCH 2/5] fix: handle empty mimo tts choices
---
astrbot/core/provider/sources/mimo_tts_api_source.py | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/astrbot/core/provider/sources/mimo_tts_api_source.py b/astrbot/core/provider/sources/mimo_tts_api_source.py
index a9240203dc..82d6a4ab8c 100644
--- a/astrbot/core/provider/sources/mimo_tts_api_source.py
+++ b/astrbot/core/provider/sources/mimo_tts_api_source.py
@@ -132,9 +132,10 @@ async def get_audio(self, text: str) -> str:
) from exc
data = response.json()
- audio_data = (
- data.get("choices", [{}])[0].get("message", {}).get("audio", {}).get("data")
- )
+ choices = data.get("choices") or []
+ first_choice = choices[0] if choices else {}
+ message = first_choice.get("message", {})
+ audio_data = message.get("audio", {}).get("data")
if not audio_data:
raise Exception(f"MiMo TTS API returned no audio payload: {data}")
From db30538fb3aa5babd6a849bfd3d3fc77f8d98f68 Mon Sep 17 00:00:00 2001
From: Richard Liu <2645345468@qq.com>
Date: Fri, 20 Mar 2026 00:48:43 +0800
Subject: [PATCH 3/5] feat: add mimo stt provider support
---
astrbot/core/config/default.py | 28 ++-
astrbot/core/provider/manager.py | 4 +
.../core/provider/sources/mimo_api_common.py | 129 ++++++++++++++
.../provider/sources/mimo_stt_api_source.py | 100 +++++++++++
.../provider/sources/mimo_tts_api_source.py | 73 ++------
.../src/composables/useProviderSources.ts | 1 +
.../en-US/features/config-metadata.json | 12 +-
.../ru-RU/features/config-metadata.json | 12 +-
.../zh-CN/features/config-metadata.json | 12 +-
tests/test_mimo_api_sources.py | 167 ++++++++++++++++++
10 files changed, 475 insertions(+), 63 deletions(-)
create mode 100644 astrbot/core/provider/sources/mimo_api_common.py
create mode 100644 astrbot/core/provider/sources/mimo_stt_api_source.py
create mode 100644 tests/test_mimo_api_sources.py
diff --git a/astrbot/core/config/default.py b/astrbot/core/config/default.py
index afba850193..a37e9eadf0 100644
--- a/astrbot/core/config/default.py
+++ b/astrbot/core/config/default.py
@@ -1431,6 +1431,20 @@ class ChatProviderTemplate(TypedDict):
"model": "whisper-1",
"proxy": "",
},
+ "MiMo STT(API)": {
+ "id": "mimo_stt",
+ "provider": "mimo",
+ "type": "mimo_stt_api",
+ "provider_type": "speech_to_text",
+ "enable": False,
+ "api_key": "",
+ "api_base": "https://api.xiaomimimo.com/v1",
+ "model": "mimo-v2-omni",
+ "mimo-stt-system-prompt": "You are a speech transcription assistant. Transcribe the spoken content from the audio exactly and return only the transcription text.",
+ "mimo-stt-user-prompt": "Please transcribe the content of the audio and return only the transcription text.",
+ "timeout": "20",
+ "proxy": "",
+ },
"Whisper(Local)": {
"provider": "openai",
"type": "openai_whisper_selfhost",
@@ -2326,6 +2340,16 @@ class ChatProviderTemplate(TypedDict):
"type": "int",
"hint": "超时时间,单位为秒。",
},
+ "mimo-stt-system-prompt": {
+ "description": "系统提示词",
+ "type": "string",
+ "hint": "用于指导 MiMo STT 转录行为的 system prompt。",
+ },
+ "mimo-stt-user-prompt": {
+ "description": "用户提示词",
+ "type": "string",
+ "hint": "附加给 MiMo STT 的用户提示词,用于约束返回结果格式。",
+ },
"openai-tts-voice": {
"description": "voice",
"type": "string",
@@ -2334,12 +2358,12 @@ class ChatProviderTemplate(TypedDict):
"mimo-tts-voice": {
"description": "音色",
"type": "string",
- "hint": "MiMo TTS 的音色名称。默认值为 'mimo_default'。",
+ "hint": "MiMo TTS 的音色名称。可选值包括 'mimo_default'、'default_en'、'default_zh'。",
},
"mimo-tts-format": {
"description": "输出格式",
"type": "string",
- "hint": "MiMo TTS 生成音频的格式,例如 'wav'。",
+ "hint": "MiMo TTS 生成音频的格式。支持 'wav'、'mp3'、'pcm'。",
},
"mimo-tts-style-prompt": {
"description": "风格提示词",
diff --git a/astrbot/core/provider/manager.py b/astrbot/core/provider/manager.py
index 62e1fcd80f..a294a51e17 100644
--- a/astrbot/core/provider/manager.py
+++ b/astrbot/core/provider/manager.py
@@ -387,6 +387,10 @@ def dynamic_import_provider(self, type: str) -> None:
from .sources.whisper_api_source import (
ProviderOpenAIWhisperAPI as ProviderOpenAIWhisperAPI,
)
+ case "mimo_stt_api":
+ from .sources.mimo_stt_api_source import (
+ ProviderMiMoSTTAPI as ProviderMiMoSTTAPI,
+ )
case "openai_whisper_selfhost":
from .sources.whisper_selfhosted_source import (
ProviderOpenAIWhisperSelfHost as ProviderOpenAIWhisperSelfHost,
diff --git a/astrbot/core/provider/sources/mimo_api_common.py b/astrbot/core/provider/sources/mimo_api_common.py
new file mode 100644
index 0000000000..d3bf75e66d
--- /dev/null
+++ b/astrbot/core/provider/sources/mimo_api_common.py
@@ -0,0 +1,129 @@
+import base64
+import uuid
+from pathlib import Path
+from urllib.parse import urlparse
+
+import httpx
+
+from astrbot import logger
+from astrbot.core.utils.astrbot_path import get_astrbot_temp_path
+from astrbot.core.utils.io import download_file
+from astrbot.core.utils.tencent_record_helper import (
+ convert_to_pcm_wav,
+ tencent_silk_to_wav,
+)
+
+DEFAULT_MIMO_API_BASE = "https://api.xiaomimimo.com/v1"
+DEFAULT_MIMO_TTS_MODEL = "mimo-v2-tts"
+DEFAULT_MIMO_TTS_VOICE = "mimo_default"
+DEFAULT_MIMO_TTS_SEED_TEXT = "Hello, MiMo, have you had lunch?"
+DEFAULT_MIMO_STT_MODEL = "mimo-v2-omni"
+DEFAULT_MIMO_STT_SYSTEM_PROMPT = (
+ "You are a speech transcription assistant. "
+ "Transcribe the spoken content from the audio exactly and return only the transcription text."
+)
+DEFAULT_MIMO_STT_USER_PROMPT = (
+ "Please transcribe the content of the audio and return only the transcription text."
+)
+
+
+class MiMoAPIError(Exception):
+ pass
+
+
+def normalize_timeout(timeout: int | str | None) -> int | None:
+ if timeout in (None, ""):
+ return None
+ if isinstance(timeout, str):
+ return int(timeout)
+ return timeout
+
+
+def build_headers(api_key: str) -> dict[str, str]:
+ headers = {"Content-Type": "application/json"}
+ if api_key:
+ headers["Authorization"] = f"Bearer {api_key}"
+ return headers
+
+
+def get_temp_dir() -> Path:
+ temp_dir = Path(get_astrbot_temp_path())
+ temp_dir.mkdir(parents=True, exist_ok=True)
+ return temp_dir
+
+
+def create_http_client(timeout: int | None, proxy: str) -> httpx.AsyncClient:
+ client_kwargs: dict[str, object] = {
+ "timeout": timeout,
+ "follow_redirects": True,
+ }
+ if proxy:
+ logger.info("[MiMo API] Using proxy: %s", proxy)
+ client_kwargs["proxy"] = proxy
+ return httpx.AsyncClient(**client_kwargs)
+
+
+def build_api_url(api_base: str) -> str:
+ normalized_api_base = api_base.rstrip("/")
+ if normalized_api_base.endswith("/chat/completions"):
+ return normalized_api_base
+ return normalized_api_base + "/chat/completions"
+
+
+async def _detect_audio_format(file_path: Path) -> str | None:
+ silk_header = b"SILK"
+ amr_header = b"#!AMR"
+
+ try:
+ with file_path.open("rb") as file:
+ file_header = file.read(8)
+ except FileNotFoundError:
+ return None
+
+ if silk_header in file_header:
+ return "silk"
+ if amr_header in file_header:
+ return "amr"
+ return None
+
+
+async def prepare_audio_input(audio_source: str) -> tuple[str, list[Path]]:
+ cleanup_paths: list[Path] = []
+ source_path = Path(audio_source)
+ is_remote = audio_source.startswith(("http://", "https://"))
+ is_tencent = "multimedia.nt.qq.com.cn" in audio_source if is_remote else False
+
+ if is_remote:
+ parsed_url = urlparse(audio_source)
+ suffix = Path(parsed_url.path).suffix or ".input"
+ download_path = get_temp_dir() / f"mimo_audio_{uuid.uuid4().hex[:8]}{suffix}"
+ await download_file(audio_source, str(download_path))
+ source_path = download_path
+ cleanup_paths.append(download_path)
+
+ if not source_path.exists():
+ raise FileNotFoundError(f"File does not exist: {source_path}")
+
+ if source_path.suffix.lower() in {".amr", ".silk"} or is_tencent:
+ file_format = await _detect_audio_format(source_path)
+ if file_format in {"silk", "amr"}:
+ converted_path = get_temp_dir() / f"mimo_audio_{uuid.uuid4().hex[:8]}.wav"
+ cleanup_paths.append(converted_path)
+ if file_format == "silk":
+ logger.info("Converting silk file to wav for MiMo STT...")
+ await tencent_silk_to_wav(str(source_path), str(converted_path))
+ else:
+ logger.info("Converting amr file to wav for MiMo STT...")
+ await convert_to_pcm_wav(str(source_path), str(converted_path))
+ source_path = converted_path
+
+ encoded_audio = base64.b64encode(source_path.read_bytes()).decode("utf-8")
+ return encoded_audio, cleanup_paths
+
+
+def cleanup_files(paths: list[Path]) -> None:
+ for path in paths:
+ try:
+ path.unlink(missing_ok=True)
+ except Exception as exc:
+ logger.warning("Failed to remove temporary MiMo file %s: %s", path, exc)
diff --git a/astrbot/core/provider/sources/mimo_stt_api_source.py b/astrbot/core/provider/sources/mimo_stt_api_source.py
new file mode 100644
index 0000000000..9b03e2efc6
--- /dev/null
+++ b/astrbot/core/provider/sources/mimo_stt_api_source.py
@@ -0,0 +1,100 @@
+from ..entities import ProviderType
+from ..provider import STTProvider
+from ..register import register_provider_adapter
+from .mimo_api_common import (
+ DEFAULT_MIMO_API_BASE,
+ DEFAULT_MIMO_STT_MODEL,
+ DEFAULT_MIMO_STT_SYSTEM_PROMPT,
+ DEFAULT_MIMO_STT_USER_PROMPT,
+ MiMoAPIError,
+ build_api_url,
+ build_headers,
+ cleanup_files,
+ create_http_client,
+ normalize_timeout,
+ prepare_audio_input,
+)
+
+
+@register_provider_adapter(
+ "mimo_stt_api",
+ "MiMo STT API",
+ provider_type=ProviderType.SPEECH_TO_TEXT,
+)
+class ProviderMiMoSTTAPI(STTProvider):
+ def __init__(
+ self,
+ provider_config: dict,
+ provider_settings: dict,
+ ) -> None:
+ super().__init__(provider_config, provider_settings)
+ self.chosen_api_key = provider_config.get("api_key", "")
+ self.api_base = provider_config.get("api_base", DEFAULT_MIMO_API_BASE)
+ self.proxy = provider_config.get("proxy", "")
+ self.timeout = normalize_timeout(provider_config.get("timeout", 20))
+ self.system_prompt = provider_config.get(
+ "mimo-stt-system-prompt",
+ DEFAULT_MIMO_STT_SYSTEM_PROMPT,
+ )
+ self.user_prompt = provider_config.get(
+ "mimo-stt-user-prompt",
+ DEFAULT_MIMO_STT_USER_PROMPT,
+ )
+ self.set_model(provider_config.get("model", DEFAULT_MIMO_STT_MODEL))
+ self.client = create_http_client(self.timeout, self.proxy)
+
+ async def get_text(self, audio_url: str) -> str:
+ audio_data_url, cleanup_paths = await prepare_audio_input(audio_url)
+ payload = {
+ "model": self.model_name,
+ "messages": [
+ {
+ "role": "system",
+ "content": self.system_prompt,
+ },
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "input_audio",
+ "input_audio": {
+ "data": audio_data_url,
+ },
+ },
+ {
+ "type": "text",
+ "text": self.user_prompt,
+ },
+ ],
+ },
+ ],
+ "max_completion_tokens": 1024,
+ }
+
+ try:
+ response = await self.client.post(
+ build_api_url(self.api_base),
+ headers=build_headers(self.chosen_api_key),
+ json=payload,
+ )
+ try:
+ response.raise_for_status()
+ except Exception as exc:
+ error_text = response.text[:1024]
+ raise MiMoAPIError(
+ f"MiMo STT API request failed: HTTP {response.status_code}, response: {error_text}"
+ ) from exc
+
+ data = response.json()
+ choices = data.get("choices") or []
+ first_choice = choices[0] if choices else {}
+ content = first_choice.get("message", {}).get("content", "")
+ if not isinstance(content, str) or not content.strip():
+ raise MiMoAPIError(f"MiMo STT API returned empty transcription: {data}")
+ return content.strip()
+ finally:
+ cleanup_files(cleanup_paths)
+
+ async def terminate(self):
+ if self.client:
+ await self.client.aclose()
diff --git a/astrbot/core/provider/sources/mimo_tts_api_source.py b/astrbot/core/provider/sources/mimo_tts_api_source.py
index 82d6a4ab8c..daad55e1e6 100644
--- a/astrbot/core/provider/sources/mimo_tts_api_source.py
+++ b/astrbot/core/provider/sources/mimo_tts_api_source.py
@@ -1,54 +1,21 @@
import base64
import uuid
-from pathlib import Path
-
-import httpx
from ..entities import ProviderType
from ..provider import TTSProvider
from ..register import register_provider_adapter
-from astrbot import logger
-from astrbot.core.utils.astrbot_path import get_astrbot_temp_path
-
-
-def normalize_timeout(timeout: int | str | None) -> int | None:
- if timeout in (None, ""):
- return None
- if isinstance(timeout, str):
- return int(timeout)
- return timeout
-
-
-def build_headers(api_key: str) -> dict[str, str]:
- headers = {"Content-Type": "application/json"}
- if api_key:
- headers["api-key"] = api_key
- headers["Authorization"] = f"Bearer {api_key}"
- return headers
-
-
-def get_temp_dir() -> Path:
- temp_dir = Path(get_astrbot_temp_path())
- temp_dir.mkdir(parents=True, exist_ok=True)
- return temp_dir
-
-
-def create_http_client(timeout: int | None, proxy: str) -> httpx.AsyncClient:
- client_kwargs: dict[str, object] = {
- "timeout": timeout,
- "follow_redirects": True,
- }
- if proxy:
- logger.info("[MiMo API] Using proxy: %s", proxy)
- client_kwargs["proxy"] = proxy
- return httpx.AsyncClient(**client_kwargs)
-
-
-def build_api_url(api_base: str) -> str:
- normalized_api_base = api_base.rstrip("/")
- if normalized_api_base.endswith("/chat/completions"):
- return normalized_api_base
- return normalized_api_base + "/chat/completions"
+from .mimo_api_common import (
+ DEFAULT_MIMO_API_BASE,
+ DEFAULT_MIMO_TTS_MODEL,
+ DEFAULT_MIMO_TTS_SEED_TEXT,
+ DEFAULT_MIMO_TTS_VOICE,
+ MiMoAPIError,
+ build_api_url,
+ build_headers,
+ create_http_client,
+ get_temp_dir,
+ normalize_timeout,
+)
@register_provider_adapter(
@@ -64,21 +31,17 @@ def __init__(
) -> None:
super().__init__(provider_config, provider_settings)
self.chosen_api_key = provider_config.get("api_key", "")
- self.api_base = provider_config.get(
- "api_base",
- "https://api.xiaomimimo.com/v1",
- )
+ self.api_base = provider_config.get("api_base", DEFAULT_MIMO_API_BASE)
self.proxy = provider_config.get("proxy", "")
self.timeout = normalize_timeout(provider_config.get("timeout", 20))
- self.voice = provider_config.get("mimo-tts-voice", "mimo_default")
+ self.voice = provider_config.get("mimo-tts-voice", DEFAULT_MIMO_TTS_VOICE)
self.audio_format = provider_config.get("mimo-tts-format", "wav")
self.style_prompt = provider_config.get("mimo-tts-style-prompt", "")
self.dialect = provider_config.get("mimo-tts-dialect", "")
self.seed_text = provider_config.get(
- "mimo-tts-seed-text",
- "Hello, MiMo, have you had lunch?",
+ "mimo-tts-seed-text", DEFAULT_MIMO_TTS_SEED_TEXT
)
- self.set_model(provider_config.get("model", "mimo-v2-tts"))
+ self.set_model(provider_config.get("model", DEFAULT_MIMO_TTS_MODEL))
self.client = create_http_client(self.timeout, self.proxy)
def _build_user_prompt(self) -> str:
@@ -127,7 +90,7 @@ async def get_audio(self, text: str) -> str:
response.raise_for_status()
except Exception as exc:
error_text = response.text[:1024]
- raise Exception(
+ raise MiMoAPIError(
f"MiMo TTS API request failed: HTTP {response.status_code}, response: {error_text}"
) from exc
@@ -137,7 +100,7 @@ async def get_audio(self, text: str) -> str:
message = first_choice.get("message", {})
audio_data = message.get("audio", {}).get("data")
if not audio_data:
- raise Exception(f"MiMo TTS API returned no audio payload: {data}")
+ raise MiMoAPIError(f"MiMo TTS API returned no audio payload: {data}")
output_path = (
get_temp_dir() / f"mimo_tts_api_{uuid.uuid4()}.{self.audio_format}"
diff --git a/dashboard/src/composables/useProviderSources.ts b/dashboard/src/composables/useProviderSources.ts
index 7ff12190e3..e646d59cc0 100644
--- a/dashboard/src/composables/useProviderSources.ts
+++ b/dashboard/src/composables/useProviderSources.ts
@@ -322,6 +322,7 @@ export function useProviderSources(options: UseProviderSourcesOptions) {
coze: 'agent_runner',
dashscope: 'chat_completion',
openai_whisper_api: 'speech_to_text',
+ mimo_stt_api: 'speech_to_text',
openai_whisper_selfhost: 'speech_to_text',
sensevoice_stt_selfhost: 'speech_to_text',
openai_tts_api: 'text_to_speech',
diff --git a/dashboard/src/i18n/locales/en-US/features/config-metadata.json b/dashboard/src/i18n/locales/en-US/features/config-metadata.json
index 005559a5a4..12b5af7f26 100644
--- a/dashboard/src/i18n/locales/en-US/features/config-metadata.json
+++ b/dashboard/src/i18n/locales/en-US/features/config-metadata.json
@@ -1387,17 +1387,25 @@
"description": "Timeout",
"hint": "Timeout in seconds."
},
+ "mimo-stt-system-prompt": {
+ "description": "System prompt",
+ "hint": "System prompt used to guide MiMo STT transcription behavior."
+ },
+ "mimo-stt-user-prompt": {
+ "description": "User prompt",
+ "hint": "Additional user prompt sent to MiMo STT to constrain the returned transcription format."
+ },
"openai-tts-voice": {
"description": "voice",
"hint": "OpenAI TTS voice. OpenAI defaults: 'alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer'."
},
"mimo-tts-voice": {
"description": "Voice",
- "hint": "MiMo TTS voice name. Default is 'mimo_default'."
+ "hint": "MiMo TTS voice name. Supported values include 'mimo_default', 'default_en', and 'default_zh'."
},
"mimo-tts-format": {
"description": "Output format",
- "hint": "Audio format generated by MiMo TTS, for example 'wav'."
+ "hint": "Audio format generated by MiMo TTS. Supported values: 'wav', 'mp3', and 'pcm'."
},
"mimo-tts-style-prompt": {
"description": "Style prompt",
diff --git a/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json b/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json
index cf0e97acd8..6424a3f705 100644
--- a/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json
+++ b/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json
@@ -1392,17 +1392,25 @@
"description": "Таймаут (сек)",
"hint": "Максимальное время ожидания ответа."
},
+ "mimo-stt-system-prompt": {
+ "description": "Системный промпт",
+ "hint": "System prompt, который управляет поведением MiMo STT при распознавании."
+ },
+ "mimo-stt-user-prompt": {
+ "description": "Пользовательский промпт",
+ "hint": "Дополнительный user prompt для MiMo STT, который помогает задать формат результата."
+ },
"openai-tts-voice": {
"description": "API Base URL",
"hint": "Голоса OpenAI TTS: alloy, echo и др."
},
"mimo-tts-voice": {
"description": "Голос",
- "hint": "Имя голоса MiMo TTS. По умолчанию 'mimo_default'."
+ "hint": "Имя голоса MiMo TTS. Поддерживаются значения 'mimo_default', 'default_en' и 'default_zh'."
},
"mimo-tts-format": {
"description": "Формат вывода",
- "hint": "Формат аудио, создаваемого MiMo TTS, например 'wav'."
+ "hint": "Формат аудио, создаваемого MiMo TTS. Поддерживаются 'wav', 'mp3' и 'pcm'."
},
"mimo-tts-style-prompt": {
"description": "Подсказка стиля",
diff --git a/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json b/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json
index 640bf27569..bde8dab7e7 100644
--- a/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json
+++ b/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json
@@ -1389,17 +1389,25 @@
"description": "超时时间",
"hint": "超时时间,单位为秒。"
},
+ "mimo-stt-system-prompt": {
+ "description": "系统提示词",
+ "hint": "用于指导 MiMo STT 转录行为的 system prompt。"
+ },
+ "mimo-stt-user-prompt": {
+ "description": "用户提示词",
+ "hint": "附加给 MiMo STT 的用户提示词,用于约束返回结果格式。"
+ },
"openai-tts-voice": {
"description": "voice",
"hint": "OpenAI TTS 的声音。OpenAI 默认支持:'alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer'"
},
"mimo-tts-voice": {
"description": "音色",
- "hint": "MiMo TTS 的音色名称。默认值为 'mimo_default'。"
+ "hint": "MiMo TTS 的音色名称。可选值包括 'mimo_default'、'default_en'、'default_zh'。"
},
"mimo-tts-format": {
"description": "输出格式",
- "hint": "MiMo TTS 生成音频的格式,例如 'wav'。"
+ "hint": "MiMo TTS 生成音频的格式。支持 'wav'、'mp3'、'pcm'。"
},
"mimo-tts-style-prompt": {
"description": "风格提示词",
diff --git a/tests/test_mimo_api_sources.py b/tests/test_mimo_api_sources.py
new file mode 100644
index 0000000000..d262f1c275
--- /dev/null
+++ b/tests/test_mimo_api_sources.py
@@ -0,0 +1,167 @@
+import asyncio
+from types import SimpleNamespace
+
+import pytest
+
+from astrbot.core.provider.sources.mimo_api_common import MiMoAPIError, build_headers
+from astrbot.core.provider.sources.mimo_stt_api_source import ProviderMiMoSTTAPI
+from astrbot.core.provider.sources.mimo_tts_api_source import ProviderMiMoTTSAPI
+
+
+def _make_tts_provider(overrides: dict | None = None) -> ProviderMiMoTTSAPI:
+ provider_config = {
+ "id": "test-mimo-tts",
+ "type": "mimo_tts_api",
+ "model": "mimo-v2-tts",
+ "api_key": "test-key",
+ "mimo-tts-voice": "mimo_default",
+ "mimo-tts-format": "wav",
+ "mimo-tts-seed-text": "seed text",
+ }
+ if overrides:
+ provider_config.update(overrides)
+ return ProviderMiMoTTSAPI(provider_config=provider_config, provider_settings={})
+
+
+def _make_stt_provider(overrides: dict | None = None) -> ProviderMiMoSTTAPI:
+ provider_config = {
+ "id": "test-mimo-stt",
+ "type": "mimo_stt_api",
+ "model": "mimo-v2-omni",
+ "api_key": "test-key",
+ }
+ if overrides:
+ provider_config.update(overrides)
+ return ProviderMiMoSTTAPI(provider_config=provider_config, provider_settings={})
+
+
+def test_mimo_tts_prompt_returns_seed_text_when_no_style_or_dialect():
+ provider = _make_tts_provider()
+ try:
+ assert provider._build_user_prompt() == "seed text"
+ finally:
+ asyncio.run(provider.terminate())
+
+
+def test_mimo_tts_payload_includes_dialect_and_style_prompt():
+ provider = _make_tts_provider(
+ {
+ "mimo-tts-style-prompt": "Please sound cheerful and lively.",
+ "mimo-tts-dialect": "Sichuan dialect",
+ "mimo-tts-seed-text": "You are chatting with a close friend.",
+ }
+ )
+ try:
+ payload = provider._build_payload("hello")
+ assert payload["messages"][0]["content"] == (
+ "Please sound cheerful and lively. "
+ "Please use Sichuan dialect when speaking. "
+ "You are chatting with a close friend."
+ )
+ finally:
+ asyncio.run(provider.terminate())
+
+
+def test_mimo_headers_use_single_authorization_method():
+ assert build_headers("test-key") == {
+ "Content-Type": "application/json",
+ "Authorization": "Bearer test-key",
+ }
+
+
+@pytest.mark.asyncio
+async def test_mimo_tts_get_audio_handles_empty_choices():
+ provider = _make_tts_provider()
+
+ class _Response:
+ status_code = 200
+ text = '{"choices":[]}'
+
+ def raise_for_status(self):
+ return None
+
+ def json(self):
+ return {"choices": []}
+
+ provider.client = SimpleNamespace(post=_fake_post(_Response()))
+
+ with pytest.raises(MiMoAPIError, match="returned no audio payload"):
+ await provider.get_audio("hello")
+
+
+@pytest.mark.asyncio
+async def test_mimo_stt_payload_includes_audio_and_prompt(monkeypatch):
+ provider = _make_stt_provider(
+ {
+ "mimo-stt-system-prompt": "system prompt",
+ "mimo-stt-user-prompt": "user prompt",
+ }
+ )
+
+ captured: dict = {}
+
+ async def fake_prepare_audio_input(_audio_source: str):
+ return "ZmFrZQ==", []
+
+ class _Response:
+ status_code = 200
+ text = '{"choices":[{"message":{"content":"transcribed text"}}]}'
+
+ def raise_for_status(self):
+ return None
+
+ def json(self):
+ return {"choices": [{"message": {"content": "transcribed text"}}]}
+
+ async def fake_post(_url, headers=None, json=None):
+ captured["headers"] = headers
+ captured["json"] = json
+ return _Response()
+
+ monkeypatch.setattr(
+ "astrbot.core.provider.sources.mimo_stt_api_source.prepare_audio_input",
+ fake_prepare_audio_input,
+ )
+ provider.client = SimpleNamespace(post=fake_post)
+
+ result = await provider.get_text("/tmp/test.wav")
+
+ assert result == "transcribed text"
+ assert captured["json"]["messages"][0]["content"] == "system prompt"
+ assert captured["json"]["messages"][1]["content"][0]["type"] == "input_audio"
+ assert captured["json"]["messages"][1]["content"][0]["input_audio"]["data"] == "ZmFrZQ=="
+ assert captured["json"]["messages"][1]["content"][1]["text"] == "user prompt"
+
+
+@pytest.mark.asyncio
+async def test_mimo_stt_get_text_handles_empty_choices(monkeypatch):
+ provider = _make_stt_provider()
+
+ async def fake_prepare_audio_input(_audio_source: str):
+ return "ZmFrZQ==", []
+
+ class _Response:
+ status_code = 200
+ text = '{"choices":[]}'
+
+ def raise_for_status(self):
+ return None
+
+ def json(self):
+ return {"choices": []}
+
+ monkeypatch.setattr(
+ "astrbot.core.provider.sources.mimo_stt_api_source.prepare_audio_input",
+ fake_prepare_audio_input,
+ )
+ provider.client = SimpleNamespace(post=_fake_post(_Response()))
+
+ with pytest.raises(MiMoAPIError, match="returned empty transcription"):
+ await provider.get_text("/tmp/test.wav")
+
+
+def _fake_post(response):
+ async def _post(*_args, **_kwargs):
+ return response
+
+ return _post
From c395ff7b57664154778acc3bda327262686e559a Mon Sep 17 00:00:00 2001
From: Richard Liu <2645345468@qq.com>
Date: Mon, 23 Mar 2026 00:22:49 +0800
Subject: [PATCH 4/5] fix: align mimo tts style payload with official docs
---
astrbot/core/config/default.py | 6 +-
.../provider/sources/mimo_tts_api_source.py | 56 ++++++++----
.../en-US/features/config-metadata.json | 6 +-
.../ru-RU/features/config-metadata.json | 6 +-
.../zh-CN/features/config-metadata.json | 6 +-
tests/test_mimo_api_sources.py | 90 ++++++++++++++++---
6 files changed, 129 insertions(+), 41 deletions(-)
diff --git a/astrbot/core/config/default.py b/astrbot/core/config/default.py
index a37e9eadf0..68d8050a8d 100644
--- a/astrbot/core/config/default.py
+++ b/astrbot/core/config/default.py
@@ -2368,17 +2368,17 @@ class ChatProviderTemplate(TypedDict):
"mimo-tts-style-prompt": {
"description": "风格提示词",
"type": "string",
- "hint": "用于控制生成语音的说话风格、语气或情绪,例如温柔、活泼、沉稳等。可留空。",
+ "hint": "会以 <风格> 标签形式添加到待合成文本开头,用于控制语速、情绪、角色或风格,例如 <开心>、<变快>、<孙悟空>、<悄悄话>。可留空。",
},
"mimo-tts-dialect": {
"description": "方言",
"type": "string",
- "hint": "指定生成语音时使用的方言或口音,例如四川话、粤语口音等。可留空。",
+ "hint": "会与风格提示词一起写入开头的 <风格> 标签中,例如 <东北话>、<四川话>、<河南话>、<粤语>。可留空。",
},
"mimo-tts-seed-text": {
"description": "种子文本",
"type": "string",
- "hint": "用于引导音色和说话方式的参考文本,会影响生成语音的表达风格。",
+ "hint": "作为可选的 user 消息发送,用于辅助调节语气和风格,不会拼接到待合成文本中。",
},
"fishaudio-tts-character": {
"description": "character",
diff --git a/astrbot/core/provider/sources/mimo_tts_api_source.py b/astrbot/core/provider/sources/mimo_tts_api_source.py
index daad55e1e6..2966bfb7d8 100644
--- a/astrbot/core/provider/sources/mimo_tts_api_source.py
+++ b/astrbot/core/provider/sources/mimo_tts_api_source.py
@@ -44,35 +44,53 @@ def __init__(
self.set_model(provider_config.get("model", DEFAULT_MIMO_TTS_MODEL))
self.client = create_http_client(self.timeout, self.proxy)
- def _build_user_prompt(self) -> str:
- prompt_parts: list[str] = []
+ def _build_user_prompt(self) -> str | None:
+ seed_text = self.seed_text.strip()
+ return seed_text or None
+
+ def _build_style_prefix(self) -> str:
+ style_parts: list[str] = []
if self.style_prompt.strip():
- prompt_parts.append(self.style_prompt.strip())
+ style_parts.append(self.style_prompt.strip())
if self.dialect.strip():
- prompt_parts.append(f"Please use {self.dialect.strip()} when speaking.")
+ style_parts.append(self.dialect.strip())
+
+ style_content = " ".join(style_parts).strip()
+ if not style_content:
+ return ""
- if not prompt_parts:
- return self.seed_text
+ # MiMo recommends using only the singing style tag at the very beginning.
+ if "唱歌" in style_content:
+ return "<唱歌>"
- if self.seed_text.strip():
- prompt_parts.append(self.seed_text.strip())
+ return f"<{style_content}>"
- return " ".join(prompt_parts)
+ def _build_assistant_content(self, text: str) -> str:
+ return f"{self._build_style_prefix()}{text}"
def _build_payload(self, text: str) -> dict:
- return {
- "model": self.model_name,
- "messages": [
+ messages: list[dict[str, str]] = []
+
+ user_prompt = self._build_user_prompt()
+ if user_prompt:
+ messages.append(
{
"role": "user",
- "content": self._build_user_prompt(),
- },
- {
- "role": "assistant",
- "content": text,
- },
- ],
+ "content": user_prompt,
+ }
+ )
+
+ messages.append(
+ {
+ "role": "assistant",
+ "content": self._build_assistant_content(text),
+ }
+ )
+
+ return {
+ "model": self.model_name,
+ "messages": messages,
"audio": {
"format": self.audio_format,
"voice": self.voice,
diff --git a/dashboard/src/i18n/locales/en-US/features/config-metadata.json b/dashboard/src/i18n/locales/en-US/features/config-metadata.json
index 12b5af7f26..f78ad90e40 100644
--- a/dashboard/src/i18n/locales/en-US/features/config-metadata.json
+++ b/dashboard/src/i18n/locales/en-US/features/config-metadata.json
@@ -1409,15 +1409,15 @@
},
"mimo-tts-style-prompt": {
"description": "Style prompt",
- "hint": "Guides speaking style, tone, or emotion such as gentle, lively, or calm. Optional."
+ "hint": "Prepended to the synthesis target text as a <style> tag to control speed, emotion, character, or style, such as <happy>, <faster>, <Sun Wukong>, or <whispering>. Optional."
},
"mimo-tts-dialect": {
"description": "Dialect",
- "hint": "Target dialect or accent for generated speech, such as Sichuan dialect. Optional."
+ "hint": "Combined with the style prompt inside the leading <style> tag, for example <Northeastern Mandarin>, <Sichuan dialect>, <Henan dialect>, or <Cantonese>. Optional."
},
"mimo-tts-seed-text": {
"description": "Seed text",
- "hint": "Reference text used to guide voice characteristics and speaking style."
+ "hint": "Sent as an optional user message to help guide tone and speaking style. It is not appended to the synthesis target text."
},
"fishaudio-tts-character": {
"description": "character",
diff --git a/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json b/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json
index 6424a3f705..e8353d95a3 100644
--- a/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json
+++ b/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json
@@ -1414,15 +1414,15 @@
},
"mimo-tts-style-prompt": {
"description": "Подсказка стиля",
- "hint": "Задает стиль речи, тон или эмоцию, например мягкий, живой или спокойный. Необязательно."
+ "hint": "Добавляется в начало синтезируемого текста в виде тега <стиль> и управляет скоростью, эмоцией, ролью или манерой речи. Необязательно."
},
"mimo-tts-dialect": {
"description": "Диалект",
- "hint": "Диалект или акцент для синтезируемой речи, например сычуаньский диалект. Необязательно."
+ "hint": "Объединяется с подсказкой стиля внутри начального тега <стиль>, например северо-восточный, сычуаньский, хэнаньский или кантонский вариант речи. Необязательно."
},
"mimo-tts-seed-text": {
"description": "Начальный текст",
- "hint": "Эталонный текст, который помогает задать особенности голоса и манеру речи."
+ "hint": "Отправляется как необязательное user-сообщение для настройки тона и манеры речи. Не добавляется к самому тексту синтеза."
},
"fishaudio-tts-character": {
"description": "Персонаж",
diff --git a/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json b/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json
index bde8dab7e7..c18ac23be2 100644
--- a/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json
+++ b/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json
@@ -1411,15 +1411,15 @@
},
"mimo-tts-style-prompt": {
"description": "风格提示词",
- "hint": "用于控制生成语音的说话风格、语气或情绪,例如温柔、活泼、沉稳等。可留空。"
+ "hint": "会以 <风格> 标签形式添加到待合成文本开头,用于控制语速、情绪、角色或风格,例如 <开心>、<变快>、<孙悟空>、<悄悄话>。可留空。"
},
"mimo-tts-dialect": {
"description": "方言",
- "hint": "指定生成语音时使用的方言或口音,例如四川话、粤语口音等。可留空。"
+ "hint": "会与风格提示词一起写入开头的 <风格> 标签中,例如 <东北话>、<四川话>、<河南话>、<粤语>。可留空。"
},
"mimo-tts-seed-text": {
"description": "种子文本",
- "hint": "用于引导音色和说话方式的参考文本,会影响生成语音的表达风格。"
+ "hint": "作为可选的 user 消息发送,用于辅助调节语气和风格,不会拼接到待合成文本中。"
},
"fishaudio-tts-character": {
"description": "character",
diff --git a/tests/test_mimo_api_sources.py b/tests/test_mimo_api_sources.py
index d262f1c275..c2b02aa136 100644
--- a/tests/test_mimo_api_sources.py
+++ b/tests/test_mimo_api_sources.py
@@ -35,7 +35,7 @@ def _make_stt_provider(overrides: dict | None = None) -> ProviderMiMoSTTAPI:
return ProviderMiMoSTTAPI(provider_config=provider_config, provider_settings={})
-def test_mimo_tts_prompt_returns_seed_text_when_no_style_or_dialect():
+def test_mimo_tts_user_prompt_returns_seed_text():
provider = _make_tts_provider()
try:
assert provider._build_user_prompt() == "seed text"
@@ -43,21 +43,88 @@ def test_mimo_tts_prompt_returns_seed_text_when_no_style_or_dialect():
asyncio.run(provider.terminate())
-def test_mimo_tts_payload_includes_dialect_and_style_prompt():
+def test_mimo_tts_assistant_content_prefixes_style_and_dialect():
provider = _make_tts_provider(
{
- "mimo-tts-style-prompt": "Please sound cheerful and lively.",
- "mimo-tts-dialect": "Sichuan dialect",
+ "mimo-tts-style-prompt": "开心",
+ "mimo-tts-dialect": "四川话",
"mimo-tts-seed-text": "You are chatting with a close friend.",
}
)
try:
payload = provider._build_payload("hello")
- assert payload["messages"][0]["content"] == (
- "Please sound cheerful and lively. "
- "Please use Sichuan dialect when speaking. "
- "You are chatting with a close friend."
- )
+ assert payload["messages"][0] == {
+ "role": "user",
+ "content": "You are chatting with a close friend.",
+ }
+ assert payload["messages"][1]["content"] == "<开心 四川话>hello"
+ finally:
+ asyncio.run(provider.terminate())
+
+
+def test_mimo_tts_payload_omits_user_message_without_seed_text():
+ provider = _make_tts_provider(
+ {
+ "mimo-tts-seed-text": "",
+ "mimo-tts-style-prompt": "开心",
+ }
+ )
+ try:
+ payload = provider._build_payload("hello")
+ assert payload["messages"] == [
+ {
+ "role": "assistant",
+ "content": "hello",
+ }
+ ]
+ finally:
+ asyncio.run(provider.terminate())
+
+
+def test_mimo_tts_singing_style_uses_single_style_tag():
+ provider = _make_tts_provider(
+ {
+ "mimo-tts-style-prompt": "唱歌 开心",
+ "mimo-tts-dialect": "粤语",
+ }
+ )
+ try:
+ payload = provider._build_payload("歌词")
+ assert payload["messages"][1]["content"] == "<唱歌>歌词"
+ finally:
+ asyncio.run(provider.terminate())
+
+
+def test_mimo_tts_plain_text_stays_in_assistant_message_when_no_style():
+ provider = _make_tts_provider(
+ {
+ "mimo-tts-seed-text": "",
+ }
+ )
+ try:
+ payload = provider._build_payload("hello")
+ assert payload["messages"] == [
+ {
+ "role": "assistant",
+ "content": "hello",
+ }
+ ]
+ finally:
+ asyncio.run(provider.terminate())
+
+
+def test_mimo_tts_seed_text_is_not_prepended_to_assistant_content():
+ provider = _make_tts_provider(
+ {
+ "mimo-tts-style-prompt": "开心",
+ "mimo-tts-seed-text": "reference text",
+ }
+ )
+ try:
+ payload = provider._build_payload("明天就是周五了")
+ assert payload["messages"][0]["content"] == "reference text"
+ assert payload["messages"][1]["content"] == "<开心>明天就是周五了"
+ assert "reference text" not in payload["messages"][1]["content"]
finally:
asyncio.run(provider.terminate())
@@ -129,7 +196,10 @@ async def fake_post(_url, headers=None, json=None):
assert result == "transcribed text"
assert captured["json"]["messages"][0]["content"] == "system prompt"
assert captured["json"]["messages"][1]["content"][0]["type"] == "input_audio"
- assert captured["json"]["messages"][1]["content"][0]["input_audio"]["data"] == "ZmFrZQ=="
+ assert (
+ captured["json"]["messages"][1]["content"][0]["input_audio"]["data"]
+ == "ZmFrZQ=="
+ )
assert captured["json"]["messages"][1]["content"][1]["text"] == "user prompt"
From 49c835853a82f6df0d8068ba5890cdcad0be30b4 Mon Sep 17 00:00:00 2001
From: Richard Liu <2645345468@qq.com>
Date: Tue, 31 Mar 2026 01:43:44 +0800
Subject: [PATCH 5/5] docs: add Xiaomi MiMo Omni and TTS services to multiple
language READMEs
---
README.md | 2 ++
README_fr.md | 2 ++
README_ja.md | 2 ++
README_ru.md | 2 ++
README_zh-TW.md | 2 ++
README_zh.md | 2 ++
6 files changed, 12 insertions(+)
diff --git a/README.md b/README.md
index 469b8c4020..a901c5e137 100644
--- a/README.md
+++ b/README.md
@@ -184,6 +184,7 @@ Connect AstrBot to your favorite chat platform.
| Coze | LLMOps Platforms |
| OpenAI Whisper | Speech-to-Text Services |
| SenseVoice | Speech-to-Text Services |
+| Xiaomi MiMo Omni | Speech-to-Text Services |
| OpenAI TTS | Text-to-Speech Services |
| Gemini TTS | Text-to-Speech Services |
| GPT-Sovits-Inference | Text-to-Speech Services |
@@ -193,6 +194,7 @@ Connect AstrBot to your favorite chat platform.
| Alibaba Cloud Bailian TTS | Text-to-Speech Services |
| Azure TTS | Text-to-Speech Services |
| Minimax TTS | Text-to-Speech Services |
+| Xiaomi MiMo TTS | Text-to-Speech Services |
| Volcano Engine TTS | Text-to-Speech Services |
## ❤️ Sponsors
diff --git a/README_fr.md b/README_fr.md
index 31ab1f3835..979ba62f67 100644
--- a/README_fr.md
+++ b/README_fr.md
@@ -184,6 +184,7 @@ Connectez AstrBot à vos plateformes de chat préférées.
| Coze | Plateformes LLMOps |
| OpenAI Whisper | Services de reconnaissance vocale |
| SenseVoice | Services de reconnaissance vocale |
+| Xiaomi MiMo Omni | Services de reconnaissance vocale |
| OpenAI TTS | Services de synthèse vocale |
| Gemini TTS | Services de synthèse vocale |
| GPT-Sovits-Inference | Services de synthèse vocale |
@@ -193,6 +194,7 @@ Connectez AstrBot à vos plateformes de chat préférées.
| Alibaba Cloud Bailian TTS | Services de synthèse vocale |
| Azure TTS | Services de synthèse vocale |
| Minimax TTS | Services de synthèse vocale |
+| Xiaomi MiMo TTS | Services de synthèse vocale |
| Volcano Engine TTS | Services de synthèse vocale |
## ❤️ Contribuer
diff --git a/README_ja.md b/README_ja.md
index bf114b2d8a..0e6d2dba49 100644
--- a/README_ja.md
+++ b/README_ja.md
@@ -185,6 +185,7 @@ AstrBot をよく使うチャットプラットフォームに接続できます
| Coze | LLMOps プラットフォーム |
| OpenAI Whisper | 音声認識サービス |
| SenseVoice | 音声認識サービス |
+| Xiaomi MiMo Omni | 音声認識サービス |
| OpenAI TTS | 音声合成サービス |
| Gemini TTS | 音声合成サービス |
| GPT-Sovits-Inference | 音声合成サービス |
@@ -194,6 +195,7 @@ AstrBot をよく使うチャットプラットフォームに接続できます
| Alibaba Cloud 百炼 TTS | 音声合成サービス |
| Azure TTS | 音声合成サービス |
| Minimax TTS | 音声合成サービス |
+| Xiaomi MiMo TTS | 音声合成サービス |
| Volcano Engine TTS | 音声合成サービス |
## ❤️ コントリビューション
diff --git a/README_ru.md b/README_ru.md
index ed309a9ee8..64d742c5a3 100644
--- a/README_ru.md
+++ b/README_ru.md
@@ -184,6 +184,7 @@ yay -S astrbot-git
| Coze | Платформы LLMOps |
| OpenAI Whisper | Сервисы распознавания речи |
| SenseVoice | Сервисы распознавания речи |
+| Xiaomi MiMo Omni | Сервисы распознавания речи |
| OpenAI TTS | Сервисы синтеза речи |
| Gemini TTS | Сервисы синтеза речи |
| GPT-Sovits-Inference | Сервисы синтеза речи |
@@ -193,6 +194,7 @@ yay -S astrbot-git
| Alibaba Cloud Bailian TTS | Сервисы синтеза речи |
| Azure TTS | Сервисы синтеза речи |
| Minimax TTS | Сервисы синтеза речи |
+| Xiaomi MiMo TTS | Сервисы синтеза речи |
| Volcano Engine TTS | Сервисы синтеза речи |
## ❤️ Вклад в проект
diff --git a/README_zh-TW.md b/README_zh-TW.md
index 89cce05e7f..5bfea0d279 100644
--- a/README_zh-TW.md
+++ b/README_zh-TW.md
@@ -184,6 +184,7 @@ yay -S astrbot-git
| Coze | LLMOps 平台 |
| OpenAI Whisper | 語音轉文字服務 |
| SenseVoice | 語音轉文字服務 |
+| Xiaomi MiMo Omni | 語音轉文字服務 |
| OpenAI TTS | 文字轉語音服務 |
| Gemini TTS | 文字轉語音服務 |
| GPT-Sovits-Inference | 文字轉語音服務 |
@@ -193,6 +194,7 @@ yay -S astrbot-git
| 阿里雲百煉 TTS | 文字轉語音服務 |
| Azure TTS | 文字轉語音服務 |
| Minimax TTS | 文字轉語音服務 |
+| Xiaomi MiMo TTS | 文字轉語音服務 |
| 火山引擎 TTS | 文字轉語音服務 |
## ❤️ 貢獻
diff --git a/README_zh.md b/README_zh.md
index 41099b069c..c852033524 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -185,6 +185,7 @@ yay -S astrbot-git
| Coze | LLMOps 平台 |
| OpenAI Whisper | 语音转文本 |
| SenseVoice | 语音转文本 |
+| Xiaomi MiMo Omni | 语音转文本 |
| OpenAI TTS | 文本转语音 |
| Gemini TTS | 文本转语音 |
| GPT-Sovits-Inference | 文本转语音 |
@@ -194,6 +195,7 @@ yay -S astrbot-git
| 阿里云百炼 TTS | 文本转语音 |
| Azure TTS | 文本转语音 |
| Minimax TTS | 文本转语音 |
+| Xiaomi MiMo TTS | 文本转语音 |
| 火山引擎 TTS | 文本转语音 |
## ❤️ 贡献