From d519daf9b4647c76a061b436317340c260437bbb Mon Sep 17 00:00:00 2001 From: Richard Liu <2645345468@qq.com> Date: Thu, 19 Mar 2026 23:35:44 +0800 Subject: [PATCH 1/5] feat: add mimo tts provider support --- astrbot/core/config/default.py | 42 +++++ astrbot/core/provider/manager.py | 4 + .../provider/sources/mimo_tts_api_source.py | 149 ++++++++++++++++++ .../src/composables/useProviderSources.ts | 1 + .../en-US/features/config-metadata.json | 22 ++- .../ru-RU/features/config-metadata.json | 22 ++- .../zh-CN/features/config-metadata.json | 22 ++- dashboard/src/utils/providerUtils.js | 1 + 8 files changed, 260 insertions(+), 3 deletions(-) create mode 100644 astrbot/core/provider/sources/mimo_tts_api_source.py diff --git a/astrbot/core/config/default.py b/astrbot/core/config/default.py index 58c1726814..afba850193 100644 --- a/astrbot/core/config/default.py +++ b/astrbot/core/config/default.py @@ -1461,6 +1461,23 @@ class ChatProviderTemplate(TypedDict): "timeout": "20", "proxy": "", }, + "MiMo TTS(API)": { + "id": "mimo_tts", + "type": "mimo_tts_api", + "provider": "mimo", + "provider_type": "text_to_speech", + "enable": False, + "api_key": "", + "api_base": "https://api.xiaomimimo.com/v1", + "model": "mimo-v2-tts", + "mimo-tts-voice": "mimo_default", + "mimo-tts-format": "wav", + "mimo-tts-style-prompt": "", + "mimo-tts-dialect": "", + "mimo-tts-seed-text": "Hello, MiMo, have you had lunch?", + "timeout": "20", + "proxy": "", + }, "Genie TTS": { "id": "genie_tts", "provider": "genie_tts", @@ -2314,6 +2331,31 @@ class ChatProviderTemplate(TypedDict): "type": "string", "hint": "OpenAI TTS 的声音。OpenAI 默认支持:'alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer'", }, + "mimo-tts-voice": { + "description": "音色", + "type": "string", + "hint": "MiMo TTS 的音色名称。默认值为 'mimo_default'。", + }, + "mimo-tts-format": { + "description": "输出格式", + "type": "string", + "hint": "MiMo TTS 生成音频的格式,例如 'wav'。", + }, + "mimo-tts-style-prompt": { + "description": "风格提示词", + "type": "string", + 
"hint": "用于控制生成语音的说话风格、语气或情绪,例如温柔、活泼、沉稳等。可留空。", + }, + "mimo-tts-dialect": { + "description": "方言", + "type": "string", + "hint": "指定生成语音时使用的方言或口音,例如四川话、粤语口音等。可留空。", + }, + "mimo-tts-seed-text": { + "description": "种子文本", + "type": "string", + "hint": "用于引导音色和说话方式的参考文本,会影响生成语音的表达风格。", + }, "fishaudio-tts-character": { "description": "character", "type": "string", diff --git a/astrbot/core/provider/manager.py b/astrbot/core/provider/manager.py index 0df9f791ae..62e1fcd80f 100644 --- a/astrbot/core/provider/manager.py +++ b/astrbot/core/provider/manager.py @@ -399,6 +399,10 @@ def dynamic_import_provider(self, type: str) -> None: from .sources.openai_tts_api_source import ( ProviderOpenAITTSAPI as ProviderOpenAITTSAPI, ) + case "mimo_tts_api": + from .sources.mimo_tts_api_source import ( + ProviderMiMoTTSAPI as ProviderMiMoTTSAPI, + ) case "genie_tts": from .sources.genie_tts import ( GenieTTSProvider as GenieTTSProvider, diff --git a/astrbot/core/provider/sources/mimo_tts_api_source.py b/astrbot/core/provider/sources/mimo_tts_api_source.py new file mode 100644 index 0000000000..a9240203dc --- /dev/null +++ b/astrbot/core/provider/sources/mimo_tts_api_source.py @@ -0,0 +1,149 @@ +import base64 +import uuid +from pathlib import Path + +import httpx + +from ..entities import ProviderType +from ..provider import TTSProvider +from ..register import register_provider_adapter +from astrbot import logger +from astrbot.core.utils.astrbot_path import get_astrbot_temp_path + + +def normalize_timeout(timeout: int | str | None) -> int | None: + if timeout in (None, ""): + return None + if isinstance(timeout, str): + return int(timeout) + return timeout + + +def build_headers(api_key: str) -> dict[str, str]: + headers = {"Content-Type": "application/json"} + if api_key: + headers["api-key"] = api_key + headers["Authorization"] = f"Bearer {api_key}" + return headers + + +def get_temp_dir() -> Path: + temp_dir = Path(get_astrbot_temp_path()) + temp_dir.mkdir(parents=True, 
exist_ok=True) + return temp_dir + + +def create_http_client(timeout: int | None, proxy: str) -> httpx.AsyncClient: + client_kwargs: dict[str, object] = { + "timeout": timeout, + "follow_redirects": True, + } + if proxy: + logger.info("[MiMo API] Using proxy: %s", proxy) + client_kwargs["proxy"] = proxy + return httpx.AsyncClient(**client_kwargs) + + +def build_api_url(api_base: str) -> str: + normalized_api_base = api_base.rstrip("/") + if normalized_api_base.endswith("/chat/completions"): + return normalized_api_base + return normalized_api_base + "/chat/completions" + + +@register_provider_adapter( + "mimo_tts_api", + "MiMo TTS API", + provider_type=ProviderType.TEXT_TO_SPEECH, +) +class ProviderMiMoTTSAPI(TTSProvider): + def __init__( + self, + provider_config: dict, + provider_settings: dict, + ) -> None: + super().__init__(provider_config, provider_settings) + self.chosen_api_key = provider_config.get("api_key", "") + self.api_base = provider_config.get( + "api_base", + "https://api.xiaomimimo.com/v1", + ) + self.proxy = provider_config.get("proxy", "") + self.timeout = normalize_timeout(provider_config.get("timeout", 20)) + self.voice = provider_config.get("mimo-tts-voice", "mimo_default") + self.audio_format = provider_config.get("mimo-tts-format", "wav") + self.style_prompt = provider_config.get("mimo-tts-style-prompt", "") + self.dialect = provider_config.get("mimo-tts-dialect", "") + self.seed_text = provider_config.get( + "mimo-tts-seed-text", + "Hello, MiMo, have you had lunch?", + ) + self.set_model(provider_config.get("model", "mimo-v2-tts")) + self.client = create_http_client(self.timeout, self.proxy) + + def _build_user_prompt(self) -> str: + prompt_parts: list[str] = [] + + if self.style_prompt.strip(): + prompt_parts.append(self.style_prompt.strip()) + if self.dialect.strip(): + prompt_parts.append(f"Please use {self.dialect.strip()} when speaking.") + + if not prompt_parts: + return self.seed_text + + if self.seed_text.strip(): + 
prompt_parts.append(self.seed_text.strip()) + + return " ".join(prompt_parts) + + def _build_payload(self, text: str) -> dict: + return { + "model": self.model_name, + "messages": [ + { + "role": "user", + "content": self._build_user_prompt(), + }, + { + "role": "assistant", + "content": text, + }, + ], + "audio": { + "format": self.audio_format, + "voice": self.voice, + }, + } + + async def get_audio(self, text: str) -> str: + response = await self.client.post( + build_api_url(self.api_base), + headers=build_headers(self.chosen_api_key), + json=self._build_payload(text), + ) + + try: + response.raise_for_status() + except Exception as exc: + error_text = response.text[:1024] + raise Exception( + f"MiMo TTS API request failed: HTTP {response.status_code}, response: {error_text}" + ) from exc + + data = response.json() + audio_data = ( + data.get("choices", [{}])[0].get("message", {}).get("audio", {}).get("data") + ) + if not audio_data: + raise Exception(f"MiMo TTS API returned no audio payload: {data}") + + output_path = ( + get_temp_dir() / f"mimo_tts_api_{uuid.uuid4()}.{self.audio_format}" + ) + output_path.write_bytes(base64.b64decode(audio_data)) + return str(output_path) + + async def terminate(self): + if self.client: + await self.client.aclose() diff --git a/dashboard/src/composables/useProviderSources.ts b/dashboard/src/composables/useProviderSources.ts index 53959ca3e8..7ff12190e3 100644 --- a/dashboard/src/composables/useProviderSources.ts +++ b/dashboard/src/composables/useProviderSources.ts @@ -325,6 +325,7 @@ export function useProviderSources(options: UseProviderSourcesOptions) { openai_whisper_selfhost: 'speech_to_text', sensevoice_stt_selfhost: 'speech_to_text', openai_tts_api: 'text_to_speech', + mimo_tts_api: 'text_to_speech', edge_tts: 'text_to_speech', gsvi_tts_api: 'text_to_speech', fishaudio_tts_api: 'text_to_speech', diff --git a/dashboard/src/i18n/locales/en-US/features/config-metadata.json 
b/dashboard/src/i18n/locales/en-US/features/config-metadata.json index a51dfefbe4..005559a5a4 100644 --- a/dashboard/src/i18n/locales/en-US/features/config-metadata.json +++ b/dashboard/src/i18n/locales/en-US/features/config-metadata.json @@ -1391,6 +1391,26 @@ "description": "voice", "hint": "OpenAI TTS voice. OpenAI defaults: 'alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer'." }, + "mimo-tts-voice": { + "description": "Voice", + "hint": "MiMo TTS voice name. Default is 'mimo_default'." + }, + "mimo-tts-format": { + "description": "Output format", + "hint": "Audio format generated by MiMo TTS, for example 'wav'." + }, + "mimo-tts-style-prompt": { + "description": "Style prompt", + "hint": "Guides speaking style, tone, or emotion such as gentle, lively, or calm. Optional." + }, + "mimo-tts-dialect": { + "description": "Dialect", + "hint": "Target dialect or accent for generated speech, such as Sichuan dialect. Optional." + }, + "mimo-tts-seed-text": { + "description": "Seed text", + "hint": "Reference text used to guide voice characteristics and speaking style." + }, "fishaudio-tts-character": { "description": "character", "hint": "Fishaudio TTS character. Default is Klee. More roles: https://fish.audio/zh-CN/discovery" @@ -1518,4 +1538,4 @@ "helpMiddle": "or", "helpSuffix": "." } -} \ No newline at end of file +} diff --git a/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json b/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json index 56d12c9838..cf0e97acd8 100644 --- a/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json +++ b/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json @@ -1396,6 +1396,26 @@ "description": "API Base URL", "hint": "Голоса OpenAI TTS: alloy, echo и др." }, + "mimo-tts-voice": { + "description": "Голос", + "hint": "Имя голоса MiMo TTS. По умолчанию 'mimo_default'." + }, + "mimo-tts-format": { + "description": "Формат вывода", + "hint": "Формат аудио, создаваемого MiMo TTS, например 'wav'." 
+ }, + "mimo-tts-style-prompt": { + "description": "Подсказка стиля", + "hint": "Задает стиль речи, тон или эмоцию, например мягкий, живой или спокойный. Необязательно." + }, + "mimo-tts-dialect": { + "description": "Диалект", + "hint": "Диалект или акцент для синтезируемой речи, например сычуаньский диалект. Необязательно." + }, + "mimo-tts-seed-text": { + "description": "Начальный текст", + "hint": "Эталонный текст, который помогает задать особенности голоса и манеру речи." + }, "fishaudio-tts-character": { "description": "Персонаж", "hint": "Персонаж Fishaudio. По умолчанию Klee." @@ -1523,4 +1543,4 @@ "helpMiddle": "или", "helpSuffix": "." } -} \ No newline at end of file +} diff --git a/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json b/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json index 69c1746c6e..640bf27569 100644 --- a/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json +++ b/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json @@ -1393,6 +1393,26 @@ "description": "voice", "hint": "OpenAI TTS 的声音。OpenAI 默认支持:'alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer'" }, + "mimo-tts-voice": { + "description": "音色", + "hint": "MiMo TTS 的音色名称。默认值为 'mimo_default'。" + }, + "mimo-tts-format": { + "description": "输出格式", + "hint": "MiMo TTS 生成音频的格式,例如 'wav'。" + }, + "mimo-tts-style-prompt": { + "description": "风格提示词", + "hint": "用于控制生成语音的说话风格、语气或情绪,例如温柔、活泼、沉稳等。可留空。" + }, + "mimo-tts-dialect": { + "description": "方言", + "hint": "指定生成语音时使用的方言或口音,例如四川话、粤语口音等。可留空。" + }, + "mimo-tts-seed-text": { + "description": "种子文本", + "hint": "用于引导音色和说话方式的参考文本,会影响生成语音的表达风格。" + }, "fishaudio-tts-character": { "description": "character", "hint": "fishaudio TTS 的角色。默认为可莉。更多角色请访问:https://fish.audio/zh-CN/discovery" @@ -1520,4 +1540,4 @@ "helpMiddle": "或", "helpSuffix": "。" } -} \ No newline at end of file +} diff --git a/dashboard/src/utils/providerUtils.js b/dashboard/src/utils/providerUtils.js index 4bfe3ea6e9..273b921445 100644 --- 
a/dashboard/src/utils/providerUtils.js +++ b/dashboard/src/utils/providerUtils.js @@ -30,6 +30,7 @@ export function getProviderIcon(type) { 'lm_studio': 'https://cdn.jsdelivr.net/npm/@lobehub/icons-static-svg@latest/icons/lmstudio.svg', 'fishaudio': 'https://cdn.jsdelivr.net/npm/@lobehub/icons-static-svg@latest/icons/fishaudio.svg', 'minimax': 'https://cdn.jsdelivr.net/npm/@lobehub/icons-static-svg@latest/icons/minimax.svg', + 'mimo': 'https://platform.xiaomimimo.com/favicon.874c9507.png', '302ai': 'https://cdn.jsdelivr.net/npm/@lobehub/icons-static-svg@1.53.0/icons/ai302-color.svg', 'microsoft': 'https://cdn.jsdelivr.net/npm/@lobehub/icons-static-svg@latest/icons/microsoft.svg', 'vllm': 'https://cdn.jsdelivr.net/npm/@lobehub/icons-static-svg@latest/icons/vllm.svg', From a8236bae1a6e02cc9bd2509f48223b1d2b5b6877 Mon Sep 17 00:00:00 2001 From: Richard Liu <2645345468@qq.com> Date: Thu, 19 Mar 2026 23:59:30 +0800 Subject: [PATCH 2/5] fix: handle empty mimo tts choices --- astrbot/core/provider/sources/mimo_tts_api_source.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/astrbot/core/provider/sources/mimo_tts_api_source.py b/astrbot/core/provider/sources/mimo_tts_api_source.py index a9240203dc..82d6a4ab8c 100644 --- a/astrbot/core/provider/sources/mimo_tts_api_source.py +++ b/astrbot/core/provider/sources/mimo_tts_api_source.py @@ -132,9 +132,10 @@ async def get_audio(self, text: str) -> str: ) from exc data = response.json() - audio_data = ( - data.get("choices", [{}])[0].get("message", {}).get("audio", {}).get("data") - ) + choices = data.get("choices") or [] + first_choice = choices[0] if choices else {} + message = first_choice.get("message", {}) + audio_data = message.get("audio", {}).get("data") if not audio_data: raise Exception(f"MiMo TTS API returned no audio payload: {data}") From db30538fb3aa5babd6a849bfd3d3fc77f8d98f68 Mon Sep 17 00:00:00 2001 From: Richard Liu <2645345468@qq.com> Date: Fri, 20 Mar 2026 00:48:43 +0800 Subject: 
[PATCH 3/5] feat: add mimo stt provider support --- astrbot/core/config/default.py | 28 ++- astrbot/core/provider/manager.py | 4 + .../core/provider/sources/mimo_api_common.py | 129 ++++++++++++++ .../provider/sources/mimo_stt_api_source.py | 100 +++++++++++ .../provider/sources/mimo_tts_api_source.py | 73 ++------ .../src/composables/useProviderSources.ts | 1 + .../en-US/features/config-metadata.json | 12 +- .../ru-RU/features/config-metadata.json | 12 +- .../zh-CN/features/config-metadata.json | 12 +- tests/test_mimo_api_sources.py | 167 ++++++++++++++++++ 10 files changed, 475 insertions(+), 63 deletions(-) create mode 100644 astrbot/core/provider/sources/mimo_api_common.py create mode 100644 astrbot/core/provider/sources/mimo_stt_api_source.py create mode 100644 tests/test_mimo_api_sources.py diff --git a/astrbot/core/config/default.py b/astrbot/core/config/default.py index afba850193..a37e9eadf0 100644 --- a/astrbot/core/config/default.py +++ b/astrbot/core/config/default.py @@ -1431,6 +1431,20 @@ class ChatProviderTemplate(TypedDict): "model": "whisper-1", "proxy": "", }, + "MiMo STT(API)": { + "id": "mimo_stt", + "provider": "mimo", + "type": "mimo_stt_api", + "provider_type": "speech_to_text", + "enable": False, + "api_key": "", + "api_base": "https://api.xiaomimimo.com/v1", + "model": "mimo-v2-omni", + "mimo-stt-system-prompt": "You are a speech transcription assistant. 
Transcribe the spoken content from the audio exactly and return only the transcription text.", + "mimo-stt-user-prompt": "Please transcribe the content of the audio and return only the transcription text.", + "timeout": "20", + "proxy": "", + }, "Whisper(Local)": { "provider": "openai", "type": "openai_whisper_selfhost", @@ -2326,6 +2340,16 @@ class ChatProviderTemplate(TypedDict): "type": "int", "hint": "超时时间,单位为秒。", }, + "mimo-stt-system-prompt": { + "description": "系统提示词", + "type": "string", + "hint": "用于指导 MiMo STT 转录行为的 system prompt。", + }, + "mimo-stt-user-prompt": { + "description": "用户提示词", + "type": "string", + "hint": "附加给 MiMo STT 的用户提示词,用于约束返回结果格式。", + }, "openai-tts-voice": { "description": "voice", "type": "string", @@ -2334,12 +2358,12 @@ class ChatProviderTemplate(TypedDict): "mimo-tts-voice": { "description": "音色", "type": "string", - "hint": "MiMo TTS 的音色名称。默认值为 'mimo_default'。", + "hint": "MiMo TTS 的音色名称。可选值包括 'mimo_default'、'default_en'、'default_zh'。", }, "mimo-tts-format": { "description": "输出格式", "type": "string", - "hint": "MiMo TTS 生成音频的格式,例如 'wav'。", + "hint": "MiMo TTS 生成音频的格式。支持 'wav'、'mp3'、'pcm'。", }, "mimo-tts-style-prompt": { "description": "风格提示词", diff --git a/astrbot/core/provider/manager.py b/astrbot/core/provider/manager.py index 62e1fcd80f..a294a51e17 100644 --- a/astrbot/core/provider/manager.py +++ b/astrbot/core/provider/manager.py @@ -387,6 +387,10 @@ def dynamic_import_provider(self, type: str) -> None: from .sources.whisper_api_source import ( ProviderOpenAIWhisperAPI as ProviderOpenAIWhisperAPI, ) + case "mimo_stt_api": + from .sources.mimo_stt_api_source import ( + ProviderMiMoSTTAPI as ProviderMiMoSTTAPI, + ) case "openai_whisper_selfhost": from .sources.whisper_selfhosted_source import ( ProviderOpenAIWhisperSelfHost as ProviderOpenAIWhisperSelfHost, diff --git a/astrbot/core/provider/sources/mimo_api_common.py b/astrbot/core/provider/sources/mimo_api_common.py new file mode 100644 index 0000000000..d3bf75e66d --- 
/dev/null +++ b/astrbot/core/provider/sources/mimo_api_common.py @@ -0,0 +1,129 @@ +import base64 +import uuid +from pathlib import Path +from urllib.parse import urlparse + +import httpx + +from astrbot import logger +from astrbot.core.utils.astrbot_path import get_astrbot_temp_path +from astrbot.core.utils.io import download_file +from astrbot.core.utils.tencent_record_helper import ( + convert_to_pcm_wav, + tencent_silk_to_wav, +) + +DEFAULT_MIMO_API_BASE = "https://api.xiaomimimo.com/v1" +DEFAULT_MIMO_TTS_MODEL = "mimo-v2-tts" +DEFAULT_MIMO_TTS_VOICE = "mimo_default" +DEFAULT_MIMO_TTS_SEED_TEXT = "Hello, MiMo, have you had lunch?" +DEFAULT_MIMO_STT_MODEL = "mimo-v2-omni" +DEFAULT_MIMO_STT_SYSTEM_PROMPT = ( + "You are a speech transcription assistant. " + "Transcribe the spoken content from the audio exactly and return only the transcription text." +) +DEFAULT_MIMO_STT_USER_PROMPT = ( + "Please transcribe the content of the audio and return only the transcription text." +) + + +class MiMoAPIError(Exception): + pass + + +def normalize_timeout(timeout: int | str | None) -> int | None: + if timeout in (None, ""): + return None + if isinstance(timeout, str): + return int(timeout) + return timeout + + +def build_headers(api_key: str) -> dict[str, str]: + headers = {"Content-Type": "application/json"} + if api_key: + headers["Authorization"] = f"Bearer {api_key}" + return headers + + +def get_temp_dir() -> Path: + temp_dir = Path(get_astrbot_temp_path()) + temp_dir.mkdir(parents=True, exist_ok=True) + return temp_dir + + +def create_http_client(timeout: int | None, proxy: str) -> httpx.AsyncClient: + client_kwargs: dict[str, object] = { + "timeout": timeout, + "follow_redirects": True, + } + if proxy: + logger.info("[MiMo API] Using proxy: %s", proxy) + client_kwargs["proxy"] = proxy + return httpx.AsyncClient(**client_kwargs) + + +def build_api_url(api_base: str) -> str: + normalized_api_base = api_base.rstrip("/") + if 
normalized_api_base.endswith("/chat/completions"): + return normalized_api_base + return normalized_api_base + "/chat/completions" + + +async def _detect_audio_format(file_path: Path) -> str | None: + silk_header = b"SILK" + amr_header = b"#!AMR" + + try: + with file_path.open("rb") as file: + file_header = file.read(8) + except FileNotFoundError: + return None + + if silk_header in file_header: + return "silk" + if amr_header in file_header: + return "amr" + return None + + +async def prepare_audio_input(audio_source: str) -> tuple[str, list[Path]]: + cleanup_paths: list[Path] = [] + source_path = Path(audio_source) + is_remote = audio_source.startswith(("http://", "https://")) + is_tencent = "multimedia.nt.qq.com.cn" in audio_source if is_remote else False + + if is_remote: + parsed_url = urlparse(audio_source) + suffix = Path(parsed_url.path).suffix or ".input" + download_path = get_temp_dir() / f"mimo_audio_{uuid.uuid4().hex[:8]}{suffix}" + await download_file(audio_source, str(download_path)) + source_path = download_path + cleanup_paths.append(download_path) + + if not source_path.exists(): + raise FileNotFoundError(f"File does not exist: {source_path}") + + if source_path.suffix.lower() in {".amr", ".silk"} or is_tencent: + file_format = await _detect_audio_format(source_path) + if file_format in {"silk", "amr"}: + converted_path = get_temp_dir() / f"mimo_audio_{uuid.uuid4().hex[:8]}.wav" + cleanup_paths.append(converted_path) + if file_format == "silk": + logger.info("Converting silk file to wav for MiMo STT...") + await tencent_silk_to_wav(str(source_path), str(converted_path)) + else: + logger.info("Converting amr file to wav for MiMo STT...") + await convert_to_pcm_wav(str(source_path), str(converted_path)) + source_path = converted_path + + encoded_audio = base64.b64encode(source_path.read_bytes()).decode("utf-8") + return encoded_audio, cleanup_paths + + +def cleanup_files(paths: list[Path]) -> None: + for path in paths: + try: + 
path.unlink(missing_ok=True) + except Exception as exc: + logger.warning("Failed to remove temporary MiMo file %s: %s", path, exc) diff --git a/astrbot/core/provider/sources/mimo_stt_api_source.py b/astrbot/core/provider/sources/mimo_stt_api_source.py new file mode 100644 index 0000000000..9b03e2efc6 --- /dev/null +++ b/astrbot/core/provider/sources/mimo_stt_api_source.py @@ -0,0 +1,100 @@ +from ..entities import ProviderType +from ..provider import STTProvider +from ..register import register_provider_adapter +from .mimo_api_common import ( + DEFAULT_MIMO_API_BASE, + DEFAULT_MIMO_STT_MODEL, + DEFAULT_MIMO_STT_SYSTEM_PROMPT, + DEFAULT_MIMO_STT_USER_PROMPT, + MiMoAPIError, + build_api_url, + build_headers, + cleanup_files, + create_http_client, + normalize_timeout, + prepare_audio_input, +) + + +@register_provider_adapter( + "mimo_stt_api", + "MiMo STT API", + provider_type=ProviderType.SPEECH_TO_TEXT, +) +class ProviderMiMoSTTAPI(STTProvider): + def __init__( + self, + provider_config: dict, + provider_settings: dict, + ) -> None: + super().__init__(provider_config, provider_settings) + self.chosen_api_key = provider_config.get("api_key", "") + self.api_base = provider_config.get("api_base", DEFAULT_MIMO_API_BASE) + self.proxy = provider_config.get("proxy", "") + self.timeout = normalize_timeout(provider_config.get("timeout", 20)) + self.system_prompt = provider_config.get( + "mimo-stt-system-prompt", + DEFAULT_MIMO_STT_SYSTEM_PROMPT, + ) + self.user_prompt = provider_config.get( + "mimo-stt-user-prompt", + DEFAULT_MIMO_STT_USER_PROMPT, + ) + self.set_model(provider_config.get("model", DEFAULT_MIMO_STT_MODEL)) + self.client = create_http_client(self.timeout, self.proxy) + + async def get_text(self, audio_url: str) -> str: + audio_data_url, cleanup_paths = await prepare_audio_input(audio_url) + payload = { + "model": self.model_name, + "messages": [ + { + "role": "system", + "content": self.system_prompt, + }, + { + "role": "user", + "content": [ + { + "type": 
"input_audio", + "input_audio": { + "data": audio_data_url, + }, + }, + { + "type": "text", + "text": self.user_prompt, + }, + ], + }, + ], + "max_completion_tokens": 1024, + } + + try: + response = await self.client.post( + build_api_url(self.api_base), + headers=build_headers(self.chosen_api_key), + json=payload, + ) + try: + response.raise_for_status() + except Exception as exc: + error_text = response.text[:1024] + raise MiMoAPIError( + f"MiMo STT API request failed: HTTP {response.status_code}, response: {error_text}" + ) from exc + + data = response.json() + choices = data.get("choices") or [] + first_choice = choices[0] if choices else {} + content = first_choice.get("message", {}).get("content", "") + if not isinstance(content, str) or not content.strip(): + raise MiMoAPIError(f"MiMo STT API returned empty transcription: {data}") + return content.strip() + finally: + cleanup_files(cleanup_paths) + + async def terminate(self): + if self.client: + await self.client.aclose() diff --git a/astrbot/core/provider/sources/mimo_tts_api_source.py b/astrbot/core/provider/sources/mimo_tts_api_source.py index 82d6a4ab8c..daad55e1e6 100644 --- a/astrbot/core/provider/sources/mimo_tts_api_source.py +++ b/astrbot/core/provider/sources/mimo_tts_api_source.py @@ -1,54 +1,21 @@ import base64 import uuid -from pathlib import Path - -import httpx from ..entities import ProviderType from ..provider import TTSProvider from ..register import register_provider_adapter -from astrbot import logger -from astrbot.core.utils.astrbot_path import get_astrbot_temp_path - - -def normalize_timeout(timeout: int | str | None) -> int | None: - if timeout in (None, ""): - return None - if isinstance(timeout, str): - return int(timeout) - return timeout - - -def build_headers(api_key: str) -> dict[str, str]: - headers = {"Content-Type": "application/json"} - if api_key: - headers["api-key"] = api_key - headers["Authorization"] = f"Bearer {api_key}" - return headers - - -def get_temp_dir() -> 
Path: - temp_dir = Path(get_astrbot_temp_path()) - temp_dir.mkdir(parents=True, exist_ok=True) - return temp_dir - - -def create_http_client(timeout: int | None, proxy: str) -> httpx.AsyncClient: - client_kwargs: dict[str, object] = { - "timeout": timeout, - "follow_redirects": True, - } - if proxy: - logger.info("[MiMo API] Using proxy: %s", proxy) - client_kwargs["proxy"] = proxy - return httpx.AsyncClient(**client_kwargs) - - -def build_api_url(api_base: str) -> str: - normalized_api_base = api_base.rstrip("/") - if normalized_api_base.endswith("/chat/completions"): - return normalized_api_base - return normalized_api_base + "/chat/completions" +from .mimo_api_common import ( + DEFAULT_MIMO_API_BASE, + DEFAULT_MIMO_TTS_MODEL, + DEFAULT_MIMO_TTS_SEED_TEXT, + DEFAULT_MIMO_TTS_VOICE, + MiMoAPIError, + build_api_url, + build_headers, + create_http_client, + get_temp_dir, + normalize_timeout, +) @register_provider_adapter( @@ -64,21 +31,17 @@ def __init__( ) -> None: super().__init__(provider_config, provider_settings) self.chosen_api_key = provider_config.get("api_key", "") - self.api_base = provider_config.get( - "api_base", - "https://api.xiaomimimo.com/v1", - ) + self.api_base = provider_config.get("api_base", DEFAULT_MIMO_API_BASE) self.proxy = provider_config.get("proxy", "") self.timeout = normalize_timeout(provider_config.get("timeout", 20)) - self.voice = provider_config.get("mimo-tts-voice", "mimo_default") + self.voice = provider_config.get("mimo-tts-voice", DEFAULT_MIMO_TTS_VOICE) self.audio_format = provider_config.get("mimo-tts-format", "wav") self.style_prompt = provider_config.get("mimo-tts-style-prompt", "") self.dialect = provider_config.get("mimo-tts-dialect", "") self.seed_text = provider_config.get( - "mimo-tts-seed-text", - "Hello, MiMo, have you had lunch?", + "mimo-tts-seed-text", DEFAULT_MIMO_TTS_SEED_TEXT ) - self.set_model(provider_config.get("model", "mimo-v2-tts")) + self.set_model(provider_config.get("model", DEFAULT_MIMO_TTS_MODEL)) 
self.client = create_http_client(self.timeout, self.proxy) def _build_user_prompt(self) -> str: @@ -127,7 +90,7 @@ async def get_audio(self, text: str) -> str: response.raise_for_status() except Exception as exc: error_text = response.text[:1024] - raise Exception( + raise MiMoAPIError( f"MiMo TTS API request failed: HTTP {response.status_code}, response: {error_text}" ) from exc @@ -137,7 +100,7 @@ async def get_audio(self, text: str) -> str: message = first_choice.get("message", {}) audio_data = message.get("audio", {}).get("data") if not audio_data: - raise Exception(f"MiMo TTS API returned no audio payload: {data}") + raise MiMoAPIError(f"MiMo TTS API returned no audio payload: {data}") output_path = ( get_temp_dir() / f"mimo_tts_api_{uuid.uuid4()}.{self.audio_format}" diff --git a/dashboard/src/composables/useProviderSources.ts b/dashboard/src/composables/useProviderSources.ts index 7ff12190e3..e646d59cc0 100644 --- a/dashboard/src/composables/useProviderSources.ts +++ b/dashboard/src/composables/useProviderSources.ts @@ -322,6 +322,7 @@ export function useProviderSources(options: UseProviderSourcesOptions) { coze: 'agent_runner', dashscope: 'chat_completion', openai_whisper_api: 'speech_to_text', + mimo_stt_api: 'speech_to_text', openai_whisper_selfhost: 'speech_to_text', sensevoice_stt_selfhost: 'speech_to_text', openai_tts_api: 'text_to_speech', diff --git a/dashboard/src/i18n/locales/en-US/features/config-metadata.json b/dashboard/src/i18n/locales/en-US/features/config-metadata.json index 005559a5a4..12b5af7f26 100644 --- a/dashboard/src/i18n/locales/en-US/features/config-metadata.json +++ b/dashboard/src/i18n/locales/en-US/features/config-metadata.json @@ -1387,17 +1387,25 @@ "description": "Timeout", "hint": "Timeout in seconds." }, + "mimo-stt-system-prompt": { + "description": "System prompt", + "hint": "System prompt used to guide MiMo STT transcription behavior." 
+ }, + "mimo-stt-user-prompt": { + "description": "User prompt", + "hint": "Additional user prompt sent to MiMo STT to constrain the returned transcription format." + }, "openai-tts-voice": { "description": "voice", "hint": "OpenAI TTS voice. OpenAI defaults: 'alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer'." }, "mimo-tts-voice": { "description": "Voice", - "hint": "MiMo TTS voice name. Default is 'mimo_default'." + "hint": "MiMo TTS voice name. Supported values include 'mimo_default', 'default_en', and 'default_zh'." }, "mimo-tts-format": { "description": "Output format", - "hint": "Audio format generated by MiMo TTS, for example 'wav'." + "hint": "Audio format generated by MiMo TTS. Supported values: 'wav', 'mp3', and 'pcm'." }, "mimo-tts-style-prompt": { "description": "Style prompt", diff --git a/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json b/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json index cf0e97acd8..6424a3f705 100644 --- a/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json +++ b/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json @@ -1392,17 +1392,25 @@ "description": "Таймаут (сек)", "hint": "Максимальное время ожидания ответа." }, + "mimo-stt-system-prompt": { + "description": "Системный промпт", + "hint": "System prompt, который управляет поведением MiMo STT при распознавании." + }, + "mimo-stt-user-prompt": { + "description": "Пользовательский промпт", + "hint": "Дополнительный user prompt для MiMo STT, который помогает задать формат результата." + }, "openai-tts-voice": { "description": "API Base URL", "hint": "Голоса OpenAI TTS: alloy, echo и др." }, "mimo-tts-voice": { "description": "Голос", - "hint": "Имя голоса MiMo TTS. По умолчанию 'mimo_default'." + "hint": "Имя голоса MiMo TTS. Поддерживаются значения 'mimo_default', 'default_en' и 'default_zh'." }, "mimo-tts-format": { "description": "Формат вывода", - "hint": "Формат аудио, создаваемого MiMo TTS, например 'wav'." 
+ "hint": "Формат аудио, создаваемого MiMo TTS. Поддерживаются 'wav', 'mp3' и 'pcm'." }, "mimo-tts-style-prompt": { "description": "Подсказка стиля", diff --git a/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json b/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json index 640bf27569..bde8dab7e7 100644 --- a/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json +++ b/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json @@ -1389,17 +1389,25 @@ "description": "超时时间", "hint": "超时时间,单位为秒。" }, + "mimo-stt-system-prompt": { + "description": "系统提示词", + "hint": "用于指导 MiMo STT 转录行为的 system prompt。" + }, + "mimo-stt-user-prompt": { + "description": "用户提示词", + "hint": "附加给 MiMo STT 的用户提示词,用于约束返回结果格式。" + }, "openai-tts-voice": { "description": "voice", "hint": "OpenAI TTS 的声音。OpenAI 默认支持:'alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer'" }, "mimo-tts-voice": { "description": "音色", - "hint": "MiMo TTS 的音色名称。默认值为 'mimo_default'。" + "hint": "MiMo TTS 的音色名称。可选值包括 'mimo_default'、'default_en'、'default_zh'。" }, "mimo-tts-format": { "description": "输出格式", - "hint": "MiMo TTS 生成音频的格式,例如 'wav'。" + "hint": "MiMo TTS 生成音频的格式。支持 'wav'、'mp3'、'pcm'。" }, "mimo-tts-style-prompt": { "description": "风格提示词", diff --git a/tests/test_mimo_api_sources.py b/tests/test_mimo_api_sources.py new file mode 100644 index 0000000000..d262f1c275 --- /dev/null +++ b/tests/test_mimo_api_sources.py @@ -0,0 +1,167 @@ +import asyncio +from types import SimpleNamespace + +import pytest + +from astrbot.core.provider.sources.mimo_api_common import MiMoAPIError, build_headers +from astrbot.core.provider.sources.mimo_stt_api_source import ProviderMiMoSTTAPI +from astrbot.core.provider.sources.mimo_tts_api_source import ProviderMiMoTTSAPI + + +def _make_tts_provider(overrides: dict | None = None) -> ProviderMiMoTTSAPI: + provider_config = { + "id": "test-mimo-tts", + "type": "mimo_tts_api", + "model": "mimo-v2-tts", + "api_key": "test-key", + "mimo-tts-voice": "mimo_default", 
+ "mimo-tts-format": "wav", + "mimo-tts-seed-text": "seed text", + } + if overrides: + provider_config.update(overrides) + return ProviderMiMoTTSAPI(provider_config=provider_config, provider_settings={}) + + +def _make_stt_provider(overrides: dict | None = None) -> ProviderMiMoSTTAPI: + provider_config = { + "id": "test-mimo-stt", + "type": "mimo_stt_api", + "model": "mimo-v2-omni", + "api_key": "test-key", + } + if overrides: + provider_config.update(overrides) + return ProviderMiMoSTTAPI(provider_config=provider_config, provider_settings={}) + + +def test_mimo_tts_prompt_returns_seed_text_when_no_style_or_dialect(): + provider = _make_tts_provider() + try: + assert provider._build_user_prompt() == "seed text" + finally: + asyncio.run(provider.terminate()) + + +def test_mimo_tts_payload_includes_dialect_and_style_prompt(): + provider = _make_tts_provider( + { + "mimo-tts-style-prompt": "Please sound cheerful and lively.", + "mimo-tts-dialect": "Sichuan dialect", + "mimo-tts-seed-text": "You are chatting with a close friend.", + } + ) + try: + payload = provider._build_payload("hello") + assert payload["messages"][0]["content"] == ( + "Please sound cheerful and lively. " + "Please use Sichuan dialect when speaking. " + "You are chatting with a close friend." 
+ ) + finally: + asyncio.run(provider.terminate()) + + +def test_mimo_headers_use_single_authorization_method(): + assert build_headers("test-key") == { + "Content-Type": "application/json", + "Authorization": "Bearer test-key", + } + + +@pytest.mark.asyncio +async def test_mimo_tts_get_audio_handles_empty_choices(): + provider = _make_tts_provider() + + class _Response: + status_code = 200 + text = '{"choices":[]}' + + def raise_for_status(self): + return None + + def json(self): + return {"choices": []} + + provider.client = SimpleNamespace(post=_fake_post(_Response())) + + with pytest.raises(MiMoAPIError, match="returned no audio payload"): + await provider.get_audio("hello") + + +@pytest.mark.asyncio +async def test_mimo_stt_payload_includes_audio_and_prompt(monkeypatch): + provider = _make_stt_provider( + { + "mimo-stt-system-prompt": "system prompt", + "mimo-stt-user-prompt": "user prompt", + } + ) + + captured: dict = {} + + async def fake_prepare_audio_input(_audio_source: str): + return "ZmFrZQ==", [] + + class _Response: + status_code = 200 + text = '{"choices":[{"message":{"content":"transcribed text"}}]}' + + def raise_for_status(self): + return None + + def json(self): + return {"choices": [{"message": {"content": "transcribed text"}}]} + + async def fake_post(_url, headers=None, json=None): + captured["headers"] = headers + captured["json"] = json + return _Response() + + monkeypatch.setattr( + "astrbot.core.provider.sources.mimo_stt_api_source.prepare_audio_input", + fake_prepare_audio_input, + ) + provider.client = SimpleNamespace(post=fake_post) + + result = await provider.get_text("/tmp/test.wav") + + assert result == "transcribed text" + assert captured["json"]["messages"][0]["content"] == "system prompt" + assert captured["json"]["messages"][1]["content"][0]["type"] == "input_audio" + assert captured["json"]["messages"][1]["content"][0]["input_audio"]["data"] == "ZmFrZQ==" + assert captured["json"]["messages"][1]["content"][1]["text"] == "user 
prompt" + + +@pytest.mark.asyncio +async def test_mimo_stt_get_text_handles_empty_choices(monkeypatch): + provider = _make_stt_provider() + + async def fake_prepare_audio_input(_audio_source: str): + return "ZmFrZQ==", [] + + class _Response: + status_code = 200 + text = '{"choices":[]}' + + def raise_for_status(self): + return None + + def json(self): + return {"choices": []} + + monkeypatch.setattr( + "astrbot.core.provider.sources.mimo_stt_api_source.prepare_audio_input", + fake_prepare_audio_input, + ) + provider.client = SimpleNamespace(post=_fake_post(_Response())) + + with pytest.raises(MiMoAPIError, match="returned empty transcription"): + await provider.get_text("/tmp/test.wav") + + +def _fake_post(response): + async def _post(*_args, **_kwargs): + return response + + return _post From c395ff7b57664154778acc3bda327262686e559a Mon Sep 17 00:00:00 2001 From: Richard Liu <2645345468@qq.com> Date: Mon, 23 Mar 2026 00:22:49 +0800 Subject: [PATCH 4/5] fix: align mimo tts style payload with official docs --- astrbot/core/config/default.py | 6 +- .../provider/sources/mimo_tts_api_source.py | 56 ++++++++---- .../en-US/features/config-metadata.json | 6 +- .../ru-RU/features/config-metadata.json | 6 +- .../zh-CN/features/config-metadata.json | 6 +- tests/test_mimo_api_sources.py | 90 ++++++++++++++++--- 6 files changed, 129 insertions(+), 41 deletions(-) diff --git a/astrbot/core/config/default.py b/astrbot/core/config/default.py index a37e9eadf0..68d8050a8d 100644 --- a/astrbot/core/config/default.py +++ b/astrbot/core/config/default.py @@ -2368,17 +2368,17 @@ class ChatProviderTemplate(TypedDict): "mimo-tts-style-prompt": { "description": "风格提示词", "type": "string", - "hint": "用于控制生成语音的说话风格、语气或情绪,例如温柔、活泼、沉稳等。可留空。", + "hint": "会以 <style> 标签形式添加到待合成文本开头,用于控制语速、情绪、角色或风格,例如 开心、变快、孙悟空、悄悄话。可留空。", }, "mimo-tts-dialect": { "description": "方言", "type": "string", - "hint": "指定生成语音时使用的方言或口音,例如四川话、粤语口音等。可留空。", + "hint": "会与风格提示词一起写入开头的 <style> 标签中,例如 东北话、四川话、河南话、粤语。可留空。", }, 
"mimo-tts-seed-text": { "description": "种子文本", "type": "string", - "hint": "用于引导音色和说话方式的参考文本,会影响生成语音的表达风格。", + "hint": "作为可选的 user 消息发送,用于辅助调节语气和风格,不会拼接到待合成文本中。", }, "fishaudio-tts-character": { "description": "character", diff --git a/astrbot/core/provider/sources/mimo_tts_api_source.py b/astrbot/core/provider/sources/mimo_tts_api_source.py index daad55e1e6..2966bfb7d8 100644 --- a/astrbot/core/provider/sources/mimo_tts_api_source.py +++ b/astrbot/core/provider/sources/mimo_tts_api_source.py @@ -44,35 +44,53 @@ def __init__( self.set_model(provider_config.get("model", DEFAULT_MIMO_TTS_MODEL)) self.client = create_http_client(self.timeout, self.proxy) - def _build_user_prompt(self) -> str: - prompt_parts: list[str] = [] + def _build_user_prompt(self) -> str | None: + seed_text = self.seed_text.strip() + return seed_text or None + + def _build_style_prefix(self) -> str: + style_parts: list[str] = [] if self.style_prompt.strip(): - prompt_parts.append(self.style_prompt.strip()) + style_parts.append(self.style_prompt.strip()) if self.dialect.strip(): - prompt_parts.append(f"Please use {self.dialect.strip()} when speaking.") + style_parts.append(self.dialect.strip()) + + style_content = " ".join(style_parts).strip() + if not style_content: + return "" - if not prompt_parts: - return self.seed_text + # MiMo recommends using only the singing style tag at the very beginning. 
+ if "唱歌" in style_content: + return "<style>唱歌</style>" - if self.seed_text.strip(): - prompt_parts.append(self.seed_text.strip()) + return f"<style>{style_content}</style>" - return " ".join(prompt_parts) + def _build_assistant_content(self, text: str) -> str: + return f"{self._build_style_prefix()}{text}" def _build_payload(self, text: str) -> dict: - return { - "model": self.model_name, - "messages": [ + messages: list[dict[str, str]] = [] + + user_prompt = self._build_user_prompt() + if user_prompt: + messages.append( { "role": "user", - "content": self._build_user_prompt(), - }, - { - "role": "assistant", - "content": text, - }, - ], + "content": user_prompt, + } + ) + + messages.append( + { + "role": "assistant", + "content": self._build_assistant_content(text), + } + ) + + return { + "model": self.model_name, + "messages": messages, "audio": { "format": self.audio_format, "voice": self.voice, diff --git a/dashboard/src/i18n/locales/en-US/features/config-metadata.json b/dashboard/src/i18n/locales/en-US/features/config-metadata.json index 12b5af7f26..f78ad90e40 100644 --- a/dashboard/src/i18n/locales/en-US/features/config-metadata.json +++ b/dashboard/src/i18n/locales/en-US/features/config-metadata.json @@ -1409,15 +1409,15 @@ }, "mimo-tts-style-prompt": { "description": "Style prompt", - "hint": "Guides speaking style, tone, or emotion such as gentle, lively, or calm. Optional." + "hint": "Prepended to the synthesis target text as a <style> tag to control speed, emotion, character, or style, such as happy, faster, Sun Wukong, or whispering. Optional." }, "mimo-tts-dialect": { "description": "Dialect", - "hint": "Target dialect or accent for generated speech, such as Sichuan dialect. Optional." + "hint": "Combined with the style prompt inside the leading <style> tag, for example Northeastern Mandarin, Sichuan dialect, Henan dialect, or Cantonese. Optional." 
}, "mimo-tts-seed-text": { "description": "Seed text", - "hint": "Reference text used to guide voice characteristics and speaking style." 
+ "hint": "Sent as an optional user message to help guide tone and speaking style. It is not appended to the synthesis target text." }, "fishaudio-tts-character": { "description": "character", diff --git a/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json b/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json index 6424a3f705..e8353d95a3 100644 --- a/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json +++ b/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json @@ -1414,15 +1414,15 @@ }, "mimo-tts-style-prompt": { "description": "Подсказка стиля", - "hint": "Задает стиль речи, тон или эмоцию, например мягкий, живой или спокойный. Необязательно." + "hint": "Добавляется в начало синтезируемого текста в виде тега <style> и управляет скоростью, эмоцией, ролью или манерой речи. Необязательно." }, "mimo-tts-dialect": { "description": "Диалект", - "hint": "Диалект или акцент для синтезируемой речи, например сычуаньский диалект. Необязательно." + "hint": "Объединяется с подсказкой стиля внутри начального тега <style>, например северо-восточный, сычуаньский, хэнаньский или кантонский вариант речи. Необязательно." }, "mimo-tts-seed-text": { "description": "Начальный текст", - "hint": "Эталонный текст, который помогает задать особенности голоса и манеру речи." + "hint": "Отправляется как необязательное user-сообщение для настройки тона и манеры речи. Не добавляется к самому тексту синтеза." 
}, "fishaudio-tts-character": { "description": "Персонаж", diff --git a/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json b/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json index bde8dab7e7..c18ac23be2 100644 --- a/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json +++ b/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json @@ -1411,15 +1411,15 @@ }, "mimo-tts-style-prompt": { "description": "风格提示词", - "hint": "用于控制生成语音的说话风格、语气或情绪,例如温柔、活泼、沉稳等。可留空。" + "hint": "会以 <style> 标签形式添加到待合成文本开头,用于控制语速、情绪、角色或风格,例如 开心、变快、孙悟空、悄悄话。可留空。" }, "mimo-tts-dialect": { "description": "方言", - "hint": "指定生成语音时使用的方言或口音,例如四川话、粤语口音等。可留空。" + "hint": "会与风格提示词一起写入开头的 <style> 标签中,例如 东北话、四川话、河南话、粤语。可留空。" }, "mimo-tts-seed-text": { "description": "种子文本", - "hint": "用于引导音色和说话方式的参考文本,会影响生成语音的表达风格。" + "hint": "作为可选的 user 消息发送,用于辅助调节语气和风格,不会拼接到待合成文本中。" }, "fishaudio-tts-character": { "description": "character", diff --git a/tests/test_mimo_api_sources.py b/tests/test_mimo_api_sources.py index d262f1c275..c2b02aa136 100644 --- a/tests/test_mimo_api_sources.py +++ b/tests/test_mimo_api_sources.py @@ -35,7 +35,7 @@ def _make_stt_provider(overrides: dict | None = None) -> ProviderMiMoSTTAPI: return ProviderMiMoSTTAPI(provider_config=provider_config, provider_settings={}) -def test_mimo_tts_prompt_returns_seed_text_when_no_style_or_dialect(): +def test_mimo_tts_user_prompt_returns_seed_text(): provider = _make_tts_provider() try: assert provider._build_user_prompt() == "seed text" @@ -43,21 +43,88 @@ def test_mimo_tts_prompt_returns_seed_text_when_no_style_or_dialect(): asyncio.run(provider.terminate()) -def test_mimo_tts_payload_includes_dialect_and_style_prompt(): +def test_mimo_tts_assistant_content_prefixes_style_and_dialect(): provider = _make_tts_provider( { - "mimo-tts-style-prompt": "Please sound cheerful and lively.", - "mimo-tts-dialect": "Sichuan dialect", + "mimo-tts-style-prompt": "开心", + "mimo-tts-dialect": "四川话", "mimo-tts-seed-text": 
"You are chatting with a 
close friend.", } ) try: payload = provider._build_payload("hello") - assert payload["messages"][0]["content"] == ( - "Please sound cheerful and lively. " - "Please use Sichuan dialect when speaking. " - "You are chatting with a close friend." - ) + assert payload["messages"][0] == { + "role": "user", + "content": "You are chatting with a close friend.", + } + assert payload["messages"][1]["content"] == "<style>开心 四川话</style>hello" + finally: + asyncio.run(provider.terminate()) + + +def test_mimo_tts_payload_omits_user_message_without_seed_text(): + provider = _make_tts_provider( + { + "mimo-tts-seed-text": "", + "mimo-tts-style-prompt": "开心", + } + ) + try: + payload = provider._build_payload("hello") + assert payload["messages"] == [ + { + "role": "assistant", + "content": "<style>开心</style>hello", + } + ] + finally: + asyncio.run(provider.terminate()) + + +def test_mimo_tts_singing_style_uses_single_style_tag(): + provider = _make_tts_provider( + { + "mimo-tts-style-prompt": "唱歌 开心", + "mimo-tts-dialect": "粤语", + } + ) + try: + payload = provider._build_payload("歌词") + assert payload["messages"][1]["content"] == "<style>唱歌</style>歌词" + finally: + asyncio.run(provider.terminate()) + + +def test_mimo_tts_plain_text_stays_in_assistant_message_when_no_style(): + provider = _make_tts_provider( + { + "mimo-tts-seed-text": "", + } + ) + try: + payload = provider._build_payload("hello") + assert payload["messages"] == [ + { + "role": "assistant", + "content": "hello", + } + ] + finally: + asyncio.run(provider.terminate()) + + +def test_mimo_tts_seed_text_is_not_prepended_to_assistant_content(): + provider = _make_tts_provider( + { + "mimo-tts-style-prompt": "开心", + "mimo-tts-seed-text": "reference text", + } + ) + try: + payload = provider._build_payload("明天就是周五了") + assert payload["messages"][0]["content"] == "reference text" + assert payload["messages"][1]["content"] == "<style>开心</style>明天就是周五了" + assert "reference text" not in payload["messages"][1]["content"] finally: 
asyncio.run(provider.terminate()) @@ -129,7 +196,10 @@ async def 
fake_post(_url, headers=None, json=None): assert result == "transcribed text" assert captured["json"]["messages"][0]["content"] == "system prompt" assert captured["json"]["messages"][1]["content"][0]["type"] == "input_audio" - assert captured["json"]["messages"][1]["content"][0]["input_audio"]["data"] == "ZmFrZQ==" + assert ( + captured["json"]["messages"][1]["content"][0]["input_audio"]["data"] + == "ZmFrZQ==" + ) assert captured["json"]["messages"][1]["content"][1]["text"] == "user prompt" From 49c835853a82f6df0d8068ba5890cdcad0be30b4 Mon Sep 17 00:00:00 2001 From: Richard Liu <2645345468@qq.com> Date: Tue, 31 Mar 2026 01:43:44 +0800 Subject: [PATCH 5/5] docs: add Xiaomi MiMo Omni and TTS services to multiple language READMEs --- README.md | 2 ++ README_fr.md | 2 ++ README_ja.md | 2 ++ README_ru.md | 2 ++ README_zh-TW.md | 2 ++ README_zh.md | 2 ++ 6 files changed, 12 insertions(+) diff --git a/README.md b/README.md index 469b8c4020..a901c5e137 100644 --- a/README.md +++ b/README.md @@ -184,6 +184,7 @@ Connect AstrBot to your favorite chat platform. | Coze | LLMOps Platforms | | OpenAI Whisper | Speech-to-Text Services | | SenseVoice | Speech-to-Text Services | +| Xiaomi MiMo Omni | Speech-to-Text Services | | OpenAI TTS | Text-to-Speech Services | | Gemini TTS | Text-to-Speech Services | | GPT-Sovits-Inference | Text-to-Speech Services | @@ -193,6 +194,7 @@ Connect AstrBot to your favorite chat platform. | Alibaba Cloud Bailian TTS | Text-to-Speech Services | | Azure TTS | Text-to-Speech Services | | Minimax TTS | Text-to-Speech Services | +| Xiaomi MiMo TTS | Text-to-Speech Services | | Volcano Engine TTS | Text-to-Speech Services | ## ❤️ Sponsors diff --git a/README_fr.md b/README_fr.md index 31ab1f3835..979ba62f67 100644 --- a/README_fr.md +++ b/README_fr.md @@ -184,6 +184,7 @@ Connectez AstrBot à vos plateformes de chat préférées. 
| Coze | Plateformes LLMOps | | OpenAI Whisper | Services de reconnaissance vocale | | SenseVoice | Services de reconnaissance vocale | +| Xiaomi MiMo Omni | Services de reconnaissance vocale | | OpenAI TTS | Services de synthèse vocale | | Gemini TTS | Services de synthèse vocale | | GPT-Sovits-Inference | Services de synthèse vocale | @@ -193,6 +194,7 @@ Connectez AstrBot à vos plateformes de chat préférées. | Alibaba Cloud Bailian TTS | Services de synthèse vocale | | Azure TTS | Services de synthèse vocale | | Minimax TTS | Services de synthèse vocale | +| Xiaomi MiMo TTS | Services de synthèse vocale | | Volcano Engine TTS | Services de synthèse vocale | ## ❤️ Contribuer diff --git a/README_ja.md b/README_ja.md index bf114b2d8a..0e6d2dba49 100644 --- a/README_ja.md +++ b/README_ja.md @@ -185,6 +185,7 @@ AstrBot をよく使うチャットプラットフォームに接続できます | Coze | LLMOps プラットフォーム | | OpenAI Whisper | 音声認識サービス | | SenseVoice | 音声認識サービス | +| Xiaomi MiMo Omni | 音声認識サービス | | OpenAI TTS | 音声合成サービス | | Gemini TTS | 音声合成サービス | | GPT-Sovits-Inference | 音声合成サービス | @@ -194,6 +195,7 @@ AstrBot をよく使うチャットプラットフォームに接続できます | Alibaba Cloud 百炼 TTS | 音声合成サービス | | Azure TTS | 音声合成サービス | | Minimax TTS | 音声合成サービス | +| Xiaomi MiMo TTS | 音声合成サービス | | Volcano Engine TTS | 音声合成サービス | ## ❤️ コントリビューション diff --git a/README_ru.md b/README_ru.md index ed309a9ee8..64d742c5a3 100644 --- a/README_ru.md +++ b/README_ru.md @@ -184,6 +184,7 @@ yay -S astrbot-git | Coze | Платформы LLMOps | | OpenAI Whisper | Сервисы распознавания речи | | SenseVoice | Сервисы распознавания речи | +| Xiaomi MiMo Omni | Сервисы распознавания речи | | OpenAI TTS | Сервисы синтеза речи | | Gemini TTS | Сервисы синтеза речи | | GPT-Sovits-Inference | Сервисы синтеза речи | @@ -193,6 +194,7 @@ yay -S astrbot-git | Alibaba Cloud Bailian TTS | Сервисы синтеза речи | | Azure TTS | Сервисы синтеза речи | | Minimax TTS | Сервисы синтеза речи | +| Xiaomi MiMo TTS | Сервисы синтеза речи | | Volcano Engine TTS | Сервисы синтеза речи | ## ❤️ Вклад 
в проект diff --git a/README_zh-TW.md b/README_zh-TW.md index 89cce05e7f..5bfea0d279 100644 --- a/README_zh-TW.md +++ b/README_zh-TW.md @@ -184,6 +184,7 @@ yay -S astrbot-git | Coze | LLMOps 平台 | | OpenAI Whisper | 語音轉文字服務 | | SenseVoice | 語音轉文字服務 | +| Xiaomi MiMo Omni | 語音轉文字服務 | | OpenAI TTS | 文字轉語音服務 | | Gemini TTS | 文字轉語音服務 | | GPT-Sovits-Inference | 文字轉語音服務 | @@ -193,6 +194,7 @@ yay -S astrbot-git | 阿里雲百煉 TTS | 文字轉語音服務 | | Azure TTS | 文字轉語音服務 | | Minimax TTS | 文字轉語音服務 | +| Xiaomi MiMo TTS | 文字轉語音服務 | | 火山引擎 TTS | 文字轉語音服務 | ## ❤️ 貢獻 diff --git a/README_zh.md b/README_zh.md index 41099b069c..c852033524 100644 --- a/README_zh.md +++ b/README_zh.md @@ -185,6 +185,7 @@ yay -S astrbot-git | Coze | LLMOps 平台 | | OpenAI Whisper | 语音转文本 | | SenseVoice | 语音转文本 | +| Xiaomi MiMo Omni | 语音转文本 | | OpenAI TTS | 文本转语音 | | Gemini TTS | 文本转语音 | | GPT-Sovits-Inference | 文本转语音 | @@ -194,6 +195,7 @@ yay -S astrbot-git | 阿里云百炼 TTS | 文本转语音 | | Azure TTS | 文本转语音 | | Minimax TTS | 文本转语音 | +| Xiaomi MiMo TTS | 文本转语音 | | 火山引擎 TTS | 文本转语音 | ## ❤️ 贡献