Skip to content

[Feature] 添加阿里云百炼语音识别 (DashScope STT) 支持,并建议重构 DashScope TTS 共用公共逻辑 #8271

@sch-chun

Description

@sch-chun

背景

目前 AstrBot 已经集成了阿里云百炼的语音合成(TTS)功能(dashscope_tts),但语音识别(STT)能力仍然缺失。希望使用阿里云百炼的语音转文字服务(例如 qwen3-asr-flash 模型)来实现语音交互等场景。

新增功能

本 Issue 附带了一个初步尝试但完整可用的 dashscope_stt.py 适配器,实现了 STTProvider 接口,使用千问3-ASR-Flash 模型进行非实时语音转文字。

代码文件

请将以下文件保存至 astrbot/core/provider/sources/dashscope_stt.py

import asyncio
import functools
import os
from typing import Any, Optional

from astrbot.core import logger
from astrbot.core.provider.entities import ProviderType
from astrbot.core.provider.provider import STTProvider
from astrbot.core.provider.register import register_provider_adapter

# Try to import the optional dashscope SDK. The module must stay importable
# without it, so on ImportError every SDK name is bound to None and
# DASHSCOPE_AVAILABLE records the outcome for the adapter to check later.
try:
    import dashscope
    from dashscope import MultiModalConversation
    from dashscope.api_entities.dashscope_response import MultiModalConversationResponse

    DASHSCOPE_AVAILABLE = True
except ImportError:
    DASHSCOPE_AVAILABLE = False
    dashscope = None      # type: ignore
    MultiModalConversation = None   # type: ignore
    MultiModalConversationResponse = None  # type: ignore

# Default configuration values, used when the provider config omits a key.
DEFAULT_DASHSCOPE_API_BASE = "https://dashscope.aliyuncs.com/api/v1"
DEFAULT_DASHSCOPE_MODEL = "qwen3-asr-flash"
# NOTE(review): DEFAULT_ASR_OPTIONS is not referenced anywhere in the visible
# code; confirm whether it is still needed.
DEFAULT_ASR_OPTIONS = {
    "enable_itn": False,
}


@register_provider_adapter(
    "dashscope_stt",
    "阿里云百炼语音识别 (DashScope)",
    provider_type=ProviderType.SPEECH_TO_TEXT,
    default_config_tmpl={
        "api_key": "",
        "model": DEFAULT_DASHSCOPE_MODEL,
        "api_base": DEFAULT_DASHSCOPE_API_BASE,
        "enable_itn": False,
        "language": "",
        "timeout": 60,
    },
    provider_display_name="阿里云百炼语音识别",
)
class ProviderDashScopeSTT(STTProvider):
    """Non-real-time speech-to-text adapter built on the DashScope SDK.

    Sends a single audio reference to a DashScope multimodal ASR model
    (``qwen3-asr-flash`` by default) and returns the recognized text.
    """

    def __init__(self, provider_config: dict, provider_settings: dict) -> None:
        """Read the provider configuration and configure the dashscope SDK.

        Args:
            provider_config: Adapter settings. Keys read here: ``api_key``
                (required), ``model``, ``api_base``, ``enable_itn``,
                ``language`` and ``timeout``.
            provider_settings: Forwarded unchanged to ``STTProvider``.

        Raises:
            ImportError: The ``dashscope`` package is not installed.
            ValueError: ``api_key`` is missing or empty.
        """
        super().__init__(provider_config, provider_settings)
        if not DASHSCOPE_AVAILABLE:
            raise ImportError(
                "dashscope 未安装,请运行 `pip install dashscope` 以使用阿里云百炼语音识别"
            )

        # Type-narrowing only; the availability check above is the real guard.
        assert dashscope is not None
        assert MultiModalConversation is not None

        self.api_key = provider_config.get("api_key", "")
        if not self.api_key:
            raise ValueError("dashscope_stt 配置缺少 api_key")

        self.model = provider_config.get("model", DEFAULT_DASHSCOPE_MODEL)
        self.api_base = provider_config.get("api_base", DEFAULT_DASHSCOPE_API_BASE)
        self.enable_itn = provider_config.get("enable_itn", False)
        self.language = provider_config.get("language", "")
        self.timeout = provider_config.get("timeout", 60)

        # These are process-wide SDK settings. They are kept for backward
        # compatibility, but get_text() also passes the key per call so that
        # another DashScope provider overwriting the global key cannot affect
        # this adapter's requests.
        dashscope.api_key = self.api_key
        dashscope.base_http_api_url = self.api_base

        self.set_model(self.model)

    def _build_asr_options(self) -> dict:
        """Return the ``asr_options`` payload for one recognition request.

        ``language`` is included only when a non-blank value is configured.
        """
        options = {"enable_itn": self.enable_itn}
        if self.language and self.language.strip():
            options["language"] = self.language.strip()
        return options

    def _normalize_audio_url(self, audio_url: str) -> str:
        """Turn an audio reference into a URL-style string.

        Values that already start with ``http://``, ``https://``, ``data:``
        or ``file://`` are returned as-is (after stripping whitespace); any
        other value is treated as a local path, made absolute and prefixed
        with ``file://``.
        """
        audio_url = audio_url.strip()
        if audio_url.startswith(("http://", "https://", "data:", "file://")):
            return audio_url
        if os.path.isabs(audio_url):
            return f"file://{audio_url}"
        abs_path = os.path.abspath(audio_url)
        return f"file://{abs_path}"

    @staticmethod
    def _extract_text(response: Any) -> str:
        """Validate a DashScope response and return the recognized text.

        Raises:
            Exception: The status code is not 200, the payload has no
                ``choices``, the message content is empty, or the text is
                empty.
        """
        if response.status_code != 200:
            raise Exception(
                f"DashScope STT 请求失败: status_code={response.status_code}, "
                f"message={response.message}"
            )

        output = response.output
        if not output or not output.get("choices"):
            raise Exception(f"DashScope STT 响应格式异常: {response}")

        message = output["choices"][0].get("message", {})
        content_list = message.get("content", [])
        if not content_list:
            raise Exception("识别结果为空")

        # `or ""` guards against a present-but-None "text" value.
        text = (content_list[0].get("text") or "").strip()
        if not text:
            raise Exception("识别文本为空")
        return text

    async def get_text(self, audio_url: str) -> str:
        """Transcribe the audio at ``audio_url`` and return the text.

        Args:
            audio_url: Remote URL, data URL, ``file://`` URL or local path.

        Returns:
            The recognized text with surrounding whitespace removed.

        Raises:
            Exception: The SDK call fails, or the response is not a
                successful, non-empty recognition result. The error is
                logged once before being re-raised.
        """
        normalized_audio_url = self._normalize_audio_url(audio_url)
        logger.debug(f"DashScope STT 识别请求: {normalized_audio_url}")

        messages = [{"role": "user", "content": [{"audio": normalized_audio_url}]}]

        assert MultiModalConversation is not None

        # NOTE(review): `timeout` is forwarded exactly as before; confirm
        # against the SDK which keyword it honours for the HTTP timeout.
        blocking_call = functools.partial(
            MultiModalConversation.call,
            model=self.model,
            messages=messages,
            api_key=self.api_key,
            result_format="message",
            asr_options=self._build_asr_options(),
            timeout=self.timeout,
        )

        try:
            # The SDK call is synchronous network I/O; running it in the
            # default executor keeps the event loop responsive meanwhile.
            loop = asyncio.get_running_loop()
            response: Any = await loop.run_in_executor(None, blocking_call)
            text = self._extract_text(response)
        except Exception as e:
            logger.error(f"DashScope STT 调用异常: {e}")
            raise

        logger.debug(f"DashScope STT 识别成功: {text[:50]}...")
        return text

    async def terminate(self) -> None:
        """Release resources; this adapter holds none, so this is a no-op."""
        pass

    async def test(self) -> None:
        """Check the provider by transcribing a fixed sample audio URL.

        Raises:
            Exception: Recognition fails or yields an empty result.
        """
        test_audio_url = "https://dashscope.oss-cn-beijing.aliyuncs.com/audios/welcome.mp3"
        result = await self.get_text(test_audio_url)
        if not result:
            raise Exception("测试识别结果为空")
        logger.info(f"DashScope STT 测试通过,识别结果: {result[:50]}")
配套的前端与配置修改

  1. 后端默认配置模板:在 astrbot/core/config/default.py 的 provider_group.metadata.provider.config_template 中添加 "阿里云百炼 STT(API)" 模板。例:
                    "dashscope_stt.api_key": {
                        "description": "API Key",
                        "type": "string",
                        "hint": "从阿里云百炼控制台获取的 API Key。",
                    },
                    "dashscope_stt.model": {
                        "description": "模型名称",
                        "type": "string",
                        "hint": "默认为 qwen3-asr-flash。",
                    },
                    "dashscope_stt.api_base": {
                        "description": "API Base URL",
                        "type": "string",
                        "hint": "北京地域:https://dashscope.aliyuncs.com/api/v1;新加坡地域:https://dashscope-intl.aliyuncs.com/api/v1",
                    },
                    "dashscope_stt.enable_itn": {
                        "description": "启用逆文本正则化",
                        "type": "bool",
                        "hint": "是否将识别结果中的数字、金额等转换为标准格式。",
                    },
                    "dashscope_stt.language": {
                        "description": "语言代码(可选)",
                        "type": "string",
                        "hint": "例如 zh, en, ja 等,留空则自动检测。",
                    },
                    "dashscope_stt.timeout": {
                        "description": "请求超时时间(秒)",
                        "type": "int",
                        "hint": "默认 60 秒。",
                    },
  2. 前端类型映射:在 dashboard/src/composables/useProviderSources.ts 的 oldVersionProviderTypeMapping 中添加:
    dashscope_stt: 'speech_to_text',

测试状态

  • 已通过本地测试,使用官方示例音频可以正确返回识别文本。
  • 适配器已正确注册并在 WebUI 中显示(完成上述前端修改后)。
  • 上传语音音频文件能正确返回识别结果。

重构建议:抽提 DashScope 公共逻辑

目前项目中已存在 dashscope_tts.py(TTS 适配器),与新增的 dashscope_stt.py 可能存在重复代码。

提议:创建 dashscope_common.py,抽提公共函数,然后让 dashscope_stt.py 和 dashscope_tts.py 共同使用这些公共函数。这样既能减少代码冗余,也便于未来增加新的 DashScope 能力。

Use Case / 使用场景

No response

Willing to Submit PR? / 是否愿意提交PR?

  • Yes, I am willing to submit a PR. / 是的,我愿意提交 PR。

Code of Conduct

Metadata

Metadata

Assignees

No one assigned

    Labels

    area:providerThe bug / feature is about AI Provider, Models, LLM Agent, LLM Agent Runner.enhancementNew feature or request

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions