基于上一篇:
https://blog.csdn.net/qq_17523181/article/details/148255809?spm=1001.2014.3001.5501
https://blog.csdn.net/qq_17523181/article/details/148264127?spm=1011.2415.3001.5331
讯飞默认的语音非常机械,更换为讯飞的超拟人语音
一、讯飞API
WebApi : wss://cbm01.cn-huabei-1.xf-yun.com/v1/private/mcd9m97e6
API文档地址:https://www.xfyun.cn/doc/spark/super%20smart-tts.html
- 示例
二、Unity编写连接脚本
注意1:超拟人语音是属于大模型版块,所以它的鉴权是大模型的鉴权逻辑
注意2:示例python使用的encoding是lame,在unity需要使用raw
- 建立XunfeiSmartTextToSpeech.cs文件
using System;
using System.Collections;
using System.Collections.Generic;
using System.Net.WebSockets;
using System.Security.Cryptography;
using System.Text;
using System.Threading;
using UnityEngine;
public class XunfeiSmartTextToSpeech : TTS
{
    #region 参数
    /// <summary>
    /// iFlytek application settings component (AppID / APIKey / APISecret).
    /// </summary>
    [SerializeField] private XunfeiSettings m_XunfeiSettings;
    /// <summary>
    /// Host address of the super-natural TTS service.
    /// </summary>
    [SerializeField] private string m_HostUrl = "cbm01.cn-huabei-1.xf-yun.com";
    /// <summary>
    /// Voice (vcn) used for synthesis.
    /// </summary>
    [Header("选择朗读的声音")]
    [SerializeField] private Speaker m_Vcn = Speaker.聆佑佑_童年女声;
    /// <summary>
    /// Volume, range [0-100], default 50.
    /// </summary>
    [SerializeField] private int m_Volume = 50;
    /// <summary>
    /// Pitch, range [0-100], default 50.
    /// </summary>
    [SerializeField] private int m_Pitch = 50;
    /// <summary>
    /// Speed, range [0-100], default 50.
    /// </summary>
    [SerializeField] private int m_Speed = 50;
    /// <summary>
    /// PCM sample rate in Hz, used BOTH for the sample_rate requested from the
    /// service and for the playback AudioClip. Fix: the original requested
    /// 24000 Hz audio but created the clip at 16000 Hz, so playback was
    /// ~1.5x too slow and low-pitched.
    /// </summary>
    private const int SampleRate = 24000;
    #endregion
    private void Awake()
    {
        // Fix: only look the settings up when not already assigned in the
        // Inspector (the original unconditionally overwrote the serialized field).
        if (m_XunfeiSettings == null)
        {
            m_XunfeiSettings = this.GetComponent<XunfeiSettings>();
        }
        m_PostURL = "wss://cbm01.cn-huabei-1.xf-yun.com/v1/private/mcd9m97e6";
    }
    /// <summary>
    /// Synthesize speech for the given text; the callback receives the
    /// streaming AudioClip together with the original text.
    /// </summary>
    /// <param name="_msg">Text to synthesize.</param>
    /// <param name="_callback">Invoked with (clip, text) once the clip is created.</param>
    public override void Speak(string _msg, Action<AudioClip, string> _callback)
    {
        StartCoroutine(GetSpeech(_msg, _callback));
    }
    /// <summary>
    /// WebSocket connection to the TTS service.
    /// </summary>
    private ClientWebSocket m_WebSocket;
    private CancellationToken m_CancellationToken;
    private AudioClip _audioClip;
    #region 获取鉴权Url
    /// <summary>
    /// Build the authenticated wss:// URL. The super-natural voice belongs to
    /// the Spark (large-model) platform, so it uses the large-model HMAC-SHA256
    /// signing scheme over "host / date / request-line".
    /// </summary>
    /// <returns>wss URL with authorization, date and host query parameters.</returns>
    private string GetAuthUrl()
    {
        // RFC1123 date ("r"); it is both signed and sent as the "date" parameter.
        string date = DateTime.UtcNow.ToString("r");
        Uri uri = new Uri(m_PostURL);
        StringBuilder builder = new StringBuilder("host: ").Append(uri.Host).Append("\n").//
            Append("date: ").Append(date).Append("\n").//
            Append("GET ").Append(uri.LocalPath).Append(" HTTP/1.1");
        string sha = HMACsha256(m_XunfeiSettings.m_APISecret, builder.ToString());
        string authorization = string.Format("api_key=\"{0}\", algorithm=\"{1}\", headers=\"{2}\", signature=\"{3}\"", m_XunfeiSettings.m_APIKey, "hmac-sha256", "host date request-line", sha);
        string NewUrl = "wss://" + uri.Host + uri.LocalPath;
        string path1 = "authorization" + "=" + Convert.ToBase64String(System.Text.Encoding.UTF8.GetBytes(authorization));
        // Percent-encode the characters that occur in an RFC1123 date.
        date = date.Replace(" ", "%20").Replace(":", "%3A").Replace(",", "%2C");
        string path2 = "date" + "=" + date;
        string path3 = "host" + "=" + uri.Host;
        NewUrl = NewUrl + "?" + path1 + "&" + path2 + "&" + path3;
        // Fix: the original logged the literal string "NewUrl" on its own line.
        Debug.Log("NewUrl: " + NewUrl);
        return NewUrl;
    }
    /// <summary>
    /// HMAC-SHA256 of <paramref name="buider"/> keyed with the API secret,
    /// returned base64-encoded.
    /// </summary>
    public string HMACsha256(string apiSecretIsKey, string buider)
    {
        byte[] keyBytes = System.Text.Encoding.UTF8.GetBytes(apiSecretIsKey);
        // using guarantees the HMAC (and its key material) is disposed even on
        // exception; the original only called Clear() on the success path.
        using (System.Security.Cryptography.HMACSHA256 hMACSHA256 = new System.Security.Cryptography.HMACSHA256(keyBytes))
        {
            byte[] hash = hMACSHA256.ComputeHash(System.Text.Encoding.UTF8.GetBytes(buider));
            return Convert.ToBase64String(hash);
        }
    }
    #endregion
    #region 语音合成
    /// <summary>
    /// Number of silent samples emitted while waiting for data (diagnostic only).
    /// </summary>
    private int m_AudioLenth;
    /// <summary>
    /// Queue of decoded PCM samples, filled by the receive loop and drained by
    /// the streaming AudioClip callback.
    /// </summary>
    Queue<float> m_AudioQueue = new Queue<float>();
    /// <summary>
    /// Start the synthesis: kick off the WebSocket request and hand a streaming
    /// AudioClip to the caller immediately; samples are filled in as they arrive.
    /// </summary>
    /// <param name="_text">Text to synthesize.</param>
    /// <param name="_callback">Invoked with (clip, text).</param>
    /// <returns>Coroutine enumerator.</returns>
    public IEnumerator GetSpeech(string _text, Action<AudioClip, string> _callback)
    {
        stopwatch.Restart();
        yield return null;
        // Abort any synthesis still in flight before starting a new one.
        if (m_WebSocket != null) { m_WebSocket.Abort(); }
        ConnectHost(_text);
        // Streaming clip (60 s capacity): OnAudioRead pulls samples from the queue.
        // Fix: clip frequency now matches the sample rate requested from the
        // service (see SampleRate) instead of a hard-coded 16000.
        _audioClip = AudioClip.Create("audio", SampleRate * 60, 1, SampleRate, true, OnAudioRead);
        //回调
        _callback(_audioClip, _text);
        stopwatch.Stop();
        UnityEngine.Debug.Log("讯飞超拟人语音合成耗时:" + stopwatch.Elapsed.TotalSeconds);
    }
    /// <summary>
    /// Streaming AudioClip callback: drain queued PCM samples, pad with silence
    /// while the network is still delivering data.
    /// </summary>
    void OnAudioRead(float[] data)
    {
        for (int i = 0; i < data.Length; i++)
        {
            if (m_AudioQueue.Count > 0)
            {
                data[i] = m_AudioQueue.Dequeue();
            }
            else
            {
                // Count silent samples only while the socket is still live.
                if (m_WebSocket == null || m_WebSocket.State != WebSocketState.Aborted) m_AudioLenth++;
                data[i] = 0;
            }
        }
    }
    /// <summary>
    /// Connect to the service, send the synthesis request and stream the
    /// returned base64 PCM frames into the sample queue.
    /// </summary>
    private async void ConnectHost(string text)
    {
        try
        {
            m_WebSocket = new ClientWebSocket();
            m_CancellationToken = new CancellationToken();
            Uri uri = new Uri(GetAuthUrl());
            Debug.Log(uri);
            await m_WebSocket.ConnectAsync(uri, m_CancellationToken);
            // The payload text must be base64-encoded per the API contract.
            text = Convert.ToBase64String(Encoding.UTF8.GetBytes(text));
            //发送的数据
            string _jsonData = TTSRequestBuilder.BuildTTSRequest(
                appId: m_XunfeiSettings.m_AppID,
                headerStatus: 2,
                vcn: GetVcn(m_Vcn),
                volume: m_Volume,
                speed: m_Speed,
                pitch: m_Pitch,
                payloadStatus: 2,
                payloadText: text
            );
            await m_WebSocket.SendAsync(new ArraySegment<byte>(Encoding.UTF8.GetBytes(_jsonData)), WebSocketMessageType.Binary, true, m_CancellationToken); //发送数据
            StringBuilder sb = new StringBuilder();
            while (m_WebSocket.State == WebSocketState.Open)
            {
                var buffer = new byte[4096];
                // Fix: honour the byte count reported by ReceiveAsync. The
                // original scanned for trailing 0x00 bytes, which threw an
                // IndexOutOfRangeException on an empty/all-zero buffer and could
                // discard legitimate data.
                WebSocketReceiveResult receiveResult = await m_WebSocket.ReceiveAsync(new ArraySegment<byte>(buffer), m_CancellationToken);//接受数据
                if (receiveResult.Count <= 0) continue;
                var str = Encoding.UTF8.GetString(buffer, 0, receiveResult.Count);
                sb.Append(str);
                // A server message may span several frames; the JSON object is
                // complete once the accumulated text ends with '}'.
                if (str.EndsWith("}"))
                {
                    //获取返回的数据
                    ResponseData _responseData = JsonUtility.FromJson<ResponseData>(sb.ToString());
                    sb.Clear();
                    if (_responseData.header.code != 0)
                    {
                        //返回错误
                        PrintErrorLog(_responseData.header.code);
                        m_WebSocket.Abort();
                        break;
                    }
                    if (_responseData.header.status != 0)
                    {
                        byte[] audioBytes = Convert.FromBase64String(_responseData.payload.audio.audio);
                        float[] audioData = ConvertByteToFloat(audioBytes);
                        lock (m_AudioQueue)
                        {
                            foreach (float f in audioData) m_AudioQueue.Enqueue(f);
                        }
                        // status == 2 marks the final frame of the session.
                        if (_responseData.header.status == 2)
                        {
                            m_WebSocket.Abort();
                            break;
                        }
                    }
                }
            }
        }
        catch (Exception ex)
        {
            Debug.LogError("报错信息: " + ex.Message);
            // Fix: null-guard — the socket may not have been created yet.
            m_WebSocket?.Dispose();
        }
    }
    /// <summary>
    /// Convert 16-bit little-endian PCM bytes to normalized [-1, 1) floats.
    /// </summary>
    float[] ConvertByteToFloat(byte[] byteArray)
    {
        // 假设是16位PCM数据 (assumes 16-bit little-endian PCM)
        float[] floatArray = new float[byteArray.Length / 2];
        for (int i = 0; i < floatArray.Length; i++)
        {
            short sample = (short)((byteArray[i * 2 + 1] << 8) | byteArray[i * 2]);
            floatArray[i] = sample / 32768.0f;
        }
        return floatArray;
    }
    /// <summary>
    /// Decode a base64 string, returning null (with a logged error) on bad input.
    /// </summary>
    public static byte[] Base64Decode(string base64String)
    {
        try
        {
            return Convert.FromBase64String(base64String);
        }
        catch (FormatException ex)
        {
            Debug.LogError($"Base64 解码失败: {ex.Message}");
            return null;
        }
    }
    #endregion
    #region 工具方法
    /// <summary>
    /// Log a human-readable description for a platform error code.
    /// </summary>
    /// <param name="status">Error code from the response header.</param>
    private void PrintErrorLog(int status)
    {
        switch (status) {
            case 10009:
                Debug.LogError("输入数据非法 / 检查输入数据");
                return;
            case 10010:
                Debug.LogError("没有授权许可或授权数已满 / 提交工单");
                return;
            case 10019:
                Debug.LogError("session超时 / 检查是否数据发送完毕但未关闭连接");
                return;
            case 10043:
                Debug.LogError("音频解码失败 / 检查aue参数,如果为speex,请确保音频是speex音频并分段压缩且与帧大小一致");
                return;
            case 10114:
                Debug.LogError("session 超时 / 会话时间超时,检查是否发送数据时间超过了60s");
                return;
            case 10139:
                Debug.LogError("参数错误 / 检查参数是否正确");
                return;
            case 10160:
                Debug.LogError("请求数据格式非法 / 检查请求数据是否是合法的json");
                return;
            case 10161:
                Debug.LogError("base64解码失败 / 检查发送的数据是否使用base64编码了");
                return;
            case 10163:
                Debug.LogError("参数校验失败 / 具体原因见详细的描述");
                return;
            case 10200:
                Debug.LogError("读取数据超时 / 检查是否累计10s未发送数据并且未关闭连接");
                return;
            case 10222:
                Debug.LogError("1.上传的数据超过了接口上限; 2.SSL证书无效; / 1.检查接口上传的数据(文本、音频、图片等)是否超越了接口的最大限制,可到相应的接口文档查询具体的上限; 2. 请将log导出发到工单");
                return;
            case 10223:
                Debug.LogError("lb 找不到节点 / 提交工单");
                return;
            case 10313:
                Debug.LogError("appid和apikey不匹配 / 检查appid是否合法");
                return;
            case 10317:
                Debug.LogError("版本非法 / 请到控制台提交工单联系技术人员");
                return;
            case 10700:
                Debug.LogError("引擎异常 / 按照报错原因的描述,对照开发文档检查输入输出,如果仍然无法排除问题,请提供sid以及接口返回的错误信息,到控制台提交工单联系技术人员排查。");
                return;
            case 11200:
                Debug.LogError("功能未授权 / 请先检查appid是否正确,并且确保该appid下添加了相关服务。若没问题,则按照如下方法排查。 1. 确认总调用量是否已超越限制,或者总次数授权已到期,若已超限或者已过期请联系商务人员。 2. 查看是否使用了未授权的功能,或者授权已过期。");
                return;
            case 11201:
                Debug.LogError("该APPID的每日交互次数超过限制 / 根据自身情况提交应用审核进行服务量提额,或者联系商务购买企业级正式接口,获得海量服务量权限以便商用。");
                return;
            case 11503:
                Debug.LogError("服务内部响应数据错误 / 提交工单");
                return;
            case 11502:
                Debug.LogError("服务配置错误 / 提交工单");
                return;
        }
        // Engine error band, documented separately from the fixed codes above.
        if (status >= 100001 && status <= 100010) {
            Debug.LogError("调用引擎时出现错误 / 请根据message中包含的errno前往 5.2引擎错误码 查看对应的说明及处理策略");
            return;
        }
        Debug.LogError("平台未知错误,错误代码:" + status);
    }
    /// <summary>
    /// Convert a 16-bit PCM byte array into AudioClip-readable floats.
    /// </summary>
    /// <param name="byteArray">Raw 16-bit PCM bytes.</param>
    /// <returns>Samples normalized to [-1, 1).</returns>
    public float[] BytesToFloat(byte[] byteArray)
    {
        float[] sounddata = new float[byteArray.Length / 2];
        for (int i = 0; i < sounddata.Length; i++)
        {
            sounddata[i] = BytesToFloat(byteArray[i * 2], byteArray[i * 2 + 1]);
        }
        return sounddata;
    }
    private float BytesToFloat(byte firstByte, byte secondByte)
    {
        // Byte order depends on host endianness.
        short s;
        if (BitConverter.IsLittleEndian)
            s = (short)((secondByte << 8) | firstByte);
        else
            s = (short)((firstByte << 8) | secondByte);
        // convert to range from -1 to (just below) 1
        return s / 32768.0F;
    }
    #endregion
    #region 数据定义
    /// <summary>
    /// Builds the JSON request body for the synthesis WebSocket message.
    /// </summary>
    public class TTSRequestBuilder
    {
        public static string BuildTTSRequest(
            string appId,
            int headerStatus,
            string vcn,
            int volume,
            int speed,
            int pitch,
            int payloadStatus,
            string payloadText)
        {
            // Assemble the request object graph expected by the service.
            var request = new TTSRequest
            {
                header = new Header
                {
                    app_id = appId,
                    status = headerStatus
                },
                parameter = new Parameter
                {
                    tts = new TTS
                    {
                        vcn = vcn,
                        volume = volume,
                        speed = speed,
                        pitch = pitch,
                        audio = new Audio() // defaults (raw PCM, see Audio)
                    }
                },
                payload = new Payload
                {
                    text = new Text
                    {
                        status = payloadStatus,
                        text = payloadText
                    }
                }
            };
            // Serialize with JsonUtility (pretty-printed).
            return JsonUtility.ToJson(request, true);
        }
    }
    [System.Serializable]
    public class TTSRequest
    {
        public Header header;
        public Parameter parameter;
        public Payload payload;
    }
    [System.Serializable]
    public class Header
    {
        public string app_id;
        public int status;
    }
    [System.Serializable]
    public class Parameter
    {
        public TTS tts;
    }
    [System.Serializable]
    public class TTS
    {
        public string vcn;
        public int volume;
        public int rhy = 0; // default
        public int speed;
        public int pitch;
        public int bgs = 0; // default
        public int reg = 0; // default
        public int rdn = 0; // default
        public Audio audio;
    }
    [System.Serializable]
    public class Audio
    {
        // "raw" PCM — Unity cannot decode the python sample's "lame" (mp3) output.
        public string encoding = "raw";
        // Must match the playback AudioClip frequency (see SampleRate).
        public int sample_rate = SampleRate;
        public int channels = 1; // default
        public int bit_depth = 16; // default
        public int frame_size = 0; // default
    }
    [System.Serializable]
    public class Payload
    {
        public Text text;
    }
    [System.Serializable]
    public class Text
    {
        public string encoding = "utf8"; // default
        public string compress = "raw"; // default
        public string format = "plain"; // default
        public int status;
        public int seq = 0; // default
        public string text;
    }
    /// <summary>
    /// Response envelope returned by the service.
    /// </summary>
    [Serializable]
    public class ResponseData
    {
        public ResHeader header;
        public ResPayload payload;
        public string message;
    }
    [Serializable]
    public class ResHeader
    {
        public int code;
        public string message;
        public string sid;
        public int status;
    }
    [Serializable]
    public class ResPayload
    {
        public ResAudio audio;
        public ResPybuf pybuf;
    }
    [Serializable]
    public class ResAudio
    {
        public string encoding;
        public int sample_rate;
        public int channels;
        public int bit_depth;
        public int status;
        public int seq;
        public int frame_size;
        public string audio;
    }
    [Serializable]
    public class ResPybuf
    {
        public string encoding;
        public string compress;
        public string format;
        public int status;
        public int seq;
        public string text;
    }
    #endregion
    #region 设置项
    public enum Speaker
    {
        聆飞逸_男声,
        聆小璇_女声,
        聆佑佑_童年女声,
        聆玉昭_女声,
        聆小璃_女声,
        聆飞哲_男声,
        聆小玥_女声,
        聆玉言_女声,
        聆小琪_女声
    }
    /// <summary>
    /// Map a Speaker choice to the service's vcn identifier.
    /// </summary>
    /// <param name="_speeker">Selected speaker.</param>
    /// <returns>vcn string; falls back to 聆飞逸 for unknown values.</returns>
    private string GetVcn(Speaker _speeker)
    {
        switch (_speeker)
        {
            case Speaker.聆飞逸_男声: return "x5_lingfeiyi_flow";
            case Speaker.聆小璇_女声: return "x4_lingxiaoxuan_oral";
            case Speaker.聆佑佑_童年女声: return "x4_lingyouyou_oral";
            case Speaker.聆玉昭_女声: return "x4_lingyuzhao_oral";
            case Speaker.聆小璃_女声: return "x4_lingxiaoli_oral";
            case Speaker.聆飞哲_男声: return "x4_lingfeizhe_oral";
            case Speaker.聆小玥_女声: return "x5_lingxiaoyue_flow";
            case Speaker.聆玉言_女声: return "x5_lingyuyan_flow";
            case Speaker.聆小琪_女声: return "x4_lingxiaoqi_oral";
            default: return "x5_lingfeiyi_flow";
        }
    }
    #endregion
}
- 与上一篇同样的方法:把脚本挂到场景对象上,在 Inspector 中填写好 AppID/APIKey/APISecret 等 API 信息并绑定回调后,就可以使用了