AI Speech Recognition in Unity

Introduction
This tutorial guides you through implementing state-of-the-art speech recognition in your Unity game using the Hugging Face Unity API. This feature can be used for giving commands, speaking with NPCs, improving accessibility, or anything else where converting spoken words to text might be useful.
To try speech recognition in Unity for yourself, check out the live demo on itch.io.
Prerequisites
This tutorial assumes basic knowledge of Unity. It also requires you to have installed the Hugging Face Unity API. For instructions on setting up the API, check out our earlier blog post.
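As a quick refresher: at the time of writing, the API could typically be installed through Unity's Package Manager via "Add package from git URL". The repository URL below is an assumption on our part and may have changed, so defer to the blog post if they disagree:

    https://github.com/huggingface/unity-api.git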
Steps
1. Set Up the Scene
In this tutorial, we'll set up a very simple scene where the player can start and stop a recording, and the result is converted to text.
Begin by creating a Unity project, then create a canvas with three UI elements (a sample hierarchy is sketched after this list):
- Start Button: This will start the recording.
- Stop Button: This will stop the recording.
- Text (TextMeshPro): The result of the speech recognition will be displayed here.
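For reference, a minimal scene hierarchy might look like the following (the object names are just suggestions):

    Canvas
    ├── StartButton (Button)
    ├── StopButton (Button)
    └── Text (TextMeshPro)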
2. Set Up the Script
Create a script called SpeechRecognitionTest and attach it to an empty GameObject.
In the script, define references to your UI components:
[SerializeField] private Button startButton;
[SerializeField] private Button stopButton;
[SerializeField] private TextMeshProUGUI text;
Assign them in the inspector.
Then, use the Start() method to set up listeners for the Start and Stop buttons:
private void Start() {
    startButton.onClick.AddListener(StartRecording);
    stopButton.onClick.AddListener(StopRecording);
}
At this point, your script should look something like this:
using TMPro;
using UnityEngine;
using UnityEngine.UI;

public class SpeechRecognitionTest : MonoBehaviour {
    [SerializeField] private Button startButton;
    [SerializeField] private Button stopButton;
    [SerializeField] private TextMeshProUGUI text;

    private void Start() {
        startButton.onClick.AddListener(StartRecording);
        stopButton.onClick.AddListener(StopRecording);
    }

    private void StartRecording() {

    }

    private void StopRecording() {

    }
}
3. Record Microphone Input
Now let's record microphone input and encode it in WAV format. Start by defining the member variables:
private AudioClip clip;
private byte[] bytes;
private bool recording;
Then, in StartRecording(), use the Microphone.Start() method to start recording:
private void StartRecording() {
    clip = Microphone.Start(null, false, 10, 44100);
    recording = true;
}
This will record up to 10 seconds of audio at 44100 Hz; passing null as the device name records from the default microphone.
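If you want to target a specific device instead, Unity exposes the connected microphones through Microphone.devices. A minimal sketch (choosing the first device is just for illustration):

    // List the available microphones and record from an explicit one.
    foreach (var device in Microphone.devices) {
        Debug.Log($"Microphone found: {device}");
    }
    if (Microphone.devices.Length > 0) {
        // Same parameters as before: no looping, 10 seconds max, 44100 Hz.
        clip = Microphone.Start(Microphone.devices[0], false, 10, 44100);
    }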
If the recording reaches its maximum length of 10 seconds, we want to stop it automatically. Microphone.GetPosition() reports the current sample position of the recording, so once it reaches the clip's total sample count, the buffer is full. To handle this, write the following in the Update() method:
private void Update() {
    if (recording && Microphone.GetPosition(null) >= clip.samples) {
        StopRecording();
    }
}
Then, in StopRecording(), truncate the recording and encode it in WAV format:
private void StopRecording() {
    var position = Microphone.GetPosition(null); // how many samples were actually recorded
    Microphone.End(null);
    var samples = new float[position * clip.channels]; // truncate to the recorded portion
    clip.GetData(samples, 0);
    bytes = EncodeAsWAV(samples, clip.frequency, clip.channels);
    recording = false;
}
Finally, we need to implement the EncodeAsWAV() method to prepare the audio data for the Hugging Face API:
private byte[] EncodeAsWAV(float[] samples, int frequency, int channels) {
    // 44-byte WAV header followed by 16-bit PCM sample data
    using (var memoryStream = new MemoryStream(44 + samples.Length * 2)) {
        using (var writer = new BinaryWriter(memoryStream)) {
            writer.Write("RIFF".ToCharArray());
            writer.Write(36 + samples.Length * 2); // file size minus the first 8 bytes
            writer.Write("WAVE".ToCharArray());
            writer.Write("fmt ".ToCharArray());
            writer.Write(16); // size of the fmt subchunk
            writer.Write((ushort)1); // audio format: 1 = PCM
            writer.Write((ushort)channels);
            writer.Write(frequency); // sample rate
            writer.Write(frequency * channels * 2); // byte rate
            writer.Write((ushort)(channels * 2)); // block align (bytes per sample frame)
            writer.Write((ushort)16); // bits per sample
            writer.Write("data".ToCharArray());
            writer.Write(samples.Length * 2); // size of the sample data in bytes
            foreach (var sample in samples) {
                writer.Write((short)(sample * short.MaxValue)); // float [-1, 1] to 16-bit PCM
            }
        }
        return memoryStream.ToArray();
    }
}
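As a quick sanity check on the sizes involved: a full 10-second mono recording at 44100 Hz produces 441,000 float samples, which encode to 882,000 bytes of 16-bit PCM, so the resulting WAV file is 44 + 882,000 = 882,044 bytes (roughly 860 KB).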
The full script should now look something like this:
using System.IO;
using TMPro;
using UnityEngine;
using UnityEngine.UI;

public class SpeechRecognitionTest : MonoBehaviour {
    [SerializeField] private Button startButton;
    [SerializeField] private Button stopButton;
    [SerializeField] private TextMeshProUGUI text;

    private AudioClip clip;
    private byte[] bytes;
    private bool recording;

    private void Start() {
        startButton.onClick.AddListener(StartRecording);
        stopButton.onClick.AddListener(StopRecording);
    }

    private void Update() {
        if (recording && Microphone.GetPosition(null) >= clip.samples) {
            StopRecording();
        }
    }

    private void StartRecording() {
        clip = Microphone.Start(null, false, 10, 44100);
        recording = true;
    }

    private void StopRecording() {
        var position = Microphone.GetPosition(null);
        Microphone.End(null);
        var samples = new float[position * clip.channels];
        clip.GetData(samples, 0);
        bytes = EncodeAsWAV(samples, clip.frequency, clip.channels);
        recording = false;
    }

    private byte[] EncodeAsWAV(float[] samples, int frequency, int channels) {
        using (var memoryStream = new MemoryStream(44 + samples.Length * 2)) {
            using (var writer = new BinaryWriter(memoryStream)) {
                writer.Write("RIFF".ToCharArray());
                writer.Write(36 + samples.Length * 2);
                writer.Write("WAVE".ToCharArray());
                writer.Write("fmt ".ToCharArray());
                writer.Write(16);
                writer.Write((ushort)1);
                writer.Write((ushort)channels);
                writer.Write(frequency);
                writer.Write(frequency * channels * 2);
                writer.Write((ushort)(channels * 2));
                writer.Write((ushort)16);
                writer.Write("data".ToCharArray());
                writer.Write(samples.Length * 2);
                foreach (var sample in samples) {
                    writer.Write((short)(sample * short.MaxValue));
                }
            }
            return memoryStream.ToArray();
        }
    }
}
To test that this code is working, you can add the following line to the end of the StopRecording() method:
File.WriteAllBytes(Application.dataPath + "/test.wav", bytes);
Now, if you click the Start button, speak into your microphone, and click Stop, a file called test.wav containing your recorded audio should be saved in your Unity Assets folder (in the Editor, Application.dataPath points to the project's Assets folder).
4. Speech Recognition
Next, we'll use the Hugging Face Unity API to run speech recognition on the encoded audio. To do so, we'll create a SendRecording() method:
using HuggingFace.API; // add this with the other imports at the top of the script

private void SendRecording() {
    HuggingFaceAPI.AutomaticSpeechRecognition(bytes, response => {
        text.color = Color.white;
        text.text = response;
    }, error => {
        text.color = Color.red;
        text.text = error;
    });
}
This sends the encoded audio to the API, displaying the response in white if it succeeds, or the error message in red otherwise.
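From there, the transcription can be handed off to any gameplay logic you like. As a purely hypothetical sketch of the "giving commands" use case from the introduction (HandleCommand and its cases are invented for illustration), you could call HandleCommand(response) from the success callback:

    // Hypothetical: map recognized phrases to simple game commands.
    private void HandleCommand(string transcription) {
        switch (transcription.Trim().ToLowerInvariant()) {
            case "jump":
                Debug.Log("Jump command received"); // replace with your jump logic
                break;
            case "open door":
                Debug.Log("Open door command received"); // replace with your door logic
                break;
            default:
                Debug.Log($"Unrecognized command: {transcription}");
                break;
        }
    }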
Don't forget to call SendRecording() at the end of the StopRecording() method:
private void StopRecording() {
    /* other code */
    SendRecording();
}
5. Final Touches
Finally, let's improve the user experience of this demo a little with button interactability and status messages.
The Start and Stop buttons should only be interactable when appropriate, i.e. when a recording is ready to be started or stopped.
While recording or waiting on the API, show a simple status message in the response text.
The finished script should look something like this:
using System.IO;
using HuggingFace.API;
using TMPro;
using UnityEngine;
using UnityEngine.UI;

public class SpeechRecognitionTest : MonoBehaviour {
    [SerializeField] private Button startButton;
    [SerializeField] private Button stopButton;
    [SerializeField] private TextMeshProUGUI text;

    private AudioClip clip;
    private byte[] bytes;
    private bool recording;

    private void Start() {
        startButton.onClick.AddListener(StartRecording);
        stopButton.onClick.AddListener(StopRecording);
        stopButton.interactable = false;
    }

    private void Update() {
        if (recording && Microphone.GetPosition(null) >= clip.samples) {
            StopRecording();
        }
    }

    private void StartRecording() {
        text.color = Color.white;
        text.text = "Recording...";
        startButton.interactable = false;
        stopButton.interactable = true;
        clip = Microphone.Start(null, false, 10, 44100);
        recording = true;
    }

    private void StopRecording() {
        var position = Microphone.GetPosition(null);
        Microphone.End(null);
        var samples = new float[position * clip.channels];
        clip.GetData(samples, 0);
        bytes = EncodeAsWAV(samples, clip.frequency, clip.channels);
        recording = false;
        SendRecording();
    }

    private void SendRecording() {
        text.color = Color.yellow;
        text.text = "Sending...";
        stopButton.interactable = false;
        HuggingFaceAPI.AutomaticSpeechRecognition(bytes, response => {
            text.color = Color.white;
            text.text = response;
            startButton.interactable = true;
        }, error => {
            text.color = Color.red;
            text.text = error;
            startButton.interactable = true;
        });
    }

    private byte[] EncodeAsWAV(float[] samples, int frequency, int channels) {
        using (var memoryStream = new MemoryStream(44 + samples.Length * 2)) {
            using (var writer = new BinaryWriter(memoryStream)) {
                writer.Write("RIFF".ToCharArray());
                writer.Write(36 + samples.Length * 2);
                writer.Write("WAVE".ToCharArray());
                writer.Write("fmt ".ToCharArray());
                writer.Write(16);
                writer.Write((ushort)1);
                writer.Write((ushort)channels);
                writer.Write(frequency);
                writer.Write(frequency * channels * 2);
                writer.Write((ushort)(channels * 2));
                writer.Write((ushort)16);
                writer.Write("data".ToCharArray());
                writer.Write(samples.Length * 2);
                foreach (var sample in samples) {
                    writer.Write((short)(sample * short.MaxValue));
                }
            }
            return memoryStream.ToArray();
        }
    }
}
Congratulations, you can now use state-of-the-art speech recognition in Unity!
If you have any questions or would like to get more involved with Hugging Face for Games, join the Hugging Face Discord!