AI Speech Recognition in Unity

Introduction
This tutorial guides you through implementing state-of-the-art speech recognition in your Unity game using the Hugging Face Unity API. This feature can be used for giving commands, speaking with NPCs, improving accessibility, or any other functionality where converting spoken words to text may be useful.
To try speech recognition in Unity for yourself, check out the live demo on itch.io.
Prerequisites
This tutorial assumes basic knowledge of Unity. It also requires you to have installed the Hugging Face Unity API. For instructions on setting up the API, check out our earlier blog post.
Steps
1. Set Up the Scene
In this tutorial, we'll set up a very simple scene where the player can start and stop a recording, and the result is converted to text.
Begin by creating a Unity project, then create a canvas with three UI elements:
- Start Button: starts the recording.
- Stop Button: stops the recording.
- Text (TextMeshPro): the result of the speech recognition will be displayed here.
2. Set Up the Script
Create a script called SpeechRecognitionTest and attach it to an empty GameObject.
In the script, define references to your UI components:
[SerializeField] private Button startButton;
[SerializeField] private Button stopButton;
[SerializeField] private TextMeshProUGUI text;
Assign them in the inspector.
Then, use the Start() method to set up listeners for the start and stop buttons:
private void Start() {
    startButton.onClick.AddListener(StartRecording);
    stopButton.onClick.AddListener(StopRecording);
}
At this point, your script should look something like this:
using TMPro;
using UnityEngine;
using UnityEngine.UI;

public class SpeechRecognitionTest : MonoBehaviour {
    [SerializeField] private Button startButton;
    [SerializeField] private Button stopButton;
    [SerializeField] private TextMeshProUGUI text;

    private void Start() {
        startButton.onClick.AddListener(StartRecording);
        stopButton.onClick.AddListener(StopRecording);
    }

    private void StartRecording() {
    }

    private void StopRecording() {
    }
}
3. Record Microphone Input
Now, let's record microphone input and encode it in WAV format. Start by defining the member variables:
private AudioClip clip;
private byte[] bytes;
private bool recording;
Then, in StartRecording(), start recording with the Microphone.Start() method:
private void StartRecording() {
    clip = Microphone.Start(null, false, 10, 44100);
    recording = true;
}
This will record up to 10 seconds of audio at 44100 Hz.
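Passing null as the device name records from the default microphone. If you want to let the player pick a specific microphone instead, Unity exposes the names of connected recording devices through Microphone.devices. Here is a minimal sketch of how that could look; the MicrophoneSelector class and deviceIndex field are made up for illustration and are not part of this tutorial's script:

using UnityEngine;

public class MicrophoneSelector : MonoBehaviour {
    // Hypothetical index into Microphone.devices; adjust in the inspector.
    [SerializeField] private int deviceIndex = 0;

    private void Start() {
        // Microphone.devices lists the names of all connected recording devices.
        foreach (var device in Microphone.devices) {
            Debug.Log("Found microphone: " + device);
        }
    }

    public AudioClip StartRecording() {
        // Passing a device name instead of null selects that microphone;
        // an out-of-range index falls back to the default device.
        var deviceName = deviceIndex < Microphone.devices.Length
            ? Microphone.devices[deviceIndex]
            : null;
        return Microphone.Start(deviceName, false, 10, 44100);
    }
}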
In case the recording reaches the maximum length of 10 seconds, we want to stop it automatically. To do so, write the following in the Update() method:
private void Update() {
    if (recording && Microphone.GetPosition(null) >= clip.samples) {
        StopRecording();
    }
}
Then, in StopRecording(), truncate the recording and encode it in WAV format:
private void StopRecording() {
    // Capture how far into the clip the microphone got, then stop it.
    var position = Microphone.GetPosition(null);
    Microphone.End(null);
    // Truncate to just the recorded portion (samples are interleaved across channels).
    var samples = new float[position * clip.channels];
    clip.GetData(samples, 0);
    bytes = EncodeAsWAV(samples, clip.frequency, clip.channels);
    recording = false;
}
Finally, we need to implement the EncodeAsWAV() method, to prepare the audio data in the format expected by the Hugging Face API:
private byte[] EncodeAsWAV(float[] samples, int frequency, int channels) {
    // A WAV file is a 44-byte RIFF header followed by the raw 16-bit PCM data,
    // so we need 2 bytes per sample plus the header.
    using (var memoryStream = new MemoryStream(44 + samples.Length * 2)) {
        using (var writer = new BinaryWriter(memoryStream)) {
            writer.Write("RIFF".ToCharArray());
            writer.Write(36 + samples.Length * 2); // file size minus the first 8 bytes
            writer.Write("WAVE".ToCharArray());
            writer.Write("fmt ".ToCharArray());
            writer.Write(16); // size of the "fmt " chunk
            writer.Write((ushort)1); // audio format: 1 = uncompressed PCM
            writer.Write((ushort)channels);
            writer.Write(frequency); // sample rate
            writer.Write(frequency * channels * 2); // byte rate
            writer.Write((ushort)(channels * 2)); // block align: bytes per sample frame
            writer.Write((ushort)16); // bits per sample
            writer.Write("data".ToCharArray());
            writer.Write(samples.Length * 2); // size of the PCM data in bytes
            foreach (var sample in samples) {
                // Convert each float sample in [-1, 1] to a signed 16-bit integer.
                writer.Write((short)(sample * short.MaxValue));
            }
        }
        return memoryStream.ToArray();
    }
}
The full script should now look something like this:
using System.IO;
using TMPro;
using UnityEngine;
using UnityEngine.UI;

public class SpeechRecognitionTest : MonoBehaviour {
    [SerializeField] private Button startButton;
    [SerializeField] private Button stopButton;
    [SerializeField] private TextMeshProUGUI text;

    private AudioClip clip;
    private byte[] bytes;
    private bool recording;

    private void Start() {
        startButton.onClick.AddListener(StartRecording);
        stopButton.onClick.AddListener(StopRecording);
    }

    private void Update() {
        if (recording && Microphone.GetPosition(null) >= clip.samples) {
            StopRecording();
        }
    }

    private void StartRecording() {
        clip = Microphone.Start(null, false, 10, 44100);
        recording = true;
    }

    private void StopRecording() {
        var position = Microphone.GetPosition(null);
        Microphone.End(null);
        var samples = new float[position * clip.channels];
        clip.GetData(samples, 0);
        bytes = EncodeAsWAV(samples, clip.frequency, clip.channels);
        recording = false;
    }

    private byte[] EncodeAsWAV(float[] samples, int frequency, int channels) {
        using (var memoryStream = new MemoryStream(44 + samples.Length * 2)) {
            using (var writer = new BinaryWriter(memoryStream)) {
                writer.Write("RIFF".ToCharArray());
                writer.Write(36 + samples.Length * 2);
                writer.Write("WAVE".ToCharArray());
                writer.Write("fmt ".ToCharArray());
                writer.Write(16);
                writer.Write((ushort)1);
                writer.Write((ushort)channels);
                writer.Write(frequency);
                writer.Write(frequency * channels * 2);
                writer.Write((ushort)(channels * 2));
                writer.Write((ushort)16);
                writer.Write("data".ToCharArray());
                writer.Write(samples.Length * 2);
                foreach (var sample in samples) {
                    writer.Write((short)(sample * short.MaxValue));
                }
            }
            return memoryStream.ToArray();
        }
    }
}
To test whether this code is working correctly, you can add the following line to the end of the StopRecording() method:
File.WriteAllBytes(Application.dataPath + "/test.wav", bytes);
Now, if you click the Start button, speak into the microphone, and click Stop, a file called test.wav should be saved in your Unity Assets folder, containing your recorded audio.
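If you'd rather verify the recording without leaving Unity, you could also load the saved file back and play it. Below is an optional sketch that assumes an AudioSource component on the same GameObject and a recent Unity version (for UnityWebRequest.Result); the WavPlaybackTest class name is made up for illustration:

using System.Collections;
using UnityEngine;
using UnityEngine.Networking;

public class WavPlaybackTest : MonoBehaviour {
    private IEnumerator Start() {
        // Load the WAV file saved by StopRecording() from the Assets folder.
        var uri = "file://" + Application.dataPath + "/test.wav";
        using (var request = UnityWebRequestMultimedia.GetAudioClip(uri, AudioType.WAV)) {
            yield return request.SendWebRequest();
            if (request.result != UnityWebRequest.Result.Success) {
                Debug.LogError(request.error);
                yield break;
            }
            // Play the decoded clip through the attached AudioSource.
            var clip = DownloadHandlerAudioClip.GetContent(request);
            GetComponent<AudioSource>().PlayOneShot(clip);
        }
    }
}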
4. Speech Recognition
Next, we'll use the Hugging Face Unity API to run speech recognition on the encoded audio. To do so, we'll create a SendRecording() method:
using HuggingFace.API;

private void SendRecording() {
    HuggingFaceAPI.AutomaticSpeechRecognition(bytes, response => {
        text.color = Color.white;
        text.text = response;
    }, error => {
        text.color = Color.red;
        text.text = error;
    });
}
This sends the encoded audio to the API, displaying the response in white if successful, or the error message in red otherwise.
Don't forget to call SendRecording() at the end of the StopRecording() method:
private void StopRecording() {
    /* other code */
    SendRecording();
}
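Once recognition works, wiring the transcription into gameplay, one of the use cases mentioned in the introduction, is mostly string handling. As a hedged sketch (the phrase table and the Action callbacks are assumptions for illustration, not part of the Hugging Face API), you could dispatch commands like this:

using System;
using UnityEngine;

public static class VoiceCommands {
    // Map recognized phrases to game actions; extend the keywords as needed.
    public static void Dispatch(string transcription, Action onJump, Action onStop) {
        var lower = transcription.ToLowerInvariant();
        if (lower.Contains("jump")) onJump?.Invoke();
        else if (lower.Contains("stop")) onStop?.Invoke();
        else Debug.Log("No command matched: " + transcription);
    }
}

For example, calling VoiceCommands.Dispatch(response, Jump, Halt) inside the success callback would trigger your own Jump or Halt methods whenever those words are recognized.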
5. Final Touches
Finally, let's improve the user experience of this demo a little with button interactability and status messages.
The start and stop buttons should only be interactable when appropriate, i.e. when a recording is ready to be started or stopped.
While recording or waiting on the API, set the response text to a simple status message.
The finished script should look something like this:
using System.IO;
using HuggingFace.API;
using TMPro;
using UnityEngine;
using UnityEngine.UI;

public class SpeechRecognitionTest : MonoBehaviour {
    [SerializeField] private Button startButton;
    [SerializeField] private Button stopButton;
    [SerializeField] private TextMeshProUGUI text;

    private AudioClip clip;
    private byte[] bytes;
    private bool recording;

    private void Start() {
        startButton.onClick.AddListener(StartRecording);
        stopButton.onClick.AddListener(StopRecording);
        stopButton.interactable = false;
    }

    private void Update() {
        if (recording && Microphone.GetPosition(null) >= clip.samples) {
            StopRecording();
        }
    }

    private void StartRecording() {
        text.color = Color.white;
        text.text = "Recording...";
        startButton.interactable = false;
        stopButton.interactable = true;
        clip = Microphone.Start(null, false, 10, 44100);
        recording = true;
    }

    private void StopRecording() {
        var position = Microphone.GetPosition(null);
        Microphone.End(null);
        var samples = new float[position * clip.channels];
        clip.GetData(samples, 0);
        bytes = EncodeAsWAV(samples, clip.frequency, clip.channels);
        recording = false;
        SendRecording();
    }

    private void SendRecording() {
        text.color = Color.yellow;
        text.text = "Sending...";
        stopButton.interactable = false;
        HuggingFaceAPI.AutomaticSpeechRecognition(bytes, response => {
            text.color = Color.white;
            text.text = response;
            startButton.interactable = true;
        }, error => {
            text.color = Color.red;
            text.text = error;
            startButton.interactable = true;
        });
    }

    private byte[] EncodeAsWAV(float[] samples, int frequency, int channels) {
        using (var memoryStream = new MemoryStream(44 + samples.Length * 2)) {
            using (var writer = new BinaryWriter(memoryStream)) {
                writer.Write("RIFF".ToCharArray());
                writer.Write(36 + samples.Length * 2);
                writer.Write("WAVE".ToCharArray());
                writer.Write("fmt ".ToCharArray());
                writer.Write(16);
                writer.Write((ushort)1);
                writer.Write((ushort)channels);
                writer.Write(frequency);
                writer.Write(frequency * channels * 2);
                writer.Write((ushort)(channels * 2));
                writer.Write((ushort)16);
                writer.Write("data".ToCharArray());
                writer.Write(samples.Length * 2);
                foreach (var sample in samples) {
                    writer.Write((short)(sample * short.MaxValue));
                }
            }
            return memoryStream.ToArray();
        }
    }
}
Congratulations, you can now use state-of-the-art speech recognition in Unity!
If you have any questions or would like to get more involved with Hugging Face for Games, join the Hugging Face Discord!