共计 5410 个字符,预计需要花费 14 分钟才能阅读完成。
目前有些需求确实需要用到 STT(语音转文字)技术。各大云厂商都有相关的解决方案,例如百度云、阿里云、Microsoft Azure 等,缺点是依赖网络并且需要付费。
本文介绍如何在 .NET 环境下实现语音转文字的离线识别,重点介绍 Vosk 和 Whisper 这两款具有代表性的 STT 引擎。
Vosk
Vosk 是一款基于 Kaldi 的开源语音识别工具,支持多种语言,可运行于 Windows、Linux、macOS、Android 等平台,资源占用相对较小。
官网:https://alphacephei.com/vosk
github:https://github.com/alphacep/vosk-api
模型下载:https://alphacephei.com/vosk/models
Vosk 的模型需要解压后才能使用,比如下载的是“vosk-model-cn-0.22.zip”,那么需要先将它解压,再把解压得到的文件夹放到项目中。
解压后项目中会有一个名为 vosk-model-cn-0.22 的文件夹,该目录下包含 am、conf、graph 等子文件夹;之后读取模型时,模型目录就是当前项目下的 vosk-model-cn-0.22 文件夹。
using NAudio.Wave;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;
using Vosk;
namespace SpeechToText
{
    /// <summary>
    /// Offline speech-to-text helper built on the Vosk recognizer.
    /// Load a model once with <see cref="InitializationData"/>, then call
    /// <see cref="Discern"/> for each audio file.
    /// </summary>
    public class VoskHelper : IDiscern
    {
        /// <summary>Delegate type shared by the initialization events.</summary>
        public delegate void InitializationHandler(object sender, EventArgs e);

        /// <summary>Raised right before the model starts loading.</summary>
        public event InitializationHandler InitializationStarting;

        /// <summary>Raised after the model has been loaded.</summary>
        public event InitializationHandler InitializationCompleted;

        /// <summary>
        /// Sample rate passed to the recognizer. <see cref="Discern"/> overwrites it
        /// with the sample rate read from the audio file on every call.
        /// </summary>
        public float sample_rate { get; set; } = 66000.0f;

        private Model model;

        /// <summary>
        /// Loads the Vosk model on a background task, raising
        /// <see cref="InitializationStarting"/> before and
        /// <see cref="InitializationCompleted"/> after the load.
        /// </summary>
        /// <param name="path">Model directory, e.g. <c>vosk-model-cn-0.22</c>.</param>
        /// <returns>A task that completes once the model is loaded.</returns>
        /// <exception cref="ArgumentException"><paramref name="path"/> is null or blank.</exception>
        /// <exception cref="DirectoryNotFoundException">The model directory does not exist.</exception>
        public async Task InitializationData(string path)
        {
            // Fail early with a managed exception instead of handing a bad path to the native loader.
            if (string.IsNullOrWhiteSpace(path))
            {
                throw new ArgumentException("Model path must not be empty.", nameof(path));
            }

            if (!Directory.Exists(path))
            {
                throw new DirectoryNotFoundException("Vosk model directory not found: " + path);
            }

            Vosk.Vosk.GpuInit();
            Vosk.Vosk.SetLogLevel(0);

            await Task.Run(() =>
            {
                OnInitializationStarting();
                model = new Model(path);
                OnInitializationCompleted();
            });
        }

        /// <summary>
        /// Recognizes the speech contained in an audio file.
        /// </summary>
        /// <param name="path">Path of the audio file.</param>
        /// <returns>
        /// The recognized text with pause-based punctuation. If anything fails, the
        /// exception message is returned instead of the text (no exception escapes).
        /// </returns>
        public async Task<string> Discern(string path)
        {
            try
            {
                return await Task.Run(() =>
                {
                    if (model == null)
                    {
                        throw new InvalidOperationException(
                            "Vosk model is not loaded; call InitializationData first.");
                    }

                    // Only the sample rate is taken from the decoded reader.
                    using (var reader = new AudioFileReader(path))
                    {
                        sample_rate = reader.WaveFormat.SampleRate;
                    }

                    var transcript = new StringBuilder();

                    // The recognizer wraps a native handle, so it must be disposed.
                    using (var rec = new VoskRecognizer(model, sample_rate))
                    // NOTE(review): the file is streamed byte-for-byte (header included);
                    // presumably the input is expected to be 16-bit mono PCM WAV - confirm.
                    using (Stream source = File.OpenRead(path))
                    {
                        rec.SetMaxAlternatives(0);
                        rec.SetWords(true);

                        byte[] buffer = new byte[4096];
                        int bytesRead;
                        while ((bytesRead = source.Read(buffer, 0, buffer.Length)) > 0)
                        {
                            // A true return means an utterance just ended. Its text has to be
                            // fetched now; otherwise only the last utterance would be left
                            // for FinalResult().
                            if (rec.AcceptWaveform(buffer, bytesRead))
                            {
                                transcript.Append(ParseVoskJson(rec.Result(), 0.05f));
                            }
                        }

                        transcript.Append(ParseVoskJson(rec.FinalResult(), 0.05f));
                    }

                    return transcript.ToString();
                });
            }
            catch (Exception ex)
            {
                // Existing contract: failures are reported through the returned string.
                return ex.Message;
            }
        }

        /// <summary>
        /// Turns one Vosk result JSON document into text, inserting punctuation based
        /// on the silence between consecutive words.
        /// </summary>
        /// <param name="voskJson">JSON returned by the recognizer.</param>
        /// <param name="commaThreshold">Gap in seconds above which a comma is inserted.</param>
        /// <param name="periodThreshold">Gap in seconds above which a period is inserted.</param>
        /// <returns>The punctuated text, or an empty string when the JSON carries no words.</returns>
        private string ParseVoskJson(string voskJson, float commaThreshold = 1.0f, float periodThreshold = 2.0f)
        {
            if (string.IsNullOrWhiteSpace(voskJson))
            {
                return "";
            }

            // Deserialize can return null (JSON literal "null"); "result" is absent when nothing was heard.
            VoskResult voskResult = JsonSerializer.Deserialize<VoskResult>(voskJson);
            List<WordInfo> words = voskResult?.result;
            if (words == null || words.Count == 0)
            {
                return "";
            }

            var text = new StringBuilder();
            for (int i = 0; i < words.Count; i++)
            {
                text.Append(words[i].word);
                if (i == words.Count - 1)
                {
                    break;
                }

                float interval = words[i + 1].start - words[i].end;

                // NOTE(review): long pauses emit an ASCII "." while the closing mark
                // below is "。" - confirm this mix is intended.
                if (interval > periodThreshold)
                {
                    text.Append(".");
                }
                else if (interval > commaThreshold)
                {
                    text.Append(",");
                }
            }

            text.Append("。");
            return text.ToString();
        }

        /// <summary>Raises <see cref="InitializationStarting"/>.</summary>
        protected virtual void OnInitializationStarting()
        {
            InitializationStarting?.Invoke(this, EventArgs.Empty);
        }

        /// <summary>Raises <see cref="InitializationCompleted"/>.</summary>
        protected virtual void OnInitializationCompleted()
        {
            InitializationCompleted?.Invoke(this, EventArgs.Empty);
        }

        /// <summary>Shape of the recognizer JSON that this helper reads.</summary>
        public class VoskResult
        {
            /// <summary>Per-word entries; null when the JSON has no "result" member.</summary>
            public List<WordInfo> result { get; set; }
        }

        /// <summary>One entry of the "result" array.</summary>
        public class WordInfo
        {
            public float conf { get; set; }
            public float start { get; set; }
            public float end { get; set; }
            public string word { get; set; }
        }
    }
}
Whisper
Whisper 是 OpenAI 开发的一款开源语音识别模型,虽然 Whisper 在资源需求上比 Vosk 更高,但它在准确度上表现还是不错,适合对识别准确率要求较高的应用场景。
github:https://github.com/sandrohanea/whisper.net
模型下载:https://huggingface.co/ggerganov/whisper.cpp/tree/main
using Microsoft.VisualBasic;
using NAudio.Gui;
using NAudio.Wave;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using Whisper.net;
using Whisper.net.LibraryLoader;
namespace SpeechToText
{
    /// <summary>
    /// Offline speech-to-text helper built on Whisper.net.
    /// Set the model file with <see cref="InitializationData"/>, then call
    /// <see cref="Discern"/> for each audio file.
    /// </summary>
    public class WhisperHelper
    {
        /// <summary>Raised right before the model path is stored.</summary>
        public event VoskHelper.InitializationHandler InitializationStarting;

        /// <summary>Raised after the model path has been stored.</summary>
        public event VoskHelper.InitializationHandler InitializationCompleted;

        private string modelPath = "";

        /// <summary>
        /// Transcribes an audio file: resamples it to 16 kHz, runs it through
        /// Whisper, and converts the resulting text to Simplified Chinese.
        /// </summary>
        /// <param name="path">Path of the audio file.</param>
        /// <returns>The concatenated text of all recognized segments.</returns>
        /// <exception cref="InvalidOperationException">
        /// <see cref="InitializationData"/> has not supplied a model path yet.
        /// </exception>
        public async Task<string> Discern(string path)
        {
            if (string.IsNullOrEmpty(modelPath))
            {
                throw new InvalidOperationException(
                    "Whisper model path is not set; call InitializationData first.");
            }

            // The resampled audio lives in a stream owned by this method, so dispose it here.
            using (Stream file = ConvertWavTo16kHz(path))
            using (var factory = WhisperFactory.FromPath(modelPath, useGpu: true))
            {
                var builder = factory.CreateBuilder()
                    .WithLanguage("auto")
                    .WithPrompt("以下是普通话的句子");

                var resultBuilder = new StringBuilder();
                using (var processor = builder.Build())
                {
                    await foreach (var segment in processor.ProcessAsync(file, CancellationToken.None))
                    {
                        resultBuilder.Append(segment.Text);
                    }
                }

                return ConvertToSimplified(resultBuilder.ToString());
            }
        }

        /// <summary>
        /// Converts Traditional Chinese characters to Simplified Chinese using
        /// <c>Microsoft.VisualBasic.Strings.StrConv</c> with locale ID 0.
        /// </summary>
        static string ConvertToSimplified(string text)
        {
            return Strings.StrConv(text, VbStrConv.SimplifiedChinese, 0);
        }

        /// <summary>
        /// Stores the model file path used by later <see cref="Discern"/> calls and
        /// raises the initialization events around the assignment.
        /// </summary>
        /// <param name="path">Path of the Whisper model file.</param>
        /// <returns>An already-completed task.</returns>
        public Task InitializationData(string path)
        {
            OnInitializationStarting();
            modelPath = path ?? "";
            OnInitializationCompleted();
            return Task.CompletedTask;
        }

        /// <summary>Raises <see cref="InitializationStarting"/>.</summary>
        protected virtual void OnInitializationStarting()
        {
            InitializationStarting?.Invoke(this, EventArgs.Empty);
        }

        /// <summary>Raises <see cref="InitializationCompleted"/>.</summary>
        protected virtual void OnInitializationCompleted()
        {
            InitializationCompleted?.Invoke(this, EventArgs.Empty);
        }

        /// <summary>
        /// Resamples an audio file to 16 kHz (keeping its channel count) and returns
        /// the result as an in-memory WAV stream positioned at the start.
        /// The file is always resampled, whatever its original sample rate.
        /// </summary>
        /// <param name="inputFilePath">Path of the source audio file.</param>
        /// <returns>A stream holding the resampled WAV data; the caller disposes it.</returns>
        public Stream ConvertWavTo16kHz(string inputFilePath)
        {
            const int targetSampleRate = 16000;

            var memoryStream = new MemoryStream();
            try
            {
                using (var reader = new AudioFileReader(inputFilePath))
                {
                    var outFormat = new WaveFormat(targetSampleRate, reader.WaveFormat.Channels);
                    using (var resampler = new MediaFoundationResampler(reader, outFormat))
                    {
                        resampler.ResamplerQuality = 60; // resampling quality setting
                        WaveFileWriter.WriteWavFileToStream(memoryStream, resampler);
                    }
                }

                // Rewind so the consumer reads from the WAV header.
                memoryStream.Position = 0;
                return memoryStream;
            }
            catch
            {
                // Do not leak the buffer when decoding or resampling fails.
                memoryStream.Dispose();
                throw;
            }
        }
    }
}
以上代码仅供参考,请根据实际需要自行优化,其中用不到的部分可以去掉。