C#/.Net 语音转文字离线识别

535次阅读
没有评论

共计 5410 个字符,预计需要花费 14 分钟才能阅读完成。

目前有些需求确实需要用到 STT(语音转文字)技术。各大云厂商都有相关的解决方案,例如百度云、阿里云、Microsoft Azure 等,当然缺点就是依赖网络并且需要付费。

本文介绍如何在 .NET 环境下实现语音转文字的离线识别,重点介绍 Vosk 和 Whisper 这两款具有代表性的 STT 引擎。

Vosk

Vosk 是一款基于 Kaldi 的开源语音识别工具,支持多种语言,可运行于 Windows、Linux、macOS、Android 等平台,资源占用相对较小。

官网:https://alphacephei.com/vosk

github:https://github.com/alphacep/vosk-api

模型下载:https://alphacephei.com/vosk/models

Vosk 的模型需要先解压再使用,比如下载的是“vosk-model-cn-0.22.zip”,那么需要将其解压,并把解压得到的文件夹放到项目中。

也就是说,项目中会有一个名为 vosk-model-cn-0.22 的文件夹,其目录下包含 am、conf、graph 等子文件夹;之后代码读取的模型目录就是当前项目下的这个 vosk-model-cn-0.22 文件夹。

using NAudio.Wave;

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;

using Vosk;

namespace SpeechToText
{
    /// <summary>
    /// Offline speech-to-text helper built on the Vosk recognizer.
    /// Load a model with <see cref="InitializationData"/>, then transcribe audio files
    /// with <see cref="Discern"/>. Call <see cref="Dispose()"/> to release the loaded model.
    /// </summary>
    public class VoskHelper : IDiscern, IDisposable
    {
        /// <summary>Delegate type shared by the initialization events.</summary>
        public delegate void InitializationHandler(object sender, EventArgs e);

        /// <summary>Raised just before the model starts loading.</summary>
        public event InitializationHandler InitializationStarting;

        /// <summary>Raised after the model has been loaded successfully.</summary>
        public event InitializationHandler InitializationCompleted;

        /// <summary>
        /// Sample rate handed to the recognizer. <see cref="Discern"/> overwrites it with the
        /// sample rate reported for the file being transcribed, so the initial value is only a
        /// placeholder.
        /// </summary>
        public float sample_rate { get; set; } = 66000.0f;

        private Model model;

        /// <summary>
        /// Loads the Vosk model from <paramref name="path"/> on a background task.
        /// </summary>
        /// <param name="path">Model directory, e.g. <c>vosk-model-cn-0.22</c>.</param>
        /// <returns>A task that completes once the model has been loaded.</returns>
        /// <remarks>
        /// Both events are raised from inside the <c>Task.Run</c> delegate, not on the caller's
        /// thread. <see cref="InitializationCompleted"/> is not raised if loading throws.
        /// </remarks>
        public async Task InitializationData(string path)
        {
            Vosk.Vosk.GpuInit();
            Vosk.Vosk.SetLogLevel(0);

            await Task.Run(() =>
            {
                OnInitializationStarting();

                // Load the replacement first so a failed load keeps the old model usable,
                // then release the model that was loaded before (previously leaked).
                Model previous = model;
                model = new Model(path);
                previous?.Dispose();

                OnInitializationCompleted();
            });
        }

        /// <summary>
        /// Transcribes the audio file at <paramref name="path"/>.
        /// </summary>
        /// <param name="path">Path of the audio file.</param>
        /// <returns>
        /// The recognized text with gap-based punctuation. If anything fails, the exception
        /// message is returned instead of text; this method does not throw.
        /// </returns>
        public async Task<string> Discern(string path)
        {
            try
            {
                return await Task.Run(() =>
                {
                    // Snapshot the field so one call uses a single model instance throughout.
                    Model current = model;
                    if (current == null)
                    {
                        throw new InvalidOperationException(
                            "Model is not initialized; call InitializationData first.");
                    }

                    // AudioFileReader is only used to discover the sample rate.
                    float rate;
                    using (var reader = new AudioFileReader(path))
                    {
                        rate = reader.WaveFormat.SampleRate;
                    }

                    // Keep the public property in sync as before, but drive the recognizer
                    // from the local so overlapping calls cannot swap each other's rate.
                    sample_rate = rate;

                    // The recognizer is disposable and was previously never released.
                    using (var rec = new VoskRecognizer(current, rate))
                    {
                        rec.SetMaxAlternatives(0);
                        rec.SetWords(true);

                        // NOTE(review): the file is streamed byte-for-byte, container header
                        // included - presumably the input must already be PCM audio that the
                        // recognizer accepts as-is; confirm.
                        using (Stream source = File.OpenRead(path))
                        {
                            byte[] buffer = new byte[4096];
                            int bytesRead;
                            while ((bytesRead = source.Read(buffer, 0, buffer.Length)) > 0)
                            {
                                // Intermediate/partial results are not used; only the final
                                // result is read once the whole file has been fed.
                                rec.AcceptWaveform(buffer, bytesRead);
                            }
                        }

                        return ParseVoskJson(rec.FinalResult(), 0.05f);
                    }
                });
            }
            catch (Exception ex)
            {
                // Callers receive the failure reason as the result string.
                return ex.Message;
            }
        }

        /// <summary>
        /// Converts a Vosk JSON result into text, inserting punctuation where the silence
        /// between two consecutive words exceeds the given thresholds.
        /// </summary>
        /// <param name="voskJson">JSON text returned by the recognizer.</param>
        /// <param name="commaThreshold">Gap in seconds above which a comma is inserted.</param>
        /// <param name="periodThreshold">Gap in seconds above which a full stop is inserted.</param>
        /// <returns>
        /// The punctuated text, or an empty string when the JSON carries no words.
        /// </returns>
        private string ParseVoskJson(string voskJson, float commaThreshold = 1.0f, float periodThreshold = 2.0f)
        {
            if (string.IsNullOrWhiteSpace(voskJson))
            {
                return "";
            }

            VoskResult voskResult = JsonSerializer.Deserialize<VoskResult>(voskJson);
            List<WordInfo> words = voskResult?.result;

            // No words recognized: return nothing instead of a lone terminator.
            if (words == null || words.Count == 0)
            {
                return "";
            }

            var text = new StringBuilder();
            for (int i = 0; i < words.Count; i++)
            {
                text.Append(words[i].word);

                if (i == words.Count - 1)
                {
                    break;
                }

                float gap = words[i + 1].start - words[i].end;

                // NOTE(review): in-sentence marks are ASCII while the terminator below is a
                // full-width stop; literals are kept as they were - confirm whether intended.
                if (gap > periodThreshold)
                {
                    text.Append(".");
                }
                else if (gap > commaThreshold)
                {
                    text.Append(",");
                }
            }

            text.Append("。");
            return text.ToString();
        }

        /// <summary>Raises <see cref="InitializationStarting"/>.</summary>
        protected virtual void OnInitializationStarting()
        {
            InitializationStarting?.Invoke(this, EventArgs.Empty);
        }

        /// <summary>Raises <see cref="InitializationCompleted"/>.</summary>
        protected virtual void OnInitializationCompleted()
        {
            InitializationCompleted?.Invoke(this, EventArgs.Empty);
        }

        /// <summary>Releases the loaded model, if any.</summary>
        public void Dispose()
        {
            Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>Releases the loaded model when called from <see cref="Dispose()"/>.</summary>
        /// <param name="disposing">True when invoked through <see cref="Dispose()"/>.</param>
        protected virtual void Dispose(bool disposing)
        {
            if (disposing)
            {
                model?.Dispose();
                model = null;
            }
        }

        /// <summary>Shape of the recognizer JSON that this helper reads.</summary>
        public class VoskResult
        {
            /// <summary>Per-word entries of the recognition result.</summary>
            public List<WordInfo> result { get; set; }
        }

        /// <summary>One word entry from the recognizer JSON.</summary>
        public class WordInfo
        {
            public float conf { get; set; }

            /// <summary>Start of the word; compared against thresholds given in seconds.</summary>
            public float start { get; set; }

            /// <summary>End of the word; compared against thresholds given in seconds.</summary>
            public float end { get; set; }

            public string word { get; set; }
        }
    }
}

Whisper

Whisper 是 OpenAI 开发的一款开源语音识别模型,虽然 Whisper 在资源需求上比 Vosk 更高,但它在准确度上表现还是不错,适合对识别准确率要求较高的应用场景。

github:https://github.com/sandrohanea/whisper.net

模型下载:https://huggingface.co/ggerganov/whisper.cpp/tree/main

using Microsoft.VisualBasic;
using NAudio.Gui;
using NAudio.Wave;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using Whisper.net;
using Whisper.net.LibraryLoader;

namespace SpeechToText
{
    /// <summary>
    /// Offline speech-to-text helper built on Whisper.net.
    /// Set the model file with <see cref="InitializationData"/>, then transcribe audio files
    /// with <see cref="Discern"/>.
    /// </summary>
    public class WhisperHelper
    {
        // LCID for zh-CN. Passed explicitly so the Traditional -> Simplified conversion does
        // not depend on the culture of the machine running the code.
        private const int SimplifiedChineseLocaleId = 0x0804;

        // Sample rate every input is resampled to before being handed to Whisper.
        private const int TargetSampleRate = 16000;

        // Resampler quality setting; 60 is the value the original code used.
        private const int ResamplerQualityLevel = 60;

        /// <summary>Raised when <see cref="InitializationData"/> starts.</summary>
        public event VoskHelper.InitializationHandler InitializationStarting;

        /// <summary>Raised when <see cref="InitializationData"/> has stored the model path.</summary>
        public event VoskHelper.InitializationHandler InitializationCompleted;

        private string modelPath = "";

        /// <summary>
        /// Transcribes the audio file at <paramref name="path"/> and returns the text
        /// converted to Simplified Chinese.
        /// </summary>
        /// <param name="path">Path of the audio file.</param>
        /// <returns>The concatenated text of all recognized segments.</returns>
        /// <exception cref="InvalidOperationException">
        /// <see cref="InitializationData"/> has not been called with a model path.
        /// </exception>
        public async Task<string> Discern(string path)
        {
            if (string.IsNullOrEmpty(modelPath))
            {
                throw new InvalidOperationException(
                    "Model path is not set; call InitializationData first.");
            }

            // The resampled audio lives in a stream that was previously never disposed.
            using (Stream audio = ConvertWavTo16kHz(path))
            using (var factory = WhisperFactory.FromPath(modelPath, useGpu: true))
            {
                // Language is auto-detected; the prompt is passed to the processor builder
                // as an initial prompt string.
                var builder = factory.CreateBuilder()
                    .WithLanguage("auto")
                    .WithPrompt("以下是普通话的句子");

                var transcript = new StringBuilder();
                using (var processor = builder.Build())
                {
                    await foreach (var segment in processor.ProcessAsync(audio, CancellationToken.None))
                    {
                        transcript.Append(segment.Text);
                    }
                }

                return ConvertToSimplified(transcript.ToString());
            }
        }

        /// <summary>
        /// Converts Traditional Chinese characters in <paramref name="text"/> to Simplified
        /// Chinese using <c>Microsoft.VisualBasic.Strings.StrConv</c>.
        /// </summary>
        /// <param name="text">Text to convert.</param>
        /// <returns>The converted text.</returns>
        private static string ConvertToSimplified(string text)
        {
            // An explicit zh-CN locale replaces LocaleID 0 (the current system locale), which
            // made the conversion depend on how the host machine is configured.
            return Strings.StrConv(text, VbStrConv.SimplifiedChinese, SimplifiedChineseLocaleId);
        }

        /// <summary>
        /// Stores the path of the Whisper model file used by <see cref="Discern"/>.
        /// </summary>
        /// <param name="path">Path of the model file.</param>
        /// <returns>An already-completed task; nothing is loaded here.</returns>
        public Task InitializationData(string path)
        {
            // Raise the same start/complete notifications VoskHelper raises, so subscribers
            // to either helper see the same event sequence.
            OnInitializationStarting();
            modelPath = path ?? "";
            OnInitializationCompleted();
            return Task.CompletedTask;
        }

        /// <summary>Raises <see cref="InitializationStarting"/>.</summary>
        protected virtual void OnInitializationStarting()
        {
            InitializationStarting?.Invoke(this, EventArgs.Empty);
        }

        /// <summary>Raises <see cref="InitializationCompleted"/>.</summary>
        protected virtual void OnInitializationCompleted()
        {
            InitializationCompleted?.Invoke(this, EventArgs.Empty);
        }

        /// <summary>
        /// Resamples the audio file to 16 kHz and returns it as an in-memory WAV stream
        /// positioned at the start.
        /// </summary>
        /// <param name="inputFilePath">Path of the source audio file.</param>
        /// <returns>A stream containing the resampled WAV data; the caller owns it.</returns>
        /// <remarks>
        /// The file is always resampled, whatever its original sample rate. The channel count
        /// of the source is preserved.
        /// </remarks>
        public Stream ConvertWavTo16kHz(string inputFilePath)
        {
            var memoryStream = new MemoryStream();
            try
            {
                using (var reader = new AudioFileReader(inputFilePath))
                {
                    var outFormat = new WaveFormat(TargetSampleRate, reader.WaveFormat.Channels);

                    using (var resampler = new MediaFoundationResampler(reader, outFormat))
                    {
                        resampler.ResamplerQuality = ResamplerQualityLevel;

                        // Write the converted audio, WAV header included, into the stream.
                        WaveFileWriter.WriteWavFileToStream(memoryStream, resampler);
                    }
                }

                // Rewind so the consumer reads from the beginning.
                memoryStream.Position = 0;
                return memoryStream;
            }
            catch
            {
                // Do not leak the buffer when decoding or resampling fails.
                memoryStream.Dispose();
                throw;
            }
        }
    }
}

代码的话自己优化吧,很多东西不是很需要的可以去掉。

正文完
 1
XSTPLAN
版权声明:本站原创文章,由 XSTPLAN 于2024-08-12发表,共计5410字。
转载说明:本站所有资源和文章版权归作者所有,未经授权禁止转载。如有转载或引用,请注明来源。
评论(没有评论)