Soniox成立于2020年,目前总部位于美国加州福斯特城,该公司开发了市场上最好的语音识别引擎之一。该公司目前提供市面上领先的云转录引擎之一——这也是audioXpress成功用于采访和一般语音转文本转换的引擎。
专注于语音AI的Soniox在2021年推出了世界上第一个用于语音识别的无监督学习方法。这一创新对于克服之前阻碍语音系统性能的局限性至关重要。
2023年,Soniox开始从语音AI向通用AI转型。
我们是在2022年开始使用Soniox的。最初的时候Soniox的识别结果是这样的:

识别结果里除了 每个单词的时间,识别质量,还有 Speaker,这个是用来标记,多人对话的语音中,说话者的不同。
2023年开始,Soniox 推出了基于语义分析的Section结果(俺还为他们提供了大量的训练数据)。主要是 SectionId、Title。

Soniox的使用也很简单,在C#的nuget中直接搜索 Soniox,缺点就是 语义分析 的服务比较贵。
syntax = "proto3";
package soniox.speech_service;
option csharp_namespace = "Soniox.Proto.SpeechService";
import "google/protobuf/timestamp.proto";
service SpeechService {
  // Synchronous transcription
  rpc Transcribe(TranscribeRequest) returns (TranscribeResponse) {}
  rpc TranscribeStream(stream TranscribeStreamRequest) returns (stream TranscribeStreamResponse) {}
  rpc TranscribeMeeting(stream TranscribeMeetingRequest) returns (stream TranscribeMeetingResponse) {}
  // Asynchronous transcription
  rpc TranscribeAsync(stream TranscribeAsyncRequest) returns (TranscribeAsyncResponse) {}
  rpc GetTranscribeAsyncStatus(GetTranscribeAsyncStatusRequest) returns (GetTranscribeAsyncStatusResponse) {}
  rpc GetTranscribeAsyncResult(GetTranscribeAsyncResultRequest) returns (stream GetTranscribeAsyncResultResponse) {}
  rpc DeleteTranscribeAsyncFile(DeleteTranscribeAsyncFileRequest) returns (DeleteTranscribeAsyncFileResponse) {}
  // Speech context
  rpc CreateSpeechContext(CreateSpeechContextRequest) returns (CreateSpeechContextResponse) {}
  rpc DeleteSpeechContext(DeleteSpeechContextRequest) returns (DeleteSpeechContextResponse) {}
  rpc ListSpeechContextNames(ListSpeechContextNamesRequest) returns (ListSpeechContextNamesResponse) {}
  rpc GetSpeechContext(GetSpeechContextRequest) returns (GetSpeechContextResponse) {}
  rpc UpdateSpeechContext(UpdateSpeechContextRequest) returns (UpdateSpeechContextResponse) {}
  // Speaker AI
  rpc AddSpeaker(AddSpeakerRequest) returns (AddSpeakerResponse) {}
  rpc GetSpeaker(GetSpeakerRequest) returns (GetSpeakerResponse) {}
  rpc RemoveSpeaker(RemoveSpeakerRequest) returns (RemoveSpeakerResponse) {}
  rpc ListSpeakers(ListSpeakersRequest) returns (ListSpeakersResponse) {}
  rpc AddSpeakerAudio(AddSpeakerAudioRequest) returns (AddSpeakerAudioResponse) {}
  rpc GetSpeakerAudio(GetSpeakerAudioRequest) returns (GetSpeakerAudioResponse) {}
  rpc RemoveSpeakerAudio(RemoveSpeakerAudioRequest) returns (RemoveSpeakerAudioResponse) {}
}
// Transcribe
message TranscribeRequest {
  string api_key = 1;
  TranscriptionConfig config = 4;
  bytes audio = 3;
}
message TranscribeResponse {
  Result result = 1;
  repeated Result channel_results = 2;
}
// TranscribeStream
message TranscribeStreamRequest {
  string api_key = 1;
  TranscriptionConfig config = 4;
  bytes audio = 3;
}
message TranscribeStreamResponse {
  Result result = 1;
}
// TranscribeMeeting
message TranscribeMeetingRequest {
  string api_key = 1;
  TranscriptionConfig config = 10;
  int32 seq_num = 3;
  int32 stream_id = 4;
  bool start_of_segment = 5;
  bytes audio = 6;
  bool end_of_segment = 7;
}
message TranscribeMeetingResponse {
  int32 seq_num = 1;
  int32 stream_id = 2;
  bool start_of_segment = 3;
  bool end_of_segment = 4;
  Result result = 5;
  string error = 6;
}
// TranscribeAsync
message TranscribeAsyncRequest {
  string api_key = 1;
  string reference_name = 3;
  TranscriptionConfig config = 5;
  bytes audio = 4;
}
message TranscribeAsyncResponse {
  string file_id = 1;
}
// GetTranscribeAsyncStatus
message GetTranscribeAsyncStatusRequest {
  string api_key = 1;
  string file_id = 2;
}
message GetTranscribeAsyncStatusResponse {
  repeated TranscribeAsyncFileStatus files = 1;
}
message TranscribeAsyncFileStatus {
  string file_id = 1;
  string reference_name = 2;
  // One of: QUEUED, TRANSCRIBING, COMPLETED, FAILED
  string status = 3;
  // UTC timestamp
  google.protobuf.Timestamp created_time = 4;
  string error_message = 5;
  string transcribe_async_mode = 6;
}
// GetTranscribeAsyncResult
message GetTranscribeAsyncResultRequest {
  string api_key = 1;
  string file_id = 2;
}
message GetTranscribeAsyncResultResponse {
  bool separate_recognition_per_channel = 2;
  Result result = 1;
}
// DeleteTranscribeAsyncFile
message DeleteTranscribeAsyncFileRequest {
  string api_key = 1;
  string file_id = 2;
}
message DeleteTranscribeAsyncFileResponse {
}
// Common
message TranscriptionConfig {
  // Input options
  string audio_format = 1;
  int32 sample_rate_hertz = 2;
  int32 num_audio_channels = 3;
  // Output options
  bool include_nonfinal = 4;
  bool enable_separate_recognition_per_channel = 16;
  // Speech adaptation
  SpeechContext speech_context = 5;
  // Content moderation
  bool enable_profanity_filter = 6;
  repeated string content_moderation_phrases = 7;
  // Speaker diarization
  bool enable_streaming_speaker_diarization = 8;
  bool enable_global_speaker_diarization = 9;
  int32 min_num_speakers = 10;
  int32 max_num_speakers = 11;
  // Speaker identification
  bool enable_speaker_identification = 12;
  repeated string cand_speaker_names = 13;
  // Model options
  string model = 14;
  bool enable_dictation = 15;
  // Asynchronous transcription
  string transcribe_async_mode = 17;
}
message Result {
  repeated Word words = 1;
  int32 final_proc_time_ms = 2;
  int32 total_proc_time_ms = 3;
  repeated ResultSpeaker speakers = 6;
  int32 channel = 7;
}
message Word {
  string text = 1;
  int32 start_ms = 2;
  int32 duration_ms = 3;
  bool is_final = 4;
  int32 speaker = 5;
  string orig_text = 8;
  double confidence = 9;
}
message ResultSpeaker {
  int32 speaker = 1;
  string name = 2;
}
// SpeechContext
message SpeechContext {
  repeated SpeechContextEntry entries = 1;
  string name = 2;
}
message SpeechContextEntry {
  repeated string phrases = 1;
  double boost = 2;
}
message CreateSpeechContextRequest {
  string api_key = 1;
  SpeechContext speech_context = 2;
}
message CreateSpeechContextResponse {
}
message DeleteSpeechContextRequest {
  string api_key = 1;
  string name = 2;
}
message DeleteSpeechContextResponse {
}
message ListSpeechContextNamesRequest {
  string api_key = 1;
}
message ListSpeechContextNamesResponse {
  repeated string names = 1;
}
message GetSpeechContextRequest {
  string api_key = 1;
  string name = 2;
}
message GetSpeechContextResponse {
  SpeechContext speech_context = 1;
}
message UpdateSpeechContextRequest {
  string api_key = 1;
  SpeechContext speech_context = 2;
}
message UpdateSpeechContextResponse {
}
// Speaker AI
// AddSpeaker
message AddSpeakerRequest {
  string api_key = 1;
  string name = 2;
}
message AddSpeakerResponse {
  string name = 1;
  google.protobuf.Timestamp created = 2;
}
// GetSpeaker
message GetSpeakerRequest {
  string api_key = 1;
  string name = 2;
}
message GetSpeakerResponse {
  string name = 1;
  google.protobuf.Timestamp created = 2;
  repeated GetSpeakerResponseAudio audios = 3;
}
message GetSpeakerResponseAudio {
  string audio_name = 1;
  google.protobuf.Timestamp created = 2;
  int32 duration_ms = 3;
}
// RemoveSpeaker
message RemoveSpeakerRequest {
  string api_key = 1;
  string name = 2;
}
message RemoveSpeakerResponse {}
// ListSpeakers
message ListSpeakersRequest {
  string api_key = 1;
}
message ListSpeakersResponse {
  repeated ListSpeakersResponseSpeaker speakers = 1;
}
message ListSpeakersResponseSpeaker {
  string name = 1;
  google.protobuf.Timestamp created = 2;
  int32 num_audios = 3;
}
// AddSpeakerAudio
message AddSpeakerAudioRequest {
  string api_key = 1;
  string speaker_name = 2;
  string audio_name = 3;
  bytes audio = 4;
}
message AddSpeakerAudioResponse {
  string speaker_name = 1;
  string audio_name = 2;
  google.protobuf.Timestamp created = 3;
  int32 duration_ms = 4;
}
// GetSpeakerAudio
message GetSpeakerAudioRequest {
  string api_key = 1;
  string speaker_name = 2;
  string audio_name = 3;
}
message GetSpeakerAudioResponse {
  string speaker_name = 1;
  string audio_name = 2;
  google.protobuf.Timestamp created = 3;
  int32 duration_ms = 4;
  bytes audio = 5;
}
// RemoveSpeakerAudio
message RemoveSpeakerAudioRequest {
  string api_key = 1;
  string speaker_name = 2;
  string audio_name = 3;
}
message RemoveSpeakerAudioResponse {}



















