You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
369 lines
12 KiB
Go
369 lines
12 KiB
Go
3 years ago
|
package stt
|
||
|
|
||
|
import (
|
||
|
"context"
|
||
|
"fmt"
|
||
|
"os"
|
||
|
"path/filepath"
|
||
|
"runtime/debug"
|
||
|
"strings"
|
||
|
"time"
|
||
|
|
||
|
speech "cloud.google.com/go/speech/apiv1"
|
||
|
"gitlab.com/cinnamon/voiceagent/icsconf"
|
||
|
"gitlab.com/cinnamon/voiceagent/icserror"
|
||
|
"gitlab.com/cinnamon/voiceagent/icslog"
|
||
|
"gitlab.com/cinnamon/voiceagent/recorddata"
|
||
|
"gitlab.com/cinnamon/voiceagent/stt/rms"
|
||
|
speechpb "google.golang.org/genproto/googleapis/cloud/speech/v1"
|
||
|
)
|
||
|
|
||
|
// AsyncCBFunc is the signature of the callback that can be registered on an
// STT instance (see NewSTT and SetAsyncFunc). It receives one string message.
type AsyncCBFunc func(message string)
|
||
|
|
||
|
type STT struct {
|
||
|
voiceBuf []byte
|
||
|
voiceBufCur int64
|
||
|
|
||
|
silencenum int64
|
||
|
validnum int64 //rms counter
|
||
|
//isBargein bool
|
||
|
|
||
|
cbFunc *AsyncCBFunc
|
||
|
}
|
||
|
|
||
|
type STTResult struct {
|
||
|
result string
|
||
|
err *icserror.IcsError
|
||
|
}
|
||
|
|
||
|
// Audio framing and utterance-detection tuning values.
// The byte figures assume 8 kHz, 16-bit PCM delivered in 20 msec packets.
const (
	// Longest recording kept for a single sentence, in seconds.
	MAX_RECORDING_SECOND = 60

	// Number of audio bytes in one minute.
	ONE_MINUTE_BUFFER_LENGTH = 960000

	// One second expressed in milliseconds.
	ONE_SEC = 1000

	// Size of one voice packet in bytes.
	ONE_PACKET = 320

	// Number of packets received per second.
	PACKETS_PER_1SEC = 50

	// Minimum speech length of a sentence, in msec (0.1 sec = 1600 bytes).
	MIN_SENTENCE_SPEECH_LEN = 0.1 * ONE_SEC

	// Maximum speech length of a sentence, in msec.
	MAX_SENTENCE_SPEECH_LEN = MAX_RECORDING_SECOND * ONE_SEC

	// Maximum speech length of a sentence, in bytes.
	MAX_SENTENCE_SPEECH_LEN_INBYTE = MAX_RECORDING_SECOND * ONE_PACKET * PACKETS_PER_1SEC

	// Maximum silence length, in msec.
	MAX_SILENCE_LEN = 0.2 * ONE_SEC

	// RMS level a packet must exceed to count as speech rather than silence.
	MIN_RMS_LEVEL = 200.0

	// Packet-count threshold (one packet is 20 msec) for continuous RMS checks.
	MAX_CONT_VALID_RMS = 15

	// Alternative, shorter packet-count threshold (one packet is 20 msec).
	MAX_CONT_VALID_RMS2 = 5
)
|
||
|
|
||
|
func NewSTT(cbFunc AsyncCBFunc) *STT {
|
||
|
//func NewSTT() *STT {
|
||
|
stt := STT{voiceBufCur: 0, silencenum: 0, validnum: 0}
|
||
|
stt.voiceBuf = make([]byte, ONE_MINUTE_BUFFER_LENGTH*2)
|
||
|
//stt.isBargein = false
|
||
|
|
||
|
if cbFunc != nil {
|
||
|
stt.cbFunc = &cbFunc
|
||
|
}
|
||
|
|
||
|
return &stt
|
||
|
}
|
||
|
|
||
|
func (s *STT) Close() {
|
||
|
//l := icslog.GetIcsLog()
|
||
|
//l.Printf(icslog.LOG_LEVEL_DEBUG2, -1, "STT Closed! - %d", s.voiceBufCur)
|
||
|
|
||
|
s.voiceBufCur = 0
|
||
|
s.silencenum = 0
|
||
|
s.validnum = 0
|
||
|
//s.voiceBuf = nil
|
||
|
s.voiceBuf = make([]byte, ONE_MINUTE_BUFFER_LENGTH*2)
|
||
|
//copy(s.voiceBuf[1:], []byte("aaaaaa"))
|
||
|
|
||
|
}
|
||
|
|
||
|
func (s STT) GetVoiceBufCur() int64 {
|
||
|
return s.voiceBufCur
|
||
|
}
|
||
|
|
||
|
func (s *STT) SetAsyncFunc(cbFunc *AsyncCBFunc) {
|
||
|
s.cbFunc = cbFunc
|
||
|
}
|
||
|
|
||
|
func (s *STT) STT(voicedata *recorddata.VoiceDataHDR) (string, float64, *icserror.IcsError) {
|
||
|
var result string
|
||
|
var err *icserror.IcsError
|
||
|
l := icslog.GetIcsLog()
|
||
|
conf := icsconf.GetIcsConfig()
|
||
|
rmsval := rms.RMS(voicedata.Payload)
|
||
|
|
||
|
if rmsval > MIN_RMS_LEVEL {
|
||
|
s.validnum++
|
||
|
s.silencenum = 0
|
||
|
} else {
|
||
|
s.silencenum++
|
||
|
}
|
||
|
|
||
|
//check valid utterance period
|
||
|
if s.silencenum < MAX_CONT_VALID_RMS && s.voiceBufCur < MAX_SENTENCE_SPEECH_LEN_INBYTE-ONE_PACKET {
|
||
|
//fmt.Printf("Voice Buf Size %d-%d\n", s.voiceBufCur, len(s.voiceBuf))
|
||
|
//l.Printf(icslog.LOG_LEVEL_DEBUG2, -1, "%+v", s.voiceBuf)
|
||
|
//fmt.Println(s.voiceBuf[s.voiceBufCur : s.voiceBufCur+int64(len(voicedata.Payload))])
|
||
|
copy(s.voiceBuf[s.voiceBufCur:s.voiceBufCur+int64(len(voicedata.Payload))], voicedata.Payload)
|
||
|
s.voiceBufCur += int64(len(voicedata.Payload))
|
||
|
//fmt.Printf("### RMS: %f, sil: %d, cur: %d val: %d\n", rmsval, s.silencenum, s.voiceBufCur, s.validnum)
|
||
|
/*} else if s.isBargein == false && s.validnum >= MAX_CONT_VALID_RMS2 {
|
||
|
fmt.Printf(">>>99 RMS: %f, sil: %d, cur: %d val: %d\n", rmsval, s.silencenum, s.voiceBufCur, s.validnum)
|
||
|
s.isBargein = true
|
||
|
return "", rmsval, icserror.ICSERRSTTBargeIn */
|
||
|
} else if s.validnum >= MAX_CONT_VALID_RMS && s.silencenum >= int64(MAX_CONT_VALID_RMS) && s.voiceBufCur > int64(float64(5)*float64(ONE_PACKET)*(float64(PACKETS_PER_1SEC/MIN_SENTENCE_SPEECH_LEN))) {
|
||
|
//} else if s.silencenum >= int64(MAX_CONT_VALID_RMS) && s.voiceBufCur > int64(float64(5)*float64(ONE_PACKET)*(float64(PACKETS_PER_1SEC/MIN_SENTENCE_SPEECH_LEN))) {
|
||
|
//} else if s.silencenum >= int64(MAX_CONT_VALID_RMS) && s.voiceBufCur > int64(float64(10)*float64(ONE_PACKET)*(float64(PACKETS_PER_1SEC/MIN_SENTENCE_SPEECH_LEN))) {
|
||
|
rmscheck := rms.RMS(s.voiceBuf[:s.voiceBufCur])
|
||
|
fmt.Printf(">>>108 RMS: %f, sil: %d, cur: %d val: %d\n", rmscheck, s.silencenum, s.voiceBufCur, s.validnum)
|
||
|
//fmt.Printf(">>> RMS: %f, sil: %d, cur: %d val: %d\n", rmscheck, s.silencenum, s.voiceBufCur, s.validnum)
|
||
|
if rmscheck < MIN_RMS_LEVEL {
|
||
|
fmt.Printf(">>>111 RMS: %f, sil: %d, cur: %d val: %d\n", rmscheck, s.silencenum, s.voiceBufCur, s.validnum)
|
||
|
return "", rmscheck, icserror.ICSERRSTTFailEmpty
|
||
|
}
|
||
|
l.Printf(icslog.LOG_LEVEL_INFO, -1, "Start STT. Voice legnth: %d", s.voiceBufCur)
|
||
|
if strings.Compare(conf.STTConfig.Name, "TEST") == 0 {
|
||
|
result, err = sttSim(s.voiceBuf[:s.voiceBufCur])
|
||
|
} else if strings.Compare(conf.STTConfig.Name, "GOOGLE") == 0 {
|
||
|
result, err = stt(s.voiceBuf[:s.voiceBufCur])
|
||
|
} else if strings.Compare(conf.STTConfig.Name, "SELVAS") == 0 {
|
||
|
stts, cerr := NewSTTS(conf.STTConfig.SrcIP, conf.STTConfig.Port)
|
||
|
if cerr != nil {
|
||
|
l.Printf(icslog.LOG_LEVEL_ERROR, -1, "Connect STT ERROR - %s", cerr)
|
||
|
}
|
||
|
result, err = stts.SendSTT(s.voiceBuf[:s.voiceBufCur])
|
||
|
} else if strings.Compare(conf.STTConfig.Name, "READSPEAKER") == 0 {
|
||
|
result, err = stt(s.voiceBuf[:s.voiceBufCur]) // modify readspeaker
|
||
|
}
|
||
|
|
||
|
l.Printf(icslog.LOG_LEVEL_INFO, -1, "End STT - %s", result)
|
||
|
fmt.Printf(">>>116 recog result: %s, RMS: %f, sil: %d, cur: %d val: %d\n", result, rmscheck, s.silencenum, s.voiceBufCur, s.validnum)
|
||
|
return result, rmscheck, err
|
||
|
//return "AAABBB", rmscheck, icserror.ICSERRSTTOK
|
||
|
} else if s.voiceBufCur >= MAX_SENTENCE_SPEECH_LEN_INBYTE {
|
||
|
//return "RESULTRESULTRESULTRESULT bbb", rms, icserror.ICSERRSTTOK
|
||
|
rmscheck := rms.RMS(s.voiceBuf[:s.voiceBufCur])
|
||
|
fmt.Printf("@@@117 RMS: %f, sil: %d, cur: %d val: %d\n", rmscheck, s.silencenum, s.voiceBufCur, s.validnum)
|
||
|
if rmscheck < MIN_RMS_LEVEL {
|
||
|
return "", rmscheck, icserror.ICSERRSTTFailEmpty
|
||
|
}
|
||
|
if strings.Compare(conf.STTConfig.Name, "TEST") == 0 {
|
||
|
result, err = sttSim(s.voiceBuf[:s.voiceBufCur])
|
||
|
} else if strings.Compare(conf.STTConfig.Name, "GOOGLE") == 0 {
|
||
|
result, err = stt(s.voiceBuf[:s.voiceBufCur])
|
||
|
} else if strings.Compare(conf.STTConfig.Name, "SELVAS") == 0 {
|
||
|
stts, cerr := NewSTTS(conf.STTConfig.SrcIP, conf.STTConfig.Port)
|
||
|
if cerr != nil {
|
||
|
l.Printf(icslog.LOG_LEVEL_ERROR, -1, "Connect STT ERROR - %s", cerr)
|
||
|
}
|
||
|
result, err = stts.SendSTT(s.voiceBuf[:s.voiceBufCur])
|
||
|
} else if strings.Compare(conf.STTConfig.Name, "READSPEAKER") == 0 {
|
||
|
result, err = stt(s.voiceBuf[:s.voiceBufCur]) // modify readspeaker
|
||
|
}
|
||
|
|
||
|
return result, rmscheck, err
|
||
|
//return "CCCDDD", rmscheck, icserror.ICSERRSTTOK
|
||
|
}
|
||
|
|
||
|
return "", rmsval, icserror.ICSERRSTTContinue
|
||
|
}
|
||
|
|
||
|
func sttSim(voicedata []byte) (string, *icserror.IcsError) {
|
||
|
return "통화내용이 녹음됩니다 있습니다", icserror.ICSERRSTTOK
|
||
|
}
|
||
|
|
||
|
func stt(voicedata []byte) (string, *icserror.IcsError) {
|
||
|
ctx := context.Background()
|
||
|
var trstring string
|
||
|
|
||
|
// Creates a client.
|
||
|
client, err := speech.NewClient(ctx)
|
||
|
if err != nil {
|
||
|
//log.Fatalf("Failed to create client: %v", err)
|
||
|
//fmt.Printf("Failed to create client: %v\n", err)
|
||
|
icserror.ICSERRSTTFail.SetError(err)
|
||
|
return "", icserror.ICSERRSTTFail
|
||
|
}
|
||
|
defer client.Close()
|
||
|
|
||
|
// Detects speech in the audio buffer.
|
||
|
resp, err := client.Recognize(ctx, &speechpb.RecognizeRequest{
|
||
|
Config: &speechpb.RecognitionConfig{
|
||
|
Encoding: speechpb.RecognitionConfig_LINEAR16,
|
||
|
SampleRateHertz: 8000,
|
||
|
LanguageCode: "ko-KR",
|
||
|
},
|
||
|
Audio: &speechpb.RecognitionAudio{
|
||
|
AudioSource: &speechpb.RecognitionAudio_Content{Content: voicedata},
|
||
|
},
|
||
|
})
|
||
|
if err != nil {
|
||
|
//log.Fatalf("failed to recognize: %v", err)
|
||
|
//fmt.Printf("failed to recognize: %v\n", err)
|
||
|
icserror.ICSERRSTTFail.SetError(err)
|
||
|
return "", icserror.ICSERRSTTFail
|
||
|
}
|
||
|
|
||
|
// Prints the results.
|
||
|
for _, result := range resp.Results {
|
||
|
for _, alt := range result.Alternatives {
|
||
|
//fmt.Printf("\"%v\" (confidence=%3f)\n", alt.Transcript, alt.Confidence)
|
||
|
trstring = alt.Transcript
|
||
|
return trstring, icserror.ICSERRSTTOK
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if len(trstring) <= 0 {
|
||
|
//fmt.Printf("No Recog Result %+v\n", resp)
|
||
|
return trstring, icserror.ICSERRSTTFail
|
||
|
}
|
||
|
|
||
|
return trstring, icserror.ICSERRSTTOK
|
||
|
}
|
||
|
|
||
|
func sttLongVoice(uri string) (string, *icserror.IcsError) {
|
||
|
ctx := context.Background()
|
||
|
|
||
|
// Creates a client.
|
||
|
client, err := speech.NewClient(ctx)
|
||
|
if err != nil {
|
||
|
//log.Fatalf("Failed to create client: %v", err)
|
||
|
//fmt.Printf("Failed to create client: %v\n", err)
|
||
|
icserror.ICSERRSTTFail.SetError(err)
|
||
|
return "", icserror.ICSERRSTTFail
|
||
|
}
|
||
|
defer client.Close()
|
||
|
|
||
|
// Send the contents of the audio file with the encoding and
|
||
|
// and sample rate information to be transcripted.
|
||
|
req := &speechpb.LongRunningRecognizeRequest{
|
||
|
Config: &speechpb.RecognitionConfig{
|
||
|
Encoding: speechpb.RecognitionConfig_LINEAR16,
|
||
|
SampleRateHertz: 8000,
|
||
|
LanguageCode: "ko-KR",
|
||
|
},
|
||
|
Audio: &speechpb.RecognitionAudio{
|
||
|
AudioSource: &speechpb.RecognitionAudio_Uri{Uri: uri},
|
||
|
},
|
||
|
}
|
||
|
|
||
|
op, err := client.LongRunningRecognize(ctx, req)
|
||
|
if err != nil {
|
||
|
//fmt.Printf("failed to recognize: %v\n", err)
|
||
|
icserror.ICSERRSTTFail.SetError(err)
|
||
|
return "", icserror.ICSERRSTTFail
|
||
|
}
|
||
|
_, err = op.Wait(ctx)
|
||
|
//resp, err := op.Wait(ctx)
|
||
|
if err != nil {
|
||
|
//fmt.Printf("failed to recognize: %v\n", err)
|
||
|
icserror.ICSERRSTTFail.SetError(err)
|
||
|
return "", icserror.ICSERRSTTFail
|
||
|
}
|
||
|
|
||
|
// Print the results.
|
||
|
/*
|
||
|
for _, result := range resp.Results {
|
||
|
for _, alt := range result.Alternatives {
|
||
|
fmt.Printf("\"%v\" (confidence=%3f)\n", alt.Transcript, alt.Confidence)
|
||
|
}
|
||
|
}
|
||
|
*/
|
||
|
|
||
|
return "", nil
|
||
|
}
|
||
|
|
||
|
func STTUri(uri string) (string, *icserror.IcsError) {
|
||
|
//return stt(voicebuf)
|
||
|
return sttLongVoice(uri)
|
||
|
}
|
||
|
|
||
|
func STTFile(filepath string) (string, *icserror.IcsError) {
|
||
|
voicebuf := make([]byte, 1024*1024*10)
|
||
|
file, _ := os.OpenFile(filepath, os.O_RDONLY, 0666)
|
||
|
vlen, _ := file.Read(voicebuf)
|
||
|
defer file.Close()
|
||
|
|
||
|
return stt(voicebuf[:vlen])
|
||
|
}
|
||
|
|
||
|
func (s STT) Save(filename string, path string) string {
|
||
|
var url string
|
||
|
l := icslog.GetIcsLog()
|
||
|
burl := []byte(path)
|
||
|
saveYn := icsconf.GetIcsConfig().VoiceConfig.SaveYn
|
||
|
|
||
|
// stt save N, return url
|
||
|
if saveYn == "N" {
|
||
|
return url
|
||
|
}
|
||
|
|
||
|
defer func() {
|
||
|
l := icslog.GetIcsLog()
|
||
|
if err := recover(); err != nil {
|
||
|
switch v := err.(type) {
|
||
|
case error:
|
||
|
l.Printf(icslog.LOG_LEVEL_WARN, -1, "PANIC! [%s] %s|%+v|%+v", v, path, burl, string(debug.Stack()))
|
||
|
}
|
||
|
}
|
||
|
}()
|
||
|
|
||
|
//TODO: check path's accessibility and make path
|
||
|
ymd := fmt.Sprintf("%d%02d%02d", time.Now().Year(), int(time.Now().Month()), time.Now().Day())
|
||
|
dPath := fmt.Sprintf("%s/%s", path, ymd)
|
||
|
|
||
|
if burl[len(burl)-1] == byte('/') {
|
||
|
url = fmt.Sprintf("%s%s%d.pcm", dPath, filename, time.Now().UnixNano())
|
||
|
//fmt.Println(">>>>STT save URL", s.voiceBufCur, url)
|
||
|
} else {
|
||
|
url = fmt.Sprintf("%s/%s%d.pcm", dPath, filename, time.Now().UnixNano())
|
||
|
//fmt.Println("####STT save URL", s.voiceBufCur, url)
|
||
|
}
|
||
|
if saveYn == "N" {
|
||
|
return url
|
||
|
}
|
||
|
|
||
|
// TODO - voice file path
|
||
|
//var oerr error
|
||
|
oerr := os.MkdirAll(filepath.Dir(url), 0777)
|
||
|
if oerr != nil {
|
||
|
icserror.ICSERRMakeDir.SetError(oerr)
|
||
|
icserror.ICSERRMakeDir.PrintWithCaller(0)
|
||
|
fmt.Printf("Make Dir Error : %s", oerr)
|
||
|
}
|
||
|
|
||
|
werr := os.WriteFile(url, s.voiceBuf[:s.voiceBufCur], 0644)
|
||
|
if werr != nil {
|
||
|
l.Printf(icslog.LOG_LEVEL_ERROR, -1, " Write STT File Error - %s \n", werr)
|
||
|
fmt.Printf("File Write Error : %s \n", werr)
|
||
|
}
|
||
|
|
||
|
// os.WriteFile(url, s.voiceBuf[:s.voiceBufCur], 0644)
|
||
|
|
||
|
/*
|
||
|
err := os.WriteFile(url, s.voiceBuf[:s.voiceBufCur], 0644)
|
||
|
if err != nil {
|
||
|
fmt.Println(url, "STT save error", err)
|
||
|
}
|
||
|
*/
|
||
|
|
||
|
return url
|
||
|
}
|
||
|
|
||
|
////////////////////////////////////////////
|
||
|
//new stt result
|
||
|
func NewSTTResult(result string, err *icserror.IcsError) *STTResult {
|
||
|
return &STTResult{result, err}
|
||
|
}
|
||
|
|
||
|
func (s STTResult) GetResult() string {
|
||
|
return s.result
|
||
|
}
|
||
|
|
||
|
func (s STTResult) GetError() *icserror.IcsError {
|
||
|
return s.err
|
||
|
}
|