You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
voicebot/stt/stt.go

369 lines
12 KiB
Go

package stt
import (
"context"
"fmt"
"os"
"path/filepath"
"runtime/debug"
"strings"
"time"
speech "cloud.google.com/go/speech/apiv1"
"gitlab.com/cinnamon/voiceagent/icsconf"
"gitlab.com/cinnamon/voiceagent/icserror"
"gitlab.com/cinnamon/voiceagent/icslog"
"gitlab.com/cinnamon/voiceagent/recorddata"
"gitlab.com/cinnamon/voiceagent/stt/rms"
speechpb "google.golang.org/genproto/googleapis/cloud/speech/v1"
)
// AsyncCBFunc is the signature of a callback that receives a single message string.
type AsyncCBFunc func(message string)
// STT buffers incoming voice payloads and keeps the speech/silence counters
// that the STT method uses to decide when to run recognition.
type STT struct {
// voiceBuf holds the voice payload bytes appended so far.
voiceBuf []byte
// voiceBufCur is the number of bytes of voiceBuf currently in use (the next write offset).
voiceBufCur int64
// silencenum counts consecutive packets whose RMS did not exceed MIN_RMS_LEVEL.
silencenum int64
// validnum counts packets whose RMS exceeded MIN_RMS_LEVEL; it is only reset by Close.
validnum int64
//isBargein bool
// cbFunc is an optional callback stored by NewSTT or SetAsyncFunc.
// NOTE(review): nothing in the visible code invokes it - confirm against callers.
cbFunc *AsyncCBFunc
}
// STTResult pairs a recognition result string with the status/error that
// accompanied it. Both fields are read through GetResult and GetError.
type STTResult struct {
// result is the stored result text.
result string
// err is the stored status/error value; it may be nil.
err *icserror.IcsError
}
// Audio framing and utterance-detection parameters. Byte sizes follow the
// "8kHz 16bit pcm" note below, i.e. 16000 bytes per second of audio.
const (
// Longest recording handled for one utterance, in seconds.
MAX_RECORDING_SECOND = 60
// Bytes in one minute of audio (60 * 320 * 50). NewSTT and Close allocate twice this amount.
ONE_MINUTE_BUFFER_LENGTH = 960000
ONE_SEC = 1000 // milliseconds in one second
ONE_PACKET = 320 // bytes in one voice packet
PACKETS_PER_1SEC = 50 // number of voice packets per second
MIN_SENTENCE_SPEECH_LEN = 0.1 * ONE_SEC // msec. minimum speech length of a sentence. 0.1 sec = 1600 bytes
MAX_SENTENCE_SPEECH_LEN = MAX_RECORDING_SECOND * ONE_SEC // msec. maximum speech length of a sentence.
MAX_SENTENCE_SPEECH_LEN_INBYTE = MAX_RECORDING_SECOND * ONE_PACKET * 50 // bytes. maximum byte length of a sentence. 8kHz 16bit pcm
MAX_SILENCE_LEN = 0.2 * ONE_SEC // msec. maximum silence length (not referenced in the visible code)
MIN_RMS_LEVEL = 200.0 // RMS threshold separating speech from silence (previously 15.0)
// Packet count; one packet is 20 msec. STT uses it both as the number of
// consecutive silent packets that ends an utterance and as the minimum number
// of speech packets required before recognition is attempted.
MAX_CONT_VALID_RMS = 15
// Packet count; one packet is 20 msec. Only referenced by commented-out barge-in code.
MAX_CONT_VALID_RMS2 = 5
//MAX_CONT_VALID_RMS = 5 //times. 1 time is 20msec. continuously valid speech RMS
//MAX_SENTENCE_SPEECH_LEN = 9.0 * ONE_SEC //msec. max length of speech a sentence.
//MAX_SENTENCE_SPEECH_LEN_INBYTE = 9.0 * ONE_PACKET * 50 //bytes.max byte length of speech a sentence. 8kHz 16bit pcm
)
// NewSTT returns an STT with zeroed counters and a freshly allocated voice
// buffer of ONE_MINUTE_BUFFER_LENGTH*2 bytes. A non-nil cbFunc is stored by
// address; a nil cbFunc leaves the callback unset.
func NewSTT(cbFunc AsyncCBFunc) *STT {
	inst := &STT{
		voiceBuf: make([]byte, ONE_MINUTE_BUFFER_LENGTH*2),
	}
	if cbFunc == nil {
		return inst
	}
	inst.cbFunc = &cbFunc
	return inst
}
// Close resets the receiver for the next utterance: the voice buffer is
// replaced by a new zeroed buffer of ONE_MINUTE_BUFFER_LENGTH*2 bytes and the
// write offset and both RMS counters go back to zero. The callback is kept.
func (s *STT) Close() {
	s.voiceBuf = make([]byte, ONE_MINUTE_BUFFER_LENGTH*2)
	s.voiceBufCur, s.silencenum, s.validnum = 0, 0, 0
}
// GetVoiceBufCur reports how many bytes of voice data are currently buffered.
func (s STT) GetVoiceBufCur() int64 {
	buffered := s.voiceBufCur
	return buffered
}
// SetAsyncFunc replaces the stored callback pointer with cbFunc.
// Passing nil clears the callback.
func (s *STT) SetAsyncFunc(cbFunc *AsyncCBFunc) {
	s.cbFunc = cbFunc
}
// STT consumes one packet of voice data and, once the utterance is judged
// complete, sends the buffered audio to the configured STT engine.
//
// Every call first classifies the packet by its RMS: above MIN_RMS_LEVEL it
// counts as speech (validnum is incremented, silencenum is reset), otherwise
// silencenum is incremented. Then exactly one of the following applies:
//
//   - Buffering: fewer than MAX_CONT_VALID_RMS consecutive silent packets have
//     been seen and the buffer still has room. The payload is appended and
//     ICSERRSTTContinue is returned together with the packet's RMS.
//   - End of utterance: at least MAX_CONT_VALID_RMS speech packets were seen,
//     at least MAX_CONT_VALID_RMS consecutive silent packets followed, and more
//     than the minimum number of bytes is buffered. The buffer is recognized.
//   - Maximum length: the buffer is full. The buffer is recognized.
//   - Otherwise ICSERRSTTContinue is returned and the packet is dropped.
//
// When recognition is attempted the returned float64 is the RMS of the whole
// buffer; if that RMS is below MIN_RMS_LEVEL, ICSERRSTTFailEmpty is returned
// without calling the engine. The buffer and counters are not reset here; that
// is done by Close.
//
// The maximum-length case is now the exact complement of the buffering
// condition. It used to require voiceBufCur >= MAX_SENTENCE_SPEECH_LEN_INBYTE,
// which cannot be reached with ONE_PACKET-sized payloads because buffering
// stops one packet earlier.
func (s *STT) STT(voicedata *recorddata.VoiceDataHDR) (string, float64, *icserror.IcsError) {
	l := icslog.GetIcsLog()

	rmsval := rms.RMS(voicedata.Payload)
	if rmsval > MIN_RMS_LEVEL {
		s.validnum++
		s.silencenum = 0
	} else {
		s.silencenum++
	}

	// Minimum amount of buffered audio required before a silence-terminated
	// utterance is sent for recognition.
	minSpeechBytes := int64(float64(5) * float64(ONE_PACKET) * (float64(PACKETS_PER_1SEC / MIN_SENTENCE_SPEECH_LEN)))
	bufferFull := s.voiceBufCur >= MAX_SENTENCE_SPEECH_LEN_INBYTE-ONE_PACKET

	switch {
	case s.silencenum < MAX_CONT_VALID_RMS && !bufferFull:
		// Copy into whatever room is left and advance by the number of bytes
		// actually copied, so an oversized payload cannot slice out of range.
		n := copy(s.voiceBuf[s.voiceBufCur:], voicedata.Payload)
		s.voiceBufCur += int64(n)

	case s.validnum >= MAX_CONT_VALID_RMS && s.silencenum >= MAX_CONT_VALID_RMS && s.voiceBufCur > minSpeechBytes:
		rmscheck := rms.RMS(s.voiceBuf[:s.voiceBufCur])
		fmt.Printf(">>>108 RMS: %f, sil: %d, cur: %d val: %d\n", rmscheck, s.silencenum, s.voiceBufCur, s.validnum)
		if rmscheck < MIN_RMS_LEVEL {
			fmt.Printf(">>>111 RMS: %f, sil: %d, cur: %d val: %d\n", rmscheck, s.silencenum, s.voiceBufCur, s.validnum)
			return "", rmscheck, icserror.ICSERRSTTFailEmpty
		}
		l.Printf(icslog.LOG_LEVEL_INFO, -1, "Start STT. Voice legnth: %d", s.voiceBufCur)
		result, err := s.recognizeBuffered()
		l.Printf(icslog.LOG_LEVEL_INFO, -1, "End STT - %s", result)
		fmt.Printf(">>>116 recog result: %s, RMS: %f, sil: %d, cur: %d val: %d\n", result, rmscheck, s.silencenum, s.voiceBufCur, s.validnum)
		return result, rmscheck, err

	case bufferFull:
		rmscheck := rms.RMS(s.voiceBuf[:s.voiceBufCur])
		fmt.Printf("@@@117 RMS: %f, sil: %d, cur: %d val: %d\n", rmscheck, s.silencenum, s.voiceBufCur, s.validnum)
		if rmscheck < MIN_RMS_LEVEL {
			return "", rmscheck, icserror.ICSERRSTTFailEmpty
		}
		result, err := s.recognizeBuffered()
		return result, rmscheck, err
	}

	return "", rmsval, icserror.ICSERRSTTContinue
}

// recognizeBuffered sends the audio buffered so far to the engine selected by
// STTConfig.Name and returns the engine's transcript and status.
//
// A failed SELVAS connection returns ICSERRSTTFail instead of calling SendSTT
// on the client that could not be created.
// NOTE(review): an unrecognized engine name yields an empty transcript with a
// nil status, as before - confirm callers tolerate a nil status.
func (s *STT) recognizeBuffered() (string, *icserror.IcsError) {
	conf := icsconf.GetIcsConfig()
	audio := s.voiceBuf[:s.voiceBufCur]
	name := conf.STTConfig.Name

	switch {
	case strings.Compare(name, "TEST") == 0:
		return sttSim(audio)
	case strings.Compare(name, "GOOGLE") == 0:
		return stt(audio)
	case strings.Compare(name, "SELVAS") == 0:
		stts, cerr := NewSTTS(conf.STTConfig.SrcIP, conf.STTConfig.Port)
		if cerr != nil {
			icslog.GetIcsLog().Printf(icslog.LOG_LEVEL_ERROR, -1, "Connect STT ERROR - %s", cerr)
			return "", icserror.ICSERRSTTFail
		}
		return stts.SendSTT(audio)
	case strings.Compare(name, "READSPEAKER") == 0:
		// READSPEAKER currently goes through the same path as GOOGLE.
		return stt(audio)
	}
	return "", nil
}
// sttSim is the simulated engine used when the configured STT name is "TEST".
// It ignores voicedata and always returns the same canned transcript with
// ICSERRSTTOK.
func sttSim(voicedata []byte) (string, *icserror.IcsError) {
	const cannedTranscript = "통화내용이 녹음됩니다 있습니다"
	return cannedTranscript, icserror.ICSERRSTTOK
}
// stt sends voicedata (LINEAR16, 8000 Hz, language "ko-KR") to Google Cloud
// Speech with a synchronous Recognize call and returns the transcript.
//
// The response holds one result per sequential portion of the audio. The most
// probable (first) alternative of every result is concatenated in order, so
// the transcript covers the whole buffer instead of only its first portion.
//
// Returns:
//   - (transcript, ICSERRSTTOK) when at least some text was recognized;
//   - ("", ICSERRSTTFail) when the client cannot be created, the request
//     fails, or nothing was recognized. For the first two cases the underlying
//     error is attached to ICSERRSTTFail via SetError.
//
// NOTE(review): a new client is created and closed on every call, and
// SetError mutates the shared ICSERRSTTFail value - confirm this is acceptable
// under concurrent calls.
func stt(voicedata []byte) (string, *icserror.IcsError) {
	ctx := context.Background()

	client, err := speech.NewClient(ctx)
	if err != nil {
		icserror.ICSERRSTTFail.SetError(err)
		return "", icserror.ICSERRSTTFail
	}
	defer client.Close()

	resp, err := client.Recognize(ctx, &speechpb.RecognizeRequest{
		Config: &speechpb.RecognitionConfig{
			Encoding:        speechpb.RecognitionConfig_LINEAR16,
			SampleRateHertz: 8000,
			LanguageCode:    "ko-KR",
		},
		Audio: &speechpb.RecognitionAudio{
			AudioSource: &speechpb.RecognitionAudio_Content{Content: voicedata},
		},
	})
	if err != nil {
		icserror.ICSERRSTTFail.SetError(err)
		return "", icserror.ICSERRSTTFail
	}

	// Alternatives are ordered by accuracy, so index 0 is the best hypothesis
	// for that portion of the audio. The pieces are joined exactly as returned.
	var transcript strings.Builder
	for _, result := range resp.Results {
		if len(result.Alternatives) == 0 {
			continue
		}
		transcript.WriteString(result.Alternatives[0].Transcript)
	}

	if transcript.Len() == 0 {
		return "", icserror.ICSERRSTTFail
	}
	return transcript.String(), icserror.ICSERRSTTOK
}
// sttLongVoice runs a long-running Google Cloud Speech recognition on the
// audio located at uri (LINEAR16, 8000 Hz, language "ko-KR"), waits for the
// operation to finish and returns the transcript.
//
// The transcript is the concatenation, in order, of the first alternative of
// every result in the response. Previously the response was discarded and an
// empty string was always returned.
//
// Returns:
//   - (transcript, nil) on success; the transcript is empty when nothing was
//     recognized;
//   - ("", ICSERRSTTFail) when the client cannot be created, the request
//     cannot be started, or waiting for the operation fails. The underlying
//     error is attached via SetError.
//
// NOTE(review): success is reported with a nil status here, whereas stt
// reports ICSERRSTTOK. The nil is kept so existing callers are not affected -
// confirm which convention callers of STTUri expect.
func sttLongVoice(uri string) (string, *icserror.IcsError) {
	ctx := context.Background()

	client, err := speech.NewClient(ctx)
	if err != nil {
		icserror.ICSERRSTTFail.SetError(err)
		return "", icserror.ICSERRSTTFail
	}
	defer client.Close()

	// The audio is referenced by URI rather than uploaded inline.
	req := &speechpb.LongRunningRecognizeRequest{
		Config: &speechpb.RecognitionConfig{
			Encoding:        speechpb.RecognitionConfig_LINEAR16,
			SampleRateHertz: 8000,
			LanguageCode:    "ko-KR",
		},
		Audio: &speechpb.RecognitionAudio{
			AudioSource: &speechpb.RecognitionAudio_Uri{Uri: uri},
		},
	}

	op, err := client.LongRunningRecognize(ctx, req)
	if err != nil {
		icserror.ICSERRSTTFail.SetError(err)
		return "", icserror.ICSERRSTTFail
	}

	resp, err := op.Wait(ctx)
	if err != nil {
		icserror.ICSERRSTTFail.SetError(err)
		return "", icserror.ICSERRSTTFail
	}

	var transcript strings.Builder
	for _, result := range resp.Results {
		if len(result.Alternatives) == 0 {
			continue
		}
		transcript.WriteString(result.Alternatives[0].Transcript)
	}
	return transcript.String(), nil
}
// STTUri recognizes the audio located at uri by delegating to sttLongVoice and
// passes that function's transcript and status through unchanged.
func STTUri(uri string) (string, *icserror.IcsError) {
	transcript, status := sttLongVoice(uri)
	return transcript, status
}
// STTFile reads the audio file at filepath and recognizes its contents with
// stt. At most the first 10 MiB of the file are sent, matching the previous
// fixed-size read buffer.
//
// If the file cannot be read, ("", ICSERRSTTFail) is returned with the
// underlying error attached via SetError. Previously open and read errors were
// ignored and an empty buffer was sent to the engine.
//
// Note: the parameter name shadows the imported path/filepath package inside
// this function; the package is not needed here.
func STTFile(filepath string) (string, *icserror.IcsError) {
	const maxAudioBytes = 1024 * 1024 * 10

	voicebuf, err := os.ReadFile(filepath)
	if err != nil {
		icserror.ICSERRSTTFail.SetError(err)
		return "", icserror.ICSERRSTTFail
	}
	if len(voicebuf) > maxAudioBytes {
		voicebuf = voicebuf[:maxAudioBytes]
	}
	return stt(voicebuf)
}
// Save writes the buffered voice data to
// <path>/<YYYYMMDD>/<filename><unix-nanoseconds>.pcm and returns that location.
//
// Returns "" without touching the file system when VoiceConfig.SaveYn is "N"
// or when path is empty. Otherwise the location is returned even if creating
// the directory or writing the file fails; those failures are only logged.
//
// Fixes over the previous version:
//   - a path ending in '/' no longer loses the separator between the date
//     directory and the file name (it produced "<path>//<YYYYMMDD><filename>...");
//   - an empty path is rejected explicitly instead of relying on an
//     index-out-of-range panic being recovered;
//   - the date and the nanosecond suffix come from a single timestamp, so the
//     date cannot change between reading year, month and day;
//   - any recovered panic value is logged, not only values of type error.
//
// NOTE(review): the directory is created with mode 0777 as before - confirm
// that world-writable directories are intended.
func (s STT) Save(filename string, path string) string {
	l := icslog.GetIcsLog()

	if icsconf.GetIcsConfig().VoiceConfig.SaveYn == "N" {
		return ""
	}
	if path == "" {
		l.Printf(icslog.LOG_LEVEL_WARN, -1, "STT save skipped - empty path for %s", filename)
		return ""
	}

	// Saving is best effort: a panic is logged and "" is returned to the caller.
	defer func() {
		if r := recover(); r != nil {
			l.Printf(icslog.LOG_LEVEL_WARN, -1, "PANIC! [%s] %s|%+v|%+v", r, path, []byte(path), string(debug.Stack()))
		}
	}()

	now := time.Now()
	url := fmt.Sprintf("%s/%s/%s%d.pcm", strings.TrimRight(path, "/"), now.Format("20060102"), filename, now.UnixNano())

	if oerr := os.MkdirAll(filepath.Dir(url), 0777); oerr != nil {
		icserror.ICSERRMakeDir.SetError(oerr)
		icserror.ICSERRMakeDir.PrintWithCaller(0)
		fmt.Printf("Make Dir Error : %s", oerr)
	}

	if werr := os.WriteFile(url, s.voiceBuf[:s.voiceBufCur], 0644); werr != nil {
		l.Printf(icslog.LOG_LEVEL_ERROR, -1, " Write STT File Error - %s \n", werr)
		fmt.Printf("File Write Error : %s \n", werr)
	}

	return url
}
////////////////////////////////////////////
//new stt result
// NewSTTResult wraps a result string and its status in a newly allocated
// STTResult.
func NewSTTResult(result string, err *icserror.IcsError) *STTResult {
	wrapped := &STTResult{
		result: result,
		err:    err,
	}
	return wrapped
}
// GetResult returns the stored result text.
func (s STTResult) GetResult() string {
	text := s.result
	return text
}
// GetError returns the stored status/error value, which may be nil.
func (s STTResult) GetError() *icserror.IcsError {
	status := s.err
	return status
}