// Package stt buffers incoming PCM voice packets, decides from per-packet RMS
// levels when an utterance has ended, and hands the buffered audio to the
// speech-to-text engine selected in the configuration.
package stt

import (
	"context"
	"fmt"
	"os"
	"path/filepath"
	"runtime/debug"
	"strings"
	"time"

	speech "cloud.google.com/go/speech/apiv1"
	"gitlab.com/cinnamon/voiceagent/icsconf"
	"gitlab.com/cinnamon/voiceagent/icserror"
	"gitlab.com/cinnamon/voiceagent/icslog"
	"gitlab.com/cinnamon/voiceagent/recorddata"
	"gitlab.com/cinnamon/voiceagent/stt/rms"
	speechpb "google.golang.org/genproto/googleapis/cloud/speech/v1"
)

// AsyncCBFunc is the signature of the callback that can be attached to an STT.
// The callback is only stored by the code in this file; it is not invoked here.
type AsyncCBFunc func(message string)

// STT accumulates the voice packets of one utterance together with the
// counters used for end-of-speech detection.
type STT struct {
	voiceBuf    []byte // buffered audio; only voiceBuf[:voiceBufCur] is valid
	voiceBufCur int64  // number of valid bytes in voiceBuf
	silencenum  int64  // consecutive packets at or below MIN_RMS_LEVEL
	validnum    int64  // rms counter: packets above MIN_RMS_LEVEL since the last Close
	//isBargein bool

	cbFunc *AsyncCBFunc
}

// STTResult pairs a recognition result string with its status.
type STTResult struct {
	result string
	err    *icserror.IcsError
}

const (
	MAX_RECORDING_SECOND     = 60
	ONE_MINUTE_BUFFER_LENGTH = 960000
	ONE_SEC                  = 1000 // 1000 msec
	ONE_PACKET               = 320  // bytes
	PACKETS_PER_1SEC         = 50   // packet num

	MIN_SENTENCE_SPEECH_LEN        = 0.1 * ONE_SEC                      // msec. minimum length of speech in a sentence. 0.1sec=1600 bytes
	MAX_SENTENCE_SPEECH_LEN        = MAX_RECORDING_SECOND * ONE_SEC     // msec. max length of speech in a sentence.
	MAX_SENTENCE_SPEECH_LEN_INBYTE = MAX_RECORDING_SECOND * ONE_PACKET * 50 // bytes. max byte length of speech in a sentence. 8kHz 16bit pcm
	MAX_SILENCE_LEN                = 0.2 * ONE_SEC                      // msec. max silence length
	MIN_RMS_LEVEL                  = 200.0                              // 15.0 // silence threshold
	MAX_CONT_VALID_RMS             = 15                                 // times. 1 time is 20msec. continuously valid speech RMS
	MAX_CONT_VALID_RMS2            = 5                                  // times. 1 time is 20msec. continuously valid speech RMS

	// Previously tried values, kept for reference:
	//MAX_CONT_VALID_RMS             = 5                     // times. 1 time is 20msec. continuously valid speech RMS
	//MAX_SENTENCE_SPEECH_LEN        = 9.0 * ONE_SEC         // msec. max length of speech in a sentence.
	//MAX_SENTENCE_SPEECH_LEN_INBYTE = 9.0 * ONE_PACKET * 50 // bytes. max byte length of speech in a sentence. 8kHz 16bit pcm
)

// NewSTT returns an STT with zeroed counters and a freshly allocated voice
// buffer of ONE_MINUTE_BUFFER_LENGTH*2 bytes. A nil cbFunc leaves the callback
// unset.
func NewSTT(cbFunc AsyncCBFunc) *STT {
	// Named "recognizer" so the local does not shadow the package function stt.
	recognizer := STT{
		voiceBuf: make([]byte, ONE_MINUTE_BUFFER_LENGTH*2),
	}
	if cbFunc != nil {
		recognizer.cbFunc = &cbFunc
	}
	return &recognizer
}

// Close resets the counters and replaces the voice buffer with a new zeroed
// one, so the STT can be used for the next utterance.
func (s *STT) Close() {
	s.voiceBufCur = 0
	s.silencenum = 0
	s.validnum = 0
	s.voiceBuf = make([]byte, ONE_MINUTE_BUFFER_LENGTH*2)
}

// GetVoiceBufCur returns the number of bytes currently buffered.
func (s STT) GetVoiceBufCur() int64 {
	return s.voiceBufCur
}

// SetAsyncFunc replaces the stored callback pointer.
func (s *STT) SetAsyncFunc(cbFunc *AsyncCBFunc) {
	s.cbFunc = cbFunc
}

// STT consumes one voice packet and, when the utterance is judged complete,
// runs speech recognition on everything buffered so far.
//
// Returns (text, rms, status):
//   - While the utterance is still in progress the packet is appended to the
//     buffer (or dropped, see below) and the result is
//     ("", packet RMS, ICSERRSTTContinue).
//   - Once at least MAX_CONT_VALID_RMS voiced packets have been seen, followed
//     by at least MAX_CONT_VALID_RMS consecutive silent packets, or once the
//     buffer has reached its maximum length, the buffered audio is recognized
//     and the result is (text, RMS of the whole buffer, engine status).
//   - If the RMS of the whole buffer is below MIN_RMS_LEVEL the engine is not
//     called and the result is ("", buffer RMS, ICSERRSTTFailEmpty).
//
// The buffer and counters are not reset here; Close does that.
func (s *STT) STT(voicedata *recorddata.VoiceDataHDR) (string, float64, *icserror.IcsError) {
	// Minimum amount of buffered audio before end-of-speech is accepted.
	// Evaluates to 800 bytes with the current constants; same value as the
	// original inline float expression.
	const minSpeechBytes = int64(5 * ONE_PACKET * PACKETS_PER_1SEC / MIN_SENTENCE_SPEECH_LEN)
	// Appending stops one packet short of the maximum sentence length.
	const bufferFullAt = MAX_SENTENCE_SPEECH_LEN_INBYTE - ONE_PACKET

	rmsval := rms.RMS(voicedata.Payload)
	if rmsval > MIN_RMS_LEVEL {
		s.validnum++
		s.silencenum = 0
	} else {
		s.silencenum++
	}

	switch {
	// Still inside the utterance: keep buffering.
	case s.silencenum < MAX_CONT_VALID_RMS && s.voiceBufCur < bufferFullAt:
		n := int64(len(voicedata.Payload))
		copy(s.voiceBuf[s.voiceBufCur:s.voiceBufCur+n], voicedata.Payload)
		s.voiceBufCur += n

	// Enough speech followed by enough silence: end of utterance.
	case s.validnum >= MAX_CONT_VALID_RMS && s.silencenum >= MAX_CONT_VALID_RMS && s.voiceBufCur > minSpeechBytes:
		rmscheck := rms.RMS(s.voiceBuf[:s.voiceBufCur])
		fmt.Printf(">>>108 RMS: %f, sil: %d, cur: %d val: %d\n", rmscheck, s.silencenum, s.voiceBufCur, s.validnum)
		if rmscheck < MIN_RMS_LEVEL {
			fmt.Printf(">>>111 RMS: %f, sil: %d, cur: %d val: %d\n", rmscheck, s.silencenum, s.voiceBufCur, s.validnum)
			return "", rmscheck, icserror.ICSERRSTTFailEmpty
		}

		l := icslog.GetIcsLog()
		l.Printf(icslog.LOG_LEVEL_INFO, -1, "Start STT. Voice legnth: %d", s.voiceBufCur)
		result, err := s.recognizeBuffered()
		l.Printf(icslog.LOG_LEVEL_INFO, -1, "End STT - %s", result)
		fmt.Printf(">>>116 recog result: %s, RMS: %f, sil: %d, cur: %d val: %d\n", result, rmscheck, s.silencenum, s.voiceBufCur, s.validnum)
		return result, rmscheck, err

	// Buffer is full. The threshold must match the append guard above:
	// buffering stops at bufferFullAt, so comparing against the full
	// MAX_SENTENCE_SPEECH_LEN_INBYTE (as the code used to) could never be true
	// for packets of ONE_PACKET bytes and over-long utterances were never
	// recognized.
	case s.voiceBufCur >= bufferFullAt:
		rmscheck := rms.RMS(s.voiceBuf[:s.voiceBufCur])
		fmt.Printf("@@@117 RMS: %f, sil: %d, cur: %d val: %d\n", rmscheck, s.silencenum, s.voiceBufCur, s.validnum)
		if rmscheck < MIN_RMS_LEVEL {
			return "", rmscheck, icserror.ICSERRSTTFailEmpty
		}
		result, err := s.recognizeBuffered()
		return result, rmscheck, err
	}

	// Either the packet was buffered, or it was dropped because the
	// end-of-speech criteria are not met yet.
	return "", rmsval, icserror.ICSERRSTTContinue
}

// recognizeBuffered sends the buffered audio to the engine named by
// STTConfig.Name. An unrecognized engine name yields ("", nil).
func (s *STT) recognizeBuffered() (string, *icserror.IcsError) {
	conf := icsconf.GetIcsConfig()
	voice := s.voiceBuf[:s.voiceBufCur]

	switch conf.STTConfig.Name {
	case "TEST":
		return sttSim(voice)
	case "GOOGLE":
		return stt(voice)
	case "SELVAS":
		stts, cerr := NewSTTS(conf.STTConfig.SrcIP, conf.STTConfig.Port)
		if cerr != nil {
			// Do not use the connection after a failed connect.
			icslog.GetIcsLog().Printf(icslog.LOG_LEVEL_ERROR, -1, "Connect STT ERROR - %s", cerr)
			return "", icserror.ICSERRSTTFail
		}
		return stts.SendSTT(voice)
	case "READSPEAKER":
		return stt(voice) // modify readspeaker
	}
	return "", nil
}

// sttSim is the "TEST" engine: it ignores the audio and returns a fixed
// sentence with ICSERRSTTOK.
func sttSim(voicedata []byte) (string, *icserror.IcsError) {
	return "통화내용이 녹음됩니다 있습니다", icserror.ICSERRSTTOK
}

// stt runs synchronous Google Cloud Speech recognition on voicedata, sent as
// LINEAR16 at 8000 Hz with language "ko-KR".
//
// It returns the first alternative of the first result that has one, with
// ICSERRSTTOK. Client creation errors, request errors and an empty result set
// all return ("", ICSERRSTTFail); for the first two the underlying error is
// attached to ICSERRSTTFail via SetError.
func stt(voicedata []byte) (string, *icserror.IcsError) {
	ctx := context.Background()

	// Creates a client.
	client, err := speech.NewClient(ctx)
	if err != nil {
		icserror.ICSERRSTTFail.SetError(err)
		return "", icserror.ICSERRSTTFail
	}
	defer client.Close()

	// Detects speech in the audio buffer.
	resp, err := client.Recognize(ctx, &speechpb.RecognizeRequest{
		Config: &speechpb.RecognitionConfig{
			Encoding:        speechpb.RecognitionConfig_LINEAR16,
			SampleRateHertz: 8000,
			LanguageCode:    "ko-KR",
		},
		Audio: &speechpb.RecognitionAudio{
			AudioSource: &speechpb.RecognitionAudio_Content{Content: voicedata},
		},
	})
	if err != nil {
		icserror.ICSERRSTTFail.SetError(err)
		return "", icserror.ICSERRSTTFail
	}

	for _, result := range resp.Results {
		if len(result.Alternatives) > 0 {
			return result.Alternatives[0].Transcript, icserror.ICSERRSTTOK
		}
	}
	// No result carried any alternative.
	return "", icserror.ICSERRSTTFail
}

// sttLongVoice runs Google Cloud Speech long-running recognition on the audio
// at uri (LINEAR16, 8000 Hz, "ko-KR") and waits for it to finish.
//
// On success it returns the first alternative of every result, trimmed and
// joined with single spaces, together with a nil status. The nil status on
// success is kept as it was so existing callers that test the status against
// nil keep working. Failures return ("", ICSERRSTTFail) with the underlying
// error attached via SetError.
func sttLongVoice(uri string) (string, *icserror.IcsError) {
	ctx := context.Background()

	// Creates a client.
	client, err := speech.NewClient(ctx)
	if err != nil {
		icserror.ICSERRSTTFail.SetError(err)
		return "", icserror.ICSERRSTTFail
	}
	defer client.Close()

	// Send the location of the audio with the encoding and sample rate
	// information to be transcribed.
	req := &speechpb.LongRunningRecognizeRequest{
		Config: &speechpb.RecognitionConfig{
			Encoding:        speechpb.RecognitionConfig_LINEAR16,
			SampleRateHertz: 8000,
			LanguageCode:    "ko-KR",
		},
		Audio: &speechpb.RecognitionAudio{
			AudioSource: &speechpb.RecognitionAudio_Uri{Uri: uri},
		},
	}

	op, err := client.LongRunningRecognize(ctx, req)
	if err != nil {
		icserror.ICSERRSTTFail.SetError(err)
		return "", icserror.ICSERRSTTFail
	}

	resp, err := op.Wait(ctx)
	if err != nil {
		icserror.ICSERRSTTFail.SetError(err)
		return "", icserror.ICSERRSTTFail
	}

	// The response used to be discarded, so callers always got an empty string.
	parts := make([]string, 0, len(resp.Results))
	for _, result := range resp.Results {
		if len(result.Alternatives) == 0 {
			continue
		}
		if text := strings.TrimSpace(result.Alternatives[0].Transcript); text != "" {
			parts = append(parts, text)
		}
	}
	return strings.Join(parts, " "), nil
}

// STTUri recognizes the audio stored at uri using long-running recognition.
func STTUri(uri string) (string, *icserror.IcsError) {
	return sttLongVoice(uri)
}

// STTFile reads the audio file at filepath and recognizes it with stt. At most
// the first 10 MiB of the file are sent. A file that cannot be read returns
// ("", ICSERRSTTFail) with the read error attached via SetError.
func STTFile(filepath string) (string, *icserror.IcsError) {
	const maxVoiceFileBytes = 1024 * 1024 * 10

	voicebuf, err := os.ReadFile(filepath)
	if err != nil {
		icserror.ICSERRSTTFail.SetError(err)
		return "", icserror.ICSERRSTTFail
	}
	if len(voicebuf) > maxVoiceFileBytes {
		voicebuf = voicebuf[:maxVoiceFileBytes]
	}
	return stt(voicebuf)
}

// Save writes the buffered audio to
// <path>/<YYYYMMDD>/<filename><unix-nanoseconds>.pcm, creating the directory
// if needed, and returns that file path.
//
// It returns "" without writing anything when VoiceConfig.SaveYn is "N" or
// when path is empty. Directory and write errors are logged but do not change
// the returned path. A panic while saving is logged and results in "".
func (s STT) Save(filename string, path string) string {
	l := icslog.GetIcsLog()

	// Saving disabled by configuration.
	if icsconf.GetIcsConfig().VoiceConfig.SaveYn == "N" {
		return ""
	}

	defer func() {
		if r := recover(); r != nil {
			l.Printf(icslog.LOG_LEVEL_WARN, -1, "PANIC! [%v] %s|%+v|%+v", r, path, []byte(path), string(debug.Stack()))
		}
	}()

	if path == "" {
		l.Printf(icslog.LOG_LEVEL_WARN, -1, "STT save skipped: empty path")
		return ""
	}

	// Sample the clock once so the date directory and the timestamp in the
	// file name always agree.
	now := time.Now()
	name := fmt.Sprintf("%s%d.pcm", filename, now.UnixNano())
	// filepath.Join copes with a trailing slash on path. The hand-built
	// version dropped the separator between the date and the file name in
	// that case.
	url := filepath.Join(path, now.Format("20060102"), name)

	if oerr := os.MkdirAll(filepath.Dir(url), 0777); oerr != nil {
		icserror.ICSERRMakeDir.SetError(oerr)
		icserror.ICSERRMakeDir.PrintWithCaller(0)
		fmt.Printf("Make Dir Error : %s", oerr)
	}

	if werr := os.WriteFile(url, s.voiceBuf[:s.voiceBufCur], 0644); werr != nil {
		l.Printf(icslog.LOG_LEVEL_ERROR, -1, " Write STT File Error - %s \n", werr)
		fmt.Printf("File Write Error : %s \n", werr)
	}

	return url
}

////////////////////////////////////////////
// new stt result

// NewSTTResult wraps a recognition result string and its status.
func NewSTTResult(result string, err *icserror.IcsError) *STTResult {
	return &STTResult{result: result, err: err}
}

// GetResult returns the recognition result string.
func (s STTResult) GetResult() string {
	return s.result
}

// GetError returns the status stored with the result.
func (s STTResult) GetError() *icserror.IcsError {
	return s.err
}