Call Center SDK  1.11.3
speech_summary.h
1 /* Copyright 2018 ID R&D Inc. All Rights Reserved. */
2 
3 #pragma once
4 
5 #include <voicesdk/core/config.h>
6 #include <voicesdk/core/common/intervals.h>
7 
8 #include <iostream>
9 #include <vector>
10 #include <memory>
11 
12 namespace voicesdk {
13 
17  struct SpeechEvent {
21  bool isVoice = false;
22 
26  AudioInterval audioInterval;
27 
28  SpeechEvent() = default;
29 
32 
33  friend std::ostream& operator<<(std::ostream& os, const SpeechEvent& obj) {
34  os << "SpeechEvent["
35  << "isVoice:" << obj.isVoice << ", "
36  << "audioInterval:" << obj.audioInterval << "]";
37  return os;
38  }
39 
40  bool operator==(const SpeechEvent& other) const {
41  return other.isVoice == isVoice
42  && other.audioInterval == audioInterval;
43  }
44  };
45 
46  struct SpeechInfo {
50  float speechLengthMs = 0.0f;
51 
55  float backgroundLengthMs = 0.0f;
56 
60  float totalLengthMs = 0.0f;
61 
62  SpeechInfo() = default;
63 
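64  SpeechInfo(float speechLengthMs, float backgroundLengthMs, float totalLengthMs) :
65  speechLengthMs(speechLengthMs),
66  backgroundLengthMs(backgroundLengthMs),
67  totalLengthMs(totalLengthMs)
68  {}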
69 
70  friend std::ostream& operator<<(std::ostream& os, const SpeechInfo& obj) {
71  os << "SpeechInfo["
72  << "speechLengthMs:" << obj.speechLengthMs << ", "
73  << "backgroundLengthMs:" << obj.backgroundLengthMs << ", "
74  << "totalLengthMs:" << obj.totalLengthMs << "]";
75  return os;
76  }
77 
78  bool operator==(const SpeechInfo& other) const {
79  return other.speechLengthMs == speechLengthMs
80  && other.backgroundLengthMs == backgroundLengthMs
81  && other.totalLengthMs == totalLengthMs;
82  }
83  };
84 
90  struct SpeechSummary {
94  std::vector<SpeechEvent> speechEvents;
95 
99  SpeechInfo speechInfo;
100 
101  SpeechSummary() = default;
102 
103  SpeechSummary(const std::vector<SpeechEvent>& speechEvents, const SpeechInfo& speechInfo) :
104  speechEvents(speechEvents),
105  speechInfo(speechInfo)
106  {}
107 
108  friend std::ostream& operator<<(std::ostream& os, const SpeechSummary& obj) {
109  os << "SpeechSummary["
110  << "speechEvents.size():" << obj.speechEvents.size() << ", "
111  << "speechInfo:" << obj.speechInfo << "]";
112  return os;
113  }
114 
115  bool operator==(const SpeechSummary& other) const {
116  return other.speechInfo == speechInfo
117  && other.speechEvents == speechEvents;
118  }
119  };
120 
126  class VOICE_SDK_API SpeechSummaryStream {
127  public:
128 
129  using Ptr = std::shared_ptr<SpeechSummaryStream>;
130 
134  virtual void reset() = 0;
135 
140  virtual void finalize() = 0;
141 
146  virtual bool hasSpeechEvents() const = 0;
147 
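154  virtual SpeechEvent getSpeechEvent() = 0;
155 
160  virtual SpeechSummary getTotalSpeechSummary() const = 0;
161 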
166  virtual SpeechInfo getTotalSpeechInfo() const = 0;
167 
173  virtual float getCurrentBackgroundLength() const = 0;
174 
181  virtual void addSamples(
182  const uint8_t *bytes,
183  size_t bytesNum) = 0;
184 
191  virtual void addSamples(
192  const int16_t *pcm16Samples,
193  size_t samplesNum) = 0;
194 
201  virtual void addSamples(
202  const float *floatSamples,
203  size_t samplesNum) = 0;
204 
208  virtual bool compare(const SpeechSummaryStream::Ptr& other) const = 0;
209 
210  virtual ~SpeechSummaryStream() = default;
211  };
212 
217  class VOICE_SDK_API SpeechSummaryStreamOpus {
218  public:
219 
220  using Ptr = std::shared_ptr<SpeechSummaryStreamOpus>;
221 
230  static SpeechSummaryStreamOpus::Ptr create(const std::string& initPath, size_t sampleRate);
231 
235  virtual void reset() = 0;
236 
241  virtual void finalize() = 0;
242 
247  virtual bool hasSpeechEvents() const = 0;
248 
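255  virtual SpeechEvent getSpeechEvent() = 0;
256 
261  virtual SpeechSummary getTotalSpeechSummary() const = 0;
262 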
267  virtual SpeechInfo getTotalSpeechInfo() const = 0;
268 
274  virtual float getCurrentBackgroundLength() const = 0;
275 
283  virtual void addPacket(
284  const uint8_t* bytes,
285  size_t bytesNum) = 0;
286 
290  virtual bool compare(const SpeechSummaryStreamOpus::Ptr& other) const = 0;
291 
292  virtual ~SpeechSummaryStreamOpus() = default;
293  };
294 
298  class VOICE_SDK_API SpeechSummaryEngine {
299  public:
300 
301  using Ptr = std::shared_ptr<SpeechSummaryEngine>;
302 
310  static SpeechSummaryEngine::Ptr create(const std::string& initPath = "");
311 
312  virtual ~SpeechSummaryEngine() = default;
313 
322  virtual SpeechSummary getSpeechSummary(
323  const uint8_t *bytes,
324  size_t bytesNum,
325  size_t sampleRate) = 0;
326 
335  virtual SpeechSummary getSpeechSummary(
336  const int16_t *pcm16Samples,
337  size_t samplesNum,
338  size_t sampleRate) = 0;
339 
348  virtual SpeechSummary getSpeechSummary(
349  const float *floatSamples,
350  size_t samplesNum,
351  size_t sampleRate) = 0;
352 
359  virtual SpeechSummary getSpeechSummary(
360  const std::string& audioFile) = 0;
361 
362  /* Speech summary stream methods */
363 
370  virtual SpeechSummaryStream::Ptr createStream(int sampleRate) = 0;
371  };
372 }
Speech summary engine class (interface), intended to calculate SpeechSummary with given audio samples...
Definition: speech_summary.h:298
virtual SpeechSummary getSpeechSummary(const float *floatSamples, size_t samplesNum, size_t sampleRate)=0
Calculates speech summary with given float audio samples (in [-1; 1] range)
static SpeechSummaryEngine::Ptr create(const std::string &initPath="")
Creates SpeechSummaryEngine instance.
virtual SpeechSummary getSpeechSummary(const int16_t *pcm16Samples, size_t samplesNum, size_t sampleRate)=0
Calculates speech summary with given PCM16 audio samples.
virtual SpeechSummary getSpeechSummary(const std::string &audioFile)=0
Calculates speech summary with given audio file.
virtual SpeechSummaryStream::Ptr createStream(int sampleRate)=0
Factory method for creating SpeechSummaryStream.
virtual SpeechSummary getSpeechSummary(const uint8_t *bytes, size_t bytesNum, size_t sampleRate)=0
Calculates speech summary with given PCM16 audio samples.
Class for online Opus audio stream processing (voice activity detection and speech statistics computa...
Definition: speech_summary.h:217
static SpeechSummaryStreamOpus::Ptr create(const std::string &initPath, size_t sampleRate)
Creates SpeechSummaryStreamOpus instance.
virtual SpeechEvent getSpeechEvent()=0
Retrieves a single speech event from output FIFO queue. Use hasSpeechEvents() to check if there is av...
virtual void reset()=0
Resets stream state: clears buffer, resets speech summary.
virtual SpeechInfo getTotalSpeechInfo() const =0
Retrieves total accumulated speech info.
virtual float getCurrentBackgroundLength() const =0
Method for retrieving current background length (length of a continuous non-speech segment starting o...
virtual bool hasSpeechEvents() const =0
Checks if there are available speech events in output queue.
virtual SpeechSummary getTotalSpeechSummary() const =0
Retrieves total accumulated speech summary.
virtual bool compare(const SpeechSummaryStreamOpus::Ptr &other) const =0
Check if resulting states of this and another stream are equal.
virtual void addPacket(const uint8_t *bytes, size_t bytesNum)=0
Adds Opus packet to process.
virtual void finalize()=0
Finalizes input audio stream to process remaining audio samples and produce result if it's possible.
Class for online audio stream processing (voice activity detection and speech statistics computation)
Definition: speech_summary.h:126
virtual SpeechSummary getTotalSpeechSummary() const =0
Retrieves total accumulated speech summary.
virtual bool hasSpeechEvents() const =0
Checks if there are available speech events in output queue.
virtual void reset()=0
Resets stream state: clears buffer, resets speech summary.
virtual bool compare(const SpeechSummaryStream::Ptr &other) const =0
Check if resulting states of this and another stream are equal.
virtual float getCurrentBackgroundLength() const =0
Method for retrieving current background length (length of a continuous non-speech segment starting o...
virtual SpeechInfo getTotalSpeechInfo() const =0
Retrieves total accumulated speech info.
virtual void addSamples(const uint8_t *bytes, size_t bytesNum)=0
Adds PCM16 audio samples to process.
virtual void addSamples(const int16_t *pcm16Samples, size_t samplesNum)=0
Adds PCM16 audio samples to process.
virtual SpeechEvent getSpeechEvent()=0
Retrieves a single speech event from output FIFO queue. Use hasSpeechEvents() to check if there is av...
virtual void finalize()=0
Finalizes input audio stream to process remaining audio samples and produce result if it's possible.
virtual void addSamples(const float *floatSamples, size_t samplesNum)=0
Adds float audio samples (in [-1; 1] range) to process.
Structure representing interval of audio data.
Definition: intervals.h:57
Definition: speech_summary.h:17
bool isVoice
Whether the interval contains speech or not.
Definition: speech_summary.h:21
AudioInterval audioInterval
Speech event audio interval.
Definition: speech_summary.h:26
Definition: speech_summary.h:46
float speechLengthMs
Speech signal length in milliseconds.
Definition: speech_summary.h:50
float backgroundLengthMs
Non-speech signal length in milliseconds.
Definition: speech_summary.h:55
float totalLengthMs
Processed audio total length (totalLengthMs = speechLengthMs + backgroundLengthMs) in milliseconds.
Definition: speech_summary.h:60
Structure containing speech statistics and audio intervals marked as speech or non-speech.
Definition: speech_summary.h:90
SpeechInfo speechInfo
Contains speech statistics.
Definition: speech_summary.h:99
std::vector< SpeechEvent > speechEvents
Contains audio intervals marked as speech or non-speech.
Definition: speech_summary.h:94