// file: $isip_ifc/class/sp/SignalDetector/SignalDetector.h
// version: $Id: SignalDetector.h 10214 2005-08-15 05:01:23Z stanley $
//

// make sure definitions are only made once
//
#ifndef ISIP_SIGNAL_DETECTOR
#define ISIP_SIGNAL_DETECTOR

// isip include files
//
// NOTE(review): every #include directive below has no operand in this
// copy of the file (presumably the <...> header names were stripped
// during extraction). the header names must be restored from the
// original before this file can be compiled - TODO confirm.
//
#ifndef ISIP_VECTOR
#include
#endif

#ifndef ISIP_CIRCULAR_BUFFER
#include
#endif

#ifndef ISIP_VECTOR_FLOAT
#include
#endif

#ifndef ISIP_BOOLEAN
#include
#endif

#ifndef ISIP_DEBUG_LEVEL
#include
#endif

// SignalDetector: This class is used to detect the start and stop
// times of a segment of data in a stream of data. The audio data,
// which is represented as a multichannel sequence of floating point
// data, is handled as a vector of vector floats. The audio data is
// passed to this class in variable-sized amounts of data. Methods
// are provided to determine the state of the detection, including when
// a valid signal has been detected.
//
// NOTE(review): many Vector declarations in this class have no template
// argument list in this copy of the file (for example "Vector > cbuf_d"
// and "Vector states_d"). the element types appear to have been lost
// and cannot be recovered from this header alone - restore them from
// the original source.
//
class SignalDetector {

  //---------------------------------------------------------------------------
  //
  // public constants
  //
  //---------------------------------------------------------------------------
public:

  // define the class name
  //
  static const String CLASS_NAME;

  //----------------------------------------
  //
  // other important constants
  //
  //----------------------------------------

  // define the algorithm choices:
  //  the default algorithm is ENERGY
  //
  enum ALGORITHM { ENERGY = 0, ENERGY_ZC , DEF_ALGORITHM = ENERGY };

  // define the implementation choices:
  //  DB_POWER is the only implementation and is also the default
  //
  enum IMPLEMENTATION { DB_POWER = 0, DEF_IMPLEMENTATION = DB_POWER };

  // define the precision of the endpoints:
  //  the endpoints are successively refined. the default is COARSE.
  //
  enum PRECISION { COARSE = 0, MEDIUM, FINE, DEF_PRECISION = COARSE };

  // define the static objects:
  //  presumably these map the ALGORITHM and IMPLEMENTATION enumerations
  //  to their names - verify against the definitions
  //
  static const NameMap ALGO_MAP;
  static const NameMap IMPL_MAP;

  //----------------------------------------
  //
  // i/o related constants
  //
  //----------------------------------------

  // constants relating to implementation and algorithm.
  //
  static const String DEF_PARAM;
  static const String PARAM_ALGORITHM;
  static const String PARAM_IMPLEMENTATION;

  // constants relating to signal processing
  //
  static const String PARAM_NUM_CHANNELS;
  static const String PARAM_SAMPLE_FREQUENCY;
  static const String PARAM_SAMPLE_NUM_BYTES;
  static const String PARAM_CHANNEL_TO_BE_PROCESSED;
  static const String PARAM_FRAME_DURATION;
  static const String PARAM_WINDOW_DURATION;

  // constants relating to preprocessing of the audio data
  //
  static const String PARAM_PREEMPHASIS;

  // constants related to the output signal
  //
  static const String PARAM_PAD_TIME;

  // this section contains data common to all algorithms:
  //  the basic assumption in this class is that the signal
  //  and noise have different durational properties
  //

  // constants relating to the energy levels
  //
  static const String PARAM_SIGNAL_NOMINAL_LEVEL;
  static const String PARAM_SIGNAL_ADAPTATION_DELTA;
  static const String PARAM_SIGNAL_ADAPTATION_CONSTANT;

  // constants relating to noise levels
  //
  static const String PARAM_NOISE_NOMINAL_LEVEL;
  static const String PARAM_NOISE_ADAPTATION_DELTA;
  static const String PARAM_NOISE_ADAPTATION_CONSTANT;
  static const String PARAM_NOISE_FLOOR;

  // constants related to utterance duration and separation
  //
  static const String PARAM_UTTERANCE_MINIMUM_DURATION;
  static const String PARAM_UTTERANCE_MAXIMUM_DURATION;
  static const String PARAM_UTTERANCE_MINIMUM_SEPARATION;
  static const String PARAM_UTTERANCE_DELTA;

  // this section contains data for a specific algorithm
  //
  // algorithm: energy_zc
  // implementation: db_power
  // description: combine a zero-crossing measure with energy
  //
  static const String PARAM_ZC_UTTERANCE_THRESH;
  static const String PARAM_ZC_NEGATIVE_THRESH;
  static const String PARAM_ZC_POSITIVE_THRESH;

  // constants for debug level
  //
  static const String PARAM_DBGL;

  //----------------------------------------
  //
  // default values and arguments
  //
  //----------------------------------------

  // NOTE(review): several defaults below are float32/float64 static
  // constants initialized inside the class body. if float32/float64 are
  // floating-point typedefs (not visible in this header), this form
  // relies on a compiler extension - confirm against the toolchain.
  //

  // constants relating to signal processing
  //
  static const int32 DEF_NUM_CHANNELS = 1;
  static const float32 DEF_SAMPLE_FREQUENCY = 16000.0;
  static const int32 DEF_SAMPLE_NUM_BYTES = 2;
  static const int32 DEF_CHANNEL_TO_BE_PROCESSED = 1;
  static const float32 DEF_FRAME_DURATION = 0.020;
  static const float32 DEF_WINDOW_DURATION = 0.030;
  static const int32 DEF_CBUF_LENGTH = 3000000;

  // constants relating to preprocessing of the audio data
  //
  static const float32 DEF_PREEMPHASIS = 0.95;

  // constants related to the output signal
  //
  static const float32 DEF_PAD_TIME = 0.10;

  // this section contains data common to all algorithms:
  //  the basic assumption in this class is that the signal
  //  and noise have different durational properties
  //

  // constants relating to the energy levels
  //
  static const float32 DEF_SIGNAL_NOMINAL_LEVEL = -30.0;
  static const float32 DEF_SIGNAL_ADAPTATION_DELTA = 5.0;
  static const float32 DEF_SIGNAL_ADAPTATION_CONSTANT = 0.50;

  // constants relating to noise levels
  //
  static const float32 DEF_NOISE_NOMINAL_LEVEL = -40.0;
  static const float32 DEF_NOISE_ADAPTATION_DELTA = 5.0;
  static const float32 DEF_NOISE_ADAPTATION_CONSTANT = 0.75;
  static const float32 DEF_NOISE_FLOOR = -80.0;

  // constants related to utterance duration and separation
  //
  static const float32 DEF_UTTERANCE_MINIMUM_DURATION = 0.2;
  static const float32 DEF_UTTERANCE_MINIMUM_SEPARATION = 0.1;
  static const float32 DEF_UTTERANCE_MAXIMUM_DURATION = 100.0;
  static const float32 DEF_UTTERANCE_DELTA = 10.0;

  // state machine related parameters:
  //  the basic approach in this class is to use a state
  //  machine to keep track of the previous state of the
  //  process. we use a state machine with four states
  //  and we use the three previous states to make decisions.
  //  the state names presumably abbreviate noise, noise-to-signal,
  //  signal-to-noise and signal - verify against the implementation.
  //
  static const int32 SM_NSTATES = 4;
  static const int32 SM_HISTORY = 3;
  static const int32 SM_STATE_NOIS = 0;
  static const int32 SM_STATE_NTOS = 1;
  static const int32 SM_STATE_STON = 2;
  static const int32 SM_STATE_SIGN = 3;
  static const float32 SM_WGT_NOIS = 0.0;
  static const float32 SM_WGT_SIGN = 1.0;
  static const float32 SM_WGT_TRAN = 0.5;

  // sample value scaling:
  //  1/DEF_SAMPLE_SCALE_FACTOR = 32767, so multiplying by this
  //  factor normalizes the sample values
  //
  static const float64 DEF_SAMPLE_SCALE_FACTOR = 3.051850e-05;

  // this section contains data for a specific algorithm
  //
  // algorithm: energy_zc
  // implementation: db_power
  // description: combine a zero-crossing measure with energy
  //
  static const int32 DEF_ZC_UTTERANCE_THRESH = 0;
  static const int32 DEF_ZC_POSITIVE_THRESH = 0;
  static const int32 DEF_ZC_NEGATIVE_THRESH = 0;

  //----------------------------------------
  //
  // error codes
  //
  //----------------------------------------
  static const int32 ERR = 80500;
  static const int32 ERR_AUDIO_BUFFER_OVERFLOW = 80510;
  static const int32 ERR_ENERGY_BUFFER_OVERFLOW = 80520;
  static const int32 ERR_ZC_BUFFER_OVERFLOW = 80530;
  static const int32 ERR_OUT_OF_BOUNDS = 80540;

  //---------------------------------------------------------------------------
  //
  // protected data
  //
  //---------------------------------------------------------------------------
protected:

  // algorithm name
  //
  ALGORITHM algorithm_d;

  // implementation name
  //
  IMPLEMENTATION implementation_d;

  // static memory manager:
  //  used by the operator new/delete overloads declared below
  //
  static MemoryManager mgr_d;

  // debugging parameters
  //
  DebugLevel debug_level_d;

  // this flag is used to determine if the object's init method
  // needs to be called. the inline constructor and the inline set
  // methods in this header all reset it to false.
  //
  bool8 is_valid_d;

  // circular buffer which is used to hold audio data
  //
  Vector > cbuf_d;

  //----------------------------------------------------
  //
  // define parameters that are written to the parameter file
  //
  //----------------------------------------------------

  // variables relating to signal processing
  //
  Long num_channels_d;
  Float sample_frequency_d;
  Long sample_num_bytes_d;
  Long channel_to_be_processed_d;
  Float frame_duration_d;
  Float window_duration_d;

  // variables relating to preprocessing of the audio data
  //
  Float preemphasis_d;

  // variable related to the output signal
  //
  Float pad_time_d;

  // signal level-related energy parameters
  //
  Float signal_nominal_level_d;
  Float signal_adaptation_delta_d;
  Float signal_adaptation_constant_d;

  // noise level-related energy parameters
  //
  Float noise_nominal_level_d;
  Float noise_adaptation_delta_d;
  Float noise_adaptation_constant_d;
  Float noise_floor_d;

  // utterance-related parameters
  //
  Float utterance_minimum_duration_d;
  Float utterance_maximum_duration_d;
  Float utterance_minimum_separation_d;
  Float utterance_delta_d;

  // adaptive thresholds
  //
  Float noise_threshold_d;
  Float signal_threshold_d;

  // these parameters are calculated from the parameter-file
  // values declared above
  //

  // variables relating to signal processing
  //
  int32 frame_sample_duration_d;
  int32 window_sample_duration_d;

  // utterance-related parameters
  //
  int32 utterance_minimum_sample_duration_d;
  int32 utterance_minimum_sample_separation_d;

  //----------------------------------------------------
  //
  // define parameters that are common to all algorithms
  // and implementations
  //
  //----------------------------------------------------

  // state machine
  //
  Vector states_d;
  Vector durations_d;

  // useful counters
  //
  Vector num_frame_d;

  // define variables to handle the energy values
  //
  Vector > egy_d;

  // length of the energy buffer
  //
  int32 egy_len_d;
  Vector egy_cur_d;
  Vector egy_rlse_d;
  Vector egy_data_window_d;

  // energy patterns used to detect signal or noise
  //
  Vector sig_patn_d;
  int32 sig_patn_len_d;
  Vector nse_patn_d;
  int32 nse_patn_len_d;

  // information that tracks the state of the detection process
  //
  Vector utt_in_progress_d;

  // keeps track of the number of endpoints
  //
  Vector num_endpoints_d;
  Vector start_points_d;
  Vector stop_points_d;
  Vector utt_coarse_beg_d;
  Vector utt_coarse_end_d;
  Vector utt_egy_beg_d;
  Vector utt_egy_end_d;
  Vector utt_zc_beg_d;
  Vector utt_zc_end_d;

  // algorithm: energy_zc
  // implementation: db_power
  // description: combine a zero-crossing measure with energy
  //

  // define variables to handle the zero crossing values
  //
  Vector > zc_d;
  Vector zc_data_window_d;
  Long zc_utterance_thresh_d;
  Long zc_negative_thresh_d;
  Long zc_positive_thresh_d;

  // useful scale factors:
  //  win_dur_scale_factor_d = 1/(number of samples in a window of data)
  //  this value is used in the energy calculation
  //
  Float win_dur_scale_factor_d;

  //---------------------------------------------------------------------------
  //
  // required public methods
  //
  //---------------------------------------------------------------------------
public:

  // method: name
  //
  // returns a reference to the static class name (CLASS_NAME)
  //
  static const String& name() {
    return CLASS_NAME;
  }

  // other static methods
  //
  static bool8 diagnose(Integral::DEBUG debug_level);

  // debug methods:
  //  setDebug is inherited from the base class
  //
  // NOTE(review): no base class is declared for SignalDetector in this
  // header and no setDebug method is declared here - confirm whether
  // the statement above is still accurate.
  //
  bool8 debug(const unichar* msg) const;

  // method: destructor
  //
  ~SignalDetector();

  // method: constructor
  //
  // stores the given algorithm and implementation choices and marks
  // the object as not valid (is_valid_d = false). no other member is
  // assigned in this body.
  //
  SignalDetector(ALGORITHM algorithm, IMPLEMENTATION implementation) {
    algorithm_d = algorithm;
    implementation_d = implementation;
    is_valid_d = false;
  }

  // default constructor
  //
  SignalDetector();

  // method: copy constructor
  //
  // delegates to assign(arg); the return value of assign is ignored
  //
  SignalDetector(const SignalDetector& arg) {
    assign(arg);
  }

  // assign methods
  //
  bool8 assign(const SignalDetector& arg);

  // method: operator=
  //
  // delegates to assign(arg) and returns *this; the return value of
  // assign is ignored and there is no explicit self-assignment check
  // in this body
  //
  SignalDetector& operator= (const SignalDetector& arg) {
    assign(arg);
    return *this;
  }

  // i/o methods
  //
  int32 sofSize() const;
  bool8 read(Sof& sof, int32 tag, const String& name = CLASS_NAME);
  bool8 write(Sof& sof, int32 tag, const String& name = CLASS_NAME) const;
  bool8 readData(Sof& sof, const String& pname = DEF_PARAM, int32 size = SofParser::FULL_OBJECT, bool8 param = true, bool8 nested = false);
  bool8 writeData(Sof& sof, const String& pname = DEF_PARAM) const;

  // equality methods
  //
  bool8 eq(const SignalDetector& arg) const;

  // method: new
  //
  // returns mgr_d.get().
  // NOTE(review): the size argument is not used in this body -
  // presumably the memory manager hands out fixed-size blocks; verify.
  //
  static void* operator new(size_t size) {
    return mgr_d.get();
  }

  // method: new[]
  //
  // returns mgr_d.getBlock(size)
  //
  static void* operator new[](size_t size) {
    return mgr_d.getBlock(size);
  }

  // method: delete
  //
  // passes ptr to mgr_d.release
  //
  static void operator delete(void* ptr) {
    mgr_d.release(ptr);
  }

  // method: delete[]
  //
  // passes ptr to mgr_d.releaseBlock
  //
  static void operator delete[](void* ptr) {
    mgr_d.releaseBlock(ptr);
  }

  // method: setGrowSize
  //
  // forwards grow_size to mgr_d.setGrow and returns its result
  //
  static bool8 setGrowSize(int32 grow_size) {
    return mgr_d.setGrow(grow_size);
  }

  // other memory management methods
  //
  bool8 clear(Integral::CMODE ctype = Integral::DEF_CMODE);

  // method to set the parser
  //
  bool8 setParser(SofParser* parser);

  //---------------------------------------------------------------------------
  //
  // class-specific public methods:
  //  set methods
  //
  //---------------------------------------------------------------------------

  // method: setAlgorithm
  //
  // stores the algorithm choice, marks the object as not valid
  // (is_valid_d = false) and always returns true
  //
  bool8 setAlgorithm(ALGORITHM algorithm) {
    algorithm_d = algorithm;
    is_valid_d = false;
    return true;
  }

  // method: setImplementation
  //
  // stores the implementation choice, marks the object as not valid
  // (is_valid_d = false) and always returns true
  //
  bool8 setImplementation(IMPLEMENTATION implementation) {
    implementation_d = implementation;
    is_valid_d = false;
    return true;
  }

  // method: set
  //
  // stores both the algorithm and the implementation choices (the
  // arguments default to DEF_ALGORITHM and DEF_IMPLEMENTATION), marks
  // the object as not valid (is_valid_d = false) and always returns true
  //
  bool8 set(ALGORITHM algorithm = DEF_ALGORITHM, IMPLEMENTATION implementation = DEF_IMPLEMENTATION) {
    algorithm_d = algorithm;
    implementation_d = implementation;
    is_valid_d = false;
    return true;
  }

  // other set methods
  //
  bool8 setdefaultParameters();
  bool8 setNumChannels(int32 num_channels);
  bool8 setSampleNumBytes(int32 sample_num_bytes);
  bool8 setSampleFrequency(float32 sample_rate);
  bool8 setChannelToBeProcessed(int32 chan);
  bool8 setFrameDuration(float32 fram_dur);
  bool8 setWindowDuration(float32 win_dur);
  bool8 setPreemphasis(float32 premphasis);
  bool8 setSigNominalLevel(float32 nom_sig_lev);
  bool8 setSigAdaptDelta(float32 sig_delta);
  bool8 setSigAdaptConst(float32 signal_adapt);
  bool8 setNoiseNominalLevel(float32 nom_noise_lev);
  bool8 setNoiseAdaptDelta(float32 noise_delta);
  bool8 setNoiseAdaptConst(float32 noise_adapt);
  bool8 setNoiseFloor(float32 noise_floor);
  bool8 setUttDelta(float32 utt_delta);
  bool8 setMinUttDur(float32 min_utt_dur);
  bool8 setMinUttSep(float32 min_utt_sep);
  bool8 setMaxUttDur(float32 max_utt_dur);
  bool8 setZcUttThreshold(int32 zc_utt_thresh);
  bool8 setZcNegThreshold(int32 zc_neg_thresh);
  bool8 setZcPosThreshold(int32 zc_pos_thresh);
  bool8 setPadTime(float32 pad_time);

  // TODO
  //
  bool8 setUttStatus(bool8 utt_status);

  //---------------------------------------------------------------------------
  //
  // class-specific public methods:
  //  get methods
  //
  //---------------------------------------------------------------------------

  // method: getAlgorithm
  //
  // returns the stored algorithm choice
  //
  ALGORITHM getAlgorithm() const {
    return algorithm_d;
  }

  // method: getImplementation
  //
  // returns the stored implementation choice
  //
  IMPLEMENTATION getImplementation() const {
    return implementation_d;
  }

  // method: get
  //
  // copies the stored algorithm and implementation choices into the
  // two output arguments and always returns true. note that, unlike
  // getAlgorithm and getImplementation, this method is not declared
  // const.
  //
  bool8 get(ALGORITHM& algorithm, IMPLEMENTATION& implementation) {
    algorithm = algorithm_d;
    implementation = implementation_d;
    return true;
  }

  // other get methods:
  //  none of these methods is declared const
  //
  // NOTE(review): getSampleFrequency and getPadTime are declared to
  // return int32 although the matching set methods take float32 and
  // the corresponding members are of type Float - confirm whether the
  // integer return type (and any resulting truncation) is intended.
  //
  int32 getNumChannels();
  int32 getSampleNumBytes();
  int32 getSampleFrequency();
  int32 getChannelToBeProcessed();
  float32 getFrameDuration();
  float32 getWindowDuration();
  float32 getPreemphasis();
  float32 getSigNominalLevel();
  float32 getSigAdaptDelta();
  float32 getSigAdaptConst();
  float32 getNoiseNominalLevel();
  float32 getNoiseAdaptDelta();
  float32 getNoiseAdaptConst();
  float32 getNoiseFloor();
  float32 getUttDelta();
  float32 getMinUttDur();
  float32 getMinUttSep();
  float32 getMaxUttDur();
  int32 getZcUttThreshold();
  int32 getZcNegThreshold();
  int32 getZcPosThreshold();
  int32 getPadTime();
  Vector getUttStatus();
  bool8 getEndTime(VectorFloat& start_time, VectorFloat& end_time, int32 channel_index_a);

  //---------------------------------------------------------------------------
  //
  // class-specific public methods:
  //  detection methods
  //
  //---------------------------------------------------------------------------

  // this method initializes the detector and clears all buffers
  //
  bool8 init();

  // this method clears all circular buffers only if an utterance is not
  // in progress. it is used to prevent the detector from accumulating
  // large amounts of memory while it is processing noise.
  //
  bool8 flush();

  // this method resets the detector to the start state in which it
  // can begin processing a new utterance
  //
  bool8 reset(int32 channel_index_a);

  // these methods process an incoming buffer of data. note
  // that data can be passed to this algorithm in variable sized
  // chunks. the apply method accepts multichannel data; the
  // compute method processes a single channel of data.
  //
  int32 apply(Vector audio_data);
  int32 compute(int32 channel_index_a);

  // these methods allow a user to get the current endpoints
  // while the detection algorithm is in progress. these methods
  // can be used to get the endpoints once a valid utterance has
  // been found.
  //
  bool8 computeEndPoints(PRECISION precision, int32 channel_index_a);
  bool8 computeEndTime(float32& start_time, float32& stop_time, int32 channel_index_a);
  bool8 computeStartTime(float32& start_time, int32 channel_index_a);

  //---------------------------------------------------------------------------
  //
  // private methods
  //
  //---------------------------------------------------------------------------
private:

  // methods common to all algorithms and implementations
  //

  // update all adaptive thresholds
  //
  bool8 updateThresholds(float64 egy);

  // state machine related methods
  //
  bool8 stateMachineReset(int32 channel_index_a);
  int32 stateMachineAdvance(float32 egy, int32 channel_index_a);

  // circular buffer related methods
  //
  // NOTE(review): cbZeroCrossingRelease is the only method in this
  // group that takes no channel index - confirm this is intended.
  //
  int32 cbScanForPattern(Vector pattern, int32 len, int32 start_frame, int32 channel_index_a);
  bool8 cbSmoothSignal(int32 channel_index_a);
  bool8 cbSmoothNoise(int32 channel_index_a);
  bool8 cbSmoothTransitions(int32 channel_index_a);
  bool8 cbUttInProgress(int32 channel_index_a);
  bool8 cbUttNotInProgress(int32 channel_index_a);
  bool8 cbRemoveShortSignalBursts(int32 channel_index_a);
  bool8 cbRemoveShortNoiseBursts(int32 channel_index_a);
  bool8 cbRemoveLongNoiseBursts(int32 channel_index_a);
  bool8 cbPurge(int32 channel_index_a);
  bool8 cbRelease(int32 frame_index, int32 channel_index_a);
  bool8 cbZeroCrossingRelease(int32 frame_index);

  // description: standard energy computation
  //
  float32 computeEnergy(VectorFloat signal, int32 channel_index_a);

  // algorithm: energy_zc
  // implementation: db_power
  // description: combine a zero-crossing measure with energy
  //
  int32 computeZeroCrossingRate(VectorFloat signal, int32 channel_index_a);
};

// end of include file
//
#endif