SphinxBase 0.6
src/libsphinxad/cont_ad_base.c
00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
00002 /* ====================================================================
00003  * Copyright (c) 1999-2001 Carnegie Mellon University.  All rights
00004  * reserved.
00005  *
00006  * Redistribution and use in source and binary forms, with or without
00007  * modification, are permitted provided that the following conditions
00008  * are met:
00009  *
00010  * 1. Redistributions of source code must retain the above copyright
00011  *    notice, this list of conditions and the following disclaimer. 
00012  *
00013  * 2. Redistributions in binary form must reproduce the above copyright
00014  *    notice, this list of conditions and the following disclaimer in
00015  *    the documentation and/or other materials provided with the
00016  *    distribution.
00017  *
00018  * This work was supported in part by funding from the Defense Advanced 
00019  * Research Projects Agency and the National Science Foundation of the 
00020  * United States of America, and the CMU Sphinx Speech Consortium.
00021  *
00022  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 
00023  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
00024  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00025  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
00026  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
00027  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
00028  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
00029  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
00030  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
00031  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
00032  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00033  *
00034  * ====================================================================
00035  *
00036  */
00037 /*
00038  * cont_ad.c -- Continuous A/D listening and silence filtering module.
00039  * 
00040  * HISTORY
00041  * 
00042  * $Log: cont_ad_base.c,v $
00043  * Revision 1.14  2005/07/02 03:51:32  rkm
00044  * Slowed down power histogram decay rate
00045  *
00046  * Revision 1.13  2005/06/30 00:27:17  rkm
00047  * Fixed silence handling in rawmode; added extra state variables
00048  *
00049  * 
00050  * 28-Jun-2005  M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University.
00051  *              - Changed rawmode handling to simply copy data even for silence
00052  *              segments.
00053  *              - Moved definitions of CONT_AD_STATE_{SIL,SPEECH} from .c to .h.
00054  * 
00055  * Revision 1.12  2005/06/29 23:48:04  egouvea
00056  * Revert changes: variables defined in cont_ad_base.c should not be accessible by the application
00057  *
00058  * Revision 1.10  2005/02/13 01:29:48  rkm
00059  * Fixed cont_ad_read to never cross sil/speech boundary, and rawmode
00060  *
00061  * Revision 1.9  2005/02/01 22:21:19  rkm
00062  * Added raw data logging, and raw data pass-through mode to cont_ad
00063  *
00064  * Revision 1.8  2004/07/23 23:36:34  egouvea
00065  * Ravi's merge, with the latest fixes in the FSG code, and making the log files generated by FSG, LM, and allphone have the same 'look and feel', with the backtrace information presented consistently
00066  *
00067  * 23-Jul-2004  M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
00068  *              Changed default adapt_rate from 0.5 to 0.2.
00069  *
00070  * Revision 1.7  2004/07/16 00:57:12  egouvea
00071  * Added Ravi's implementation of FSG support.
00072  *
00073  * Revision 1.2  2004/06/23 20:31:18  rkm
00074  * Added adapt_rate parameter; restructured frame processing to include threshold update
00075  *
00076  * 
00077  * 23-Oct-98    M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
00078  *              Small change in the way the noiselevel is updated in find_thresh().
00079  * 
00080  * 26-Aug-98    M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
00081  *              Separated computation of "frame power" into a separate low-level
00082  *              function.
00083  * 
00084  * 13-Jul-98    M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
00085  *              Modified to allow frame size to depend on audio sampling rate.
00086  * 
00087  * 01-Jul-98    M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
00088  *              Changed CONT_AD_DELTA_SPEECH back to 20.
00089  * 
00090  * 30-Jun-98    M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
00091  *              Changed CONT_AD_DELTA_SPEECH from 10 to 15.
00092  *              Added FILE* argument to cont_ad_powhist_dump().
00093  * 
00094  * 19-Jun-98    M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
00095  *              Changed CONT_AD_DELTA_SPEECH from 20 to 10, to increase sensitivity
00096  *              to very short utterances.
00097  * 
00098  * 16-Jan-98    Paul Placeway (pwp@cs.cmu.edu) at Carnegie Mellon University
00099  *              Changed to use dB instead of the weird power measure.
00100  *              Changed analysis window size, tuned default settings of most
00101  *              parameters to make the system less sensitive to noise, changed
00102  *              the histogram update frequency and decay to make the system
00103  *              adapt more rapidly to changes in the environment.
00104  *              Added cont_ad_set_params() and cont_ad_get_params().
00105  * 
00106  * 28-Jul-96    M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
00107  *              Added FRMPOW2SIGLVL, max_siglvl(), and cont_ad_t.siglvl.
00108  *              Changed min signal energy/frame to CONT_AD_SPF.
00109  * 
00110  * 27-Jun-96    M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
00111  *              Added the option for cont_ad_read to return -1 on EOF.
00112  * 
00113  * 21-Jun-96    M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
00114  *              Added cont_ad_set_thresh().
00115  *              Bugfix: n_other is recomputed after updating thresholds.
00116  * 
00117  * 20-Jun-96    M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
00118  *              Separated thresholds for speech and silence.
00119  *              Fixed bug in moving analysis window upon transition to speech state.
00120  * 
00121  * 17-Jun-96    M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
00122  *              Created, based loosely on Steve Reed's original implementation.
00123  */
00124 
00125 /*
00126  * This module is intended to be interposed as a filter between any raw A/D source and the
00127  * application to remove silence regions.  It is initialized with a raw A/D source function
00128  * (during the cont_ad_init call).  Filtered A/D data can be read by the application using
00129  * the cont_ad_read function.  This module assumes that the A/D source function supplies an
00130  * endless stream of data.  The application is responsible for setting up the A/D source,
00131  * turning recording on and off as it desires.  It is also responsible for invoking the
00132  * cont_ad_read function frequently enough to avoid buffer overruns and dropping A/D data.
00133  * This continuous listening module has an internal buffer of about 4 sec.
00134  * 
00135  * This module must be initialized and calibrated at first (cont_ad_init and cont_ad_calib
00136  * functions).  Raw samples are grouped into frames, the signal power in each frame is
00137  * computed and accumulated in a histogram.  The module is always in one of two states:
00138  * SILENCE or SPEECH.  Transitions between the two states are detected by looking for a
00139  * contiguous window of several frames that is predominantly of the other type.  The type
00140  * is determined by comparing frame power to either of two thresholds, thresh_sil and
00141  * thresh_speech, as appropriate for the current state.  These thresholds are set from the
00142  * first peak in the low-end of the power histogram, and are updated every few seconds.
00143  * Separate thresholds are used to provide some hysteresis.
00144  * 
00145  * The module maintains a linked list of speech (non-silence) segments not yet read by the
00146  * application.  The cont_ad_read function returns speech data, if any available, by
00147  * following this list.  It also updates an "absolute" timestamp at the end of the
00148  * cont_ad_read operation.  The timestamp indicates the total #samples of A/D data read
00149  * until this point, including data discarded as silence frames.  The application is
00150  * responsible for using this timestamp to make any policy decisions regarding utterance
00151  * boundaries or whatever.
00152  */
00153 
00154 #include <stdio.h>
00155 #include <stdlib.h>
00156 #include <string.h>
00157 #include <assert.h>
00158 #include <math.h>
00159 
00160 #ifdef HAVE_CONFIG_H
00161 #include <config.h>
00162 #endif
00163 
00164 #ifdef _MSC_VER
00165 #pragma warning (disable: 4305)
00166 #endif
00167 
00168 #include "sphinxbase/prim_type.h"
00169 #include "sphinxbase/ad.h"
00170 #include "sphinxbase/cont_ad.h"
00171 #include "sphinxbase/err.h"
00172 
00173 
00174 #ifndef _ABS
00175 #define _ABS(x) ((x) >= 0 ? (x) : -(x))
00176 #endif
00177 
00178 
/*
 * Various parameters, including defaults for many cont_ad_t member variables.
 * Values described as "frames" are counts of analysis frames; noise and delta
 * values are on the integer dB scale returned by cont_ad_frame_pow().
 */

#define CONT_AD_ADFRMSIZE       256     /* #Frames of internal A/D buffer maintained;
                                           frame indices wrap around at this value */

#define CONT_AD_POWHISTSIZE     98      /* #Powhist bins: ~ FRMPOW(65536^2*CONT_AD_SPF) */
/* Maximum level is 96.3 dB full-scale; 97 for safety, plus 1 for zero-based */

#define CONT_AD_CALIB_FRAMES    (CONT_AD_POWHISTSIZE * 2)
/* NOTE(review): presumably the #frames consumed during calibration -- confirm in cont_ad_calib */

#define CONT_AD_THRESH_UPDATE   100     /* Update thresholds approx every so many frames */
        /* PWP: update was 200 frames, or 3.2 seconds.  Now about every 1.6 sec. */

#define CONT_AD_ADAPT_RATE      0.2     /* Interpolation of new and old noiselevel:
                                           fraction of the distance moved towards the
                                           newly found histogram peak on each update */

#define CONT_AD_SPS             16000
/* NOTE(review): presumably the default sampling rate (samples/sec) -- confirm in cont_ad_init */

#define CONT_AD_DEFAULT_NOISE   30      /* Default background noise power level */
#define CONT_AD_DELTA_SIL       10      /* Initial default for cont_ad_t.delta_sil */
#define CONT_AD_DELTA_SPEECH    17      /* Initial default for cont_ad_t.delta_speech */
#define CONT_AD_MIN_NOISE       2       /* Expected minimum background noise level */
#define CONT_AD_MAX_NOISE       70      /* Maximum background noise level */

#define CONT_AD_HIST_INERTIA    3       /* Used in decaying the power histogram: each decay
                                           subtracts (count >> this value) from every bin */

#define CONT_AD_WINSIZE         21      /* Analysis window for state transitions */
                                /* rkm had 16 */

#define CONT_AD_SPEECH_ONSET    9       /* Min #speech frames in analysis window for
                                           SILENCE -> SPEECH state transition */
/*
 * SReed had 100 ms == 6.25 fr contiguous; rkm had 9 (out of 16+10) with a
 * lower threshold.
 */

#define CONT_AD_SIL_ONSET       18      /* Min #silence frames in analysis window for
                                           SPEECH -> SILENCE state transition
                                           MUST BE <= CONT_AD_WINSIZE */
/*
 * SReed had 400 ms == 25 fr contiguous; rkm had 14 out of 16
 */

#define CONT_AD_LEADER          5       /* On transition to SPEECH state, so many frames
                                           BEFORE window included in speech data (>0) */
                                /* SReed had 200 ms == 12.5 fr; rkm had 5 */

#define CONT_AD_TRAILER         10      /* On transition to SILENCE state, so many frames
                                           of silence included in speech data (>0).
                                           NOTE: Ensure (0 < TRAILER+LEADER <= WINSIZE) */
                                /* SReed had 100 ms == 6.25 fr; rkm had 10 */
00228 
00229 
00230 void
00231 cont_ad_powhist_dump(FILE * fp, cont_ad_t * r)
00232 {
00233     int32 i, j;
00234 
00235     fprintf(fp, "PowHist:\n");
00236     for (i = 0, j = 0; i < CONT_AD_POWHISTSIZE; i++) {
00237         if (r->pow_hist[i] > 0) {
00238             fprintf(fp, "\t%3d %6d\n", i, r->pow_hist[i]);
00239             j = i;
00240         }
00241     }
00242 
00243     fprintf(fp, "PH[%7.2f]:",
00244             (double) (r->tot_frm * r->spf) / (double) (r->sps));
00245     for (i = 0; i <= j; i++)
00246         fprintf(fp, " %2d", r->pow_hist[i]);
00247     fprintf(fp, "\n");
00248 
00249     fflush(fp);
00250 }
00251 
00252 
00253 /*
00254  * Compute frame power.  Interface deliberately kept low level to allow arbitrary
00255  * users to call this function with appropriate data.
00256  */
00257 int32
00258 cont_ad_frame_pow(int16 * buf, int32 * prev, int32 spf)
00259 {
00260     double sumsq, v;
00261     int32 i;
00262     int32 p;
00263 
00264     sumsq = 0.0;
00265     p = *prev;
00266     for (i = 0; i < spf; i++) {
00267         /* Note: pre-emphasis done to remove low-frequency noise. */
00268         v = (double) (buf[i] - p);
00269         sumsq += v * v;
00270         p = buf[i];
00271     }
00272     *prev = p;
00273 
00274     if (sumsq < spf)            /* Make sure FRMPOW(sumsq) >= 0 */
00275         sumsq = spf;
00276 
00277     /*
00278      * PWP: Units changed to dB
00279      *
00280      * Now the units of measurement of an input sample are volts (really!),
00281      * so the power in dB is p = 20*log10(samp).  Further, we want the RMS
00282      * (root-mean-squared) average power across the frame.
00283      *
00284      * "sumsq" is the sum of the sum of the squares, so we want
00285      *
00286      *   i = 20 * log10( sqrt ( sumsq / n_samps) )
00287      *
00288      * (Stephen Reed's code actually had 
00289      *    i = 20 * log10( sqrt (sumsq) / n_samps )
00290      *  but this only produced an additive error.)
00291      *
00292      * i = 20 * log10( sqrt ( sumsq / n_samps) )
00293      *   = 20 * log10( ( sumsq / n_samps) ^ 0.5 )
00294      *   = 20 * log10( ( sumsq / n_samps) ) * 0.5 )
00295      *   = 10 * log10( ( sumsq / n_samps) )
00296      *   = 10 * ( log10( sumsq) - log10(n_samps) )
00297      */
00298     i = (int32) ((10.0 * (log10(sumsq) - log10((double) spf))) + 0.5);
00299     if (i < 0)
00300         i = 0;                  /* trim lower bound again to be safe. */
00301     assert(i < 97);
00302 
00303     return (i);
00304 }
00305 
00306 
00307 /*
00308  * Classify frame (id=frm, starting at sample position s) as sil/nonsil.  Classification
00309  * done in isolation, independent of any other frame, based only on power histogram.
00310  */
00311 static void
00312 compute_frame_pow(cont_ad_t * r, int32 frm)
00313 {
00314     int32 i;
00315 
00316     i = cont_ad_frame_pow(r->adbuf + (frm * r->spf), &(r->prev_sample),
00317                           r->spf);
00318 
00319     r->frm_pow[frm] = (char) i;
00320     (r->pow_hist[i])++;
00321     r->thresh_update--;
00322 }
00323 
00324 
00325 /* PWP: $$$ check this */
00326 /*
00327  * PWP: in SReed's code, decay was done by zeroing the histogram,
00328  * i.e. no history.
00329  */
00330 static void
00331 decay_hist(cont_ad_t * r)
00332 {
00333     int32 i;
00334 
00335     for (i = 0; i < CONT_AD_POWHISTSIZE; i++)
00336         r->pow_hist[i] -= (r->pow_hist[i] >> CONT_AD_HIST_INERTIA);
00337 }
00338 
00339 
00340 /*
00341  * Find silence threshold from power histogram.
00342  */
00343 static int32
00344 find_thresh(cont_ad_t * r)
00345 {
00346     int32 i, j, max, th;
00347     int32 old_noise_level, old_thresh_sil, old_thresh_speech;
00348 
00349     if (!r->auto_thresh)
00350         return 0;
00351 
00352     /*
00353      * Find smallest non-zero histogram entry, but starting at some minimum power.
00354      * Power lower than CONT_AD_MIN_NOISE indicates bad A/D input (eg, mic off...).
00355      * Too high a minimum power is also bad.
00356      */
00357     for (i = r->min_noise;
00358          (i < CONT_AD_POWHISTSIZE) && (r->pow_hist[i] == 0); i++);
00359     if (i > r->max_noise)       /* Bad signal? */
00360         return -1;
00361 
00362     /* PWP: Hmmmmm.... SReed's code looks over the lower 20 dB */
00363     /* PWP: 1/14/98  Made to work like Stephen Reed's code */
00364 
00365     /* This method of detecting the noise level is VERY unsatisfactory */
00366     max = 0;
00367     for (j = i, th = i; (j < CONT_AD_POWHISTSIZE) && (j < i + 20); j++) {       /* PWP: was i+6, which was 9 dB */
00368         if (max < r->pow_hist[j]) {
00369             max = r->pow_hist[j];
00370             th = j;
00371         }
00372     }
00373 
00374     /* "Don't change the threshold too fast" */
00375     old_noise_level = r->noise_level;
00376     old_thresh_sil = r->thresh_sil;
00377     old_thresh_speech = r->thresh_speech;
00378     /* r->noise_level = (int32) (th * r->adapt_rate + r->noise_level * (1.0 - r->adapt_rate)); */
00379     r->noise_level =
00380         (int32) (r->noise_level +
00381                  r->adapt_rate * (th - r->noise_level) + 0.5);
00382 
00383     /* update thresholds */
00384     r->thresh_sil = r->noise_level + r->delta_sil;
00385     r->thresh_speech = r->noise_level + r->delta_speech;
00386 
00387     if (r->logfp) {
00388         fprintf(r->logfp,
00389                 "%7.2fs %8df: NoisePeak: %d, Noiselevel: %d -> %d, Th-Sil: %d -> %d, Th-Sp: %d -> %d\n",
00390                 (double) (r->tot_frm * r->spf) / (double) (r->sps),
00391                 r->tot_frm, th, old_noise_level, r->noise_level,
00392                 old_thresh_sil, r->thresh_sil, old_thresh_speech,
00393                 r->thresh_speech);
00394 
00395         cont_ad_powhist_dump(r->logfp, r);
00396 
00397         fflush(r->logfp);
00398     }
00399 
00400     /*
00401      * PWP: in SReed's original, he cleared the histogram here.
00402      * I can't fathom why.
00403      */
00404 
00405     return 0;
00406 }
00407 
00408 
00409 /*
00410  * Silence to speech transition
00411  */
00412 static void
00413 sil2speech_transition(cont_ad_t *r, int frm)
00414 {
00415     spseg_t *seg;
00416 
00417     /* Speech detected; create speech segment description */
00418     seg = malloc(sizeof(*seg));
00419 
00420     seg->startfrm = r->win_startfrm - r->leader;
00421     if (seg->startfrm < 0)
00422         seg->startfrm += CONT_AD_ADFRMSIZE;
00423     seg->nfrm = r->leader + r->winsize;
00424     seg->next = NULL;
00425 
00426     if (!r->spseg_head)
00427         r->spseg_head = seg;
00428     else
00429         r->spseg_tail->next = seg;
00430     r->spseg_tail = seg;
00431 
00432     r->tail_state = CONT_AD_STATE_SPEECH;
00433 
00434     if (r->logfp) {
00435         int32 n;
00436 
00437         /* Where (in absolute time) this speech segment starts */
00438         n = frm - seg->startfrm;
00439         if (n < 0)
00440             n += CONT_AD_ADFRMSIZE;
00441         n = r->tot_frm - n - 1;
00442 
00443         fprintf(r->logfp,
00444                 "%7.2fs %8d[%3d]f: Sil -> Sp detect; seg start: %7.2fs %8d\n",
00445                 (double) (r->tot_frm *
00446                           r->spf) /
00447                 (double) (r->sps),
00448                 r->tot_frm, frm,
00449                 (double) (n * r->spf) / (double) (r->sps), n);
00450     }
00451 
00452     /* Now in SPEECH state; want to look for silence from end of this window */
00453     r->win_validfrm = 1;
00454     r->win_startfrm = frm;
00455 
00456     /* Count #sil frames remaining in reduced window (of 1 frame) */
00457     r->n_other = (r->frm_pow[frm] <= r->thresh_sil) ? 1 : 0;
00458 }
00459 
00460 /*
00461  * Speech to silence transition
00462  */
00463 static void
00464 speech2sil_transition(cont_ad_t *r, int frm)
00465 {
00466     int f;
00467 
00468     /* End of speech detected; speech->sil transition */
00469     r->spseg_tail->nfrm += r->trailer;
00470 
00471     r->tail_state = CONT_AD_STATE_SIL;
00472 
00473     if (r->logfp) {
00474         int32 n;
00475 
00476         /* Where (in absolute time) this speech segment ends */
00477         n = r->spseg_tail->startfrm + r->spseg_tail->nfrm - 1;
00478         if (n >= CONT_AD_ADFRMSIZE)
00479             n -= CONT_AD_ADFRMSIZE;
00480         n = frm - n;
00481         if (n < 0)
00482             n += CONT_AD_ADFRMSIZE;
00483         n = r->tot_frm - n;
00484 
00485         fprintf(r->logfp,
00486                 "%7.2fs %8d[%3d]f: Sp -> Sil detect; seg end: %7.2fs %8d\n",
00487                 (double) (r->tot_frm * r->spf) /
00488                 (double) (r->sps), r->tot_frm, frm,
00489                 (double) (n * r->spf) / (double) (r->sps), n);
00490     }
00491 
00492     /* Now in SILENCE state; start looking for speech trailer+leader frames later */
00493     r->win_validfrm -= (r->trailer + r->leader - 1);
00494     r->win_startfrm += (r->trailer + r->leader - 1);
00495     if (r->win_startfrm >= CONT_AD_ADFRMSIZE)
00496         r->win_startfrm -= CONT_AD_ADFRMSIZE;
00497 
00498     /* Count #speech frames remaining in reduced window */
00499     r->n_other = 0;
00500     for (f = r->win_startfrm;;) {
00501         if (r->frm_pow[f] >= r->thresh_speech)
00502             r->n_other++;
00503 
00504         if (f == frm)
00505             break;
00506 
00507         f++;
00508         if (f >= CONT_AD_ADFRMSIZE)
00509             f = 0;
00510     }
00511 }
00512 
00513 
00514 /*
00515  * Main silence/speech region detection routine.  If currently in
00516  * SILENCE state, switch to SPEECH state if a window (r->winsize)
00517  * of frames is mostly non-silence.  If in SPEECH state, switch to
00518  * SILENCE state if the window is mostly silence.
00519  */
00520 static void
00521 boundary_detect(cont_ad_t * r, int32 frm)
00522 {
00523     assert(r->n_other >= 0);
00524 
00525     r->win_validfrm++;
00526     if (r->tail_state == CONT_AD_STATE_SIL) {
00527         if (r->frm_pow[frm] >= r->thresh_speech)
00528             r->n_other++;
00529     }
00530     else {
00531         if (r->frm_pow[frm] <= r->thresh_sil)
00532             r->n_other++;
00533     }
00534 
00535     if (r->logfp) {
00536         fprintf(r->logfp,
00537                 "%7.2fs %8d[%3d]f: P: %2d, N: %2d, T+: %2d, T-: %2d, #O: %2d, %s\n",
00538                 (double) (r->tot_frm * r->spf) / (double) (r->sps),
00539                 r->tot_frm, frm, r->frm_pow[frm], r->noise_level,
00540                 r->thresh_speech, r->thresh_sil, r->n_other,
00541                 (r->tail_state == CONT_AD_STATE_SIL) ? "--" : "Sp");
00542     }
00543 
00544     if (r->win_validfrm < r->winsize)   /* Not reached full analysis window size */
00545         return;
00546     assert(r->win_validfrm == r->winsize);
00547 
00548     if (r->tail_state == CONT_AD_STATE_SIL) {   /* Currently in SILENCE state */
00549         if (r->n_frm >= r->winsize + r->leader
00550             && r->n_other >= r->speech_onset) {
00551             sil2speech_transition(r, frm);
00552         }
00553     }
00554     else {
00555         if (r->n_other >= r->sil_onset) {
00556             speech2sil_transition(r, frm);
00557         }
00558         else {
00559             /* In speech state, and staying there; add this frame to segment */
00560             r->spseg_tail->nfrm++;
00561         }
00562     }
00563 
00564     /*
00565      * Get rid of oldest frame in analysis window.  Not quite correct;
00566      * thresholds could have changed over the window; should preserve
00567      * the original speech/silence label for the frame and undo it.  Later..
00568      */
00569     if (r->tail_state == CONT_AD_STATE_SIL) {
00570         if (r->frm_pow[r->win_startfrm] >= r->thresh_speech) {
00571             if (r->n_other > 0)
00572                 r->n_other--;
00573         }
00574     }
00575     else {
00576         if (r->frm_pow[r->win_startfrm] <= r->thresh_sil) {
00577             if (r->n_other > 0)
00578                 r->n_other--;
00579         }
00580     }
00581     r->win_validfrm--;
00582     r->win_startfrm++;
00583     if (r->win_startfrm >= CONT_AD_ADFRMSIZE)
00584         r->win_startfrm = 0;
00585 
00586     if (r->logfp)
00587         fflush(r->logfp);
00588 }
00589 
00590 
00591 static int32
00592 max_siglvl(cont_ad_t * r, int32 startfrm, int32 nfrm)
00593 {
00594     int32 siglvl, i, f;
00595 
00596     siglvl = 0;
00597     if (nfrm > 0) {
00598         for (i = 0, f = startfrm; i < nfrm; i++, f++) {
00599             if (f >= CONT_AD_ADFRMSIZE)
00600                 f -= CONT_AD_ADFRMSIZE;
00601             if (r->frm_pow[f] > siglvl)
00602                 siglvl = r->frm_pow[f];
00603         }
00604     }
00605     return siglvl;
00606 }
00607 
00608 
#if 0
/*
 * RKM(2005/01/31): Where did this come from?  If needed, it should be called
 * cont_ad_get_audio_data.
 *
 * Empty stub; the enclosing "#if 0" keeps it out of the build.
 */
void
get_audio_data(cont_ad_t * r, int16 * buf, int32 max)
{
}
#endif
00619 
00620 
00621 static void
00622 cont_ad_read_log(cont_ad_t * r, int32 retval)
00623 {
00624     spseg_t *seg;
00625 
00626     fprintf(r->logfp, "return from cont_ad_read() -> %d:\n", retval);
00627     fprintf(r->logfp, "\tstate: %d\n", r->state);
00628     fprintf(r->logfp, "\tread_ts: %d (%.2fs)\n",
00629             r->read_ts, (float32) r->read_ts / (float32) r->sps);
00630     fprintf(r->logfp, "\tseglen: %d (%.2fs)\n",
00631             r->seglen, (float32) r->seglen / (float32) r->sps);
00632     fprintf(r->logfp, "\tsiglvl: %d\n", r->siglvl);
00633     fprintf(r->logfp, "\theadfrm: %d\n", r->headfrm);
00634     fprintf(r->logfp, "\tn_frm: %d\n", r->n_frm);
00635     fprintf(r->logfp, "\tn_sample: %d\n", r->n_sample);
00636     fprintf(r->logfp, "\twin_startfrm: %d\n", r->win_startfrm);
00637     fprintf(r->logfp, "\twin_validfrm: %d\n", r->win_validfrm);
00638     fprintf(r->logfp, "\tnoise_level: %d\n", r->noise_level);
00639     fprintf(r->logfp, "\tthresh_sil: %d\n", r->thresh_sil);
00640     fprintf(r->logfp, "\tthresh_speech: %d\n", r->thresh_speech);
00641     fprintf(r->logfp, "\tn_other: %d\n", r->n_other);
00642     fprintf(r->logfp, "\ttail_state: %d\n", r->tail_state);
00643     fprintf(r->logfp, "\ttot_frm: %d\n", r->tot_frm);
00644 
00645     fprintf(r->logfp, "\tspseg:");
00646     for (seg = r->spseg_head; seg; seg = seg->next)
00647         fprintf(r->logfp, " %d[%d]", seg->startfrm, seg->nfrm);
00648     fprintf(r->logfp, "\n");
00649 
00650     fflush(r->logfp);
00651 }
00652 
00653 
00654 /*
00655  * Copy data from r->adbuf[sf], for nf frames, into buf.
00656  * All length checks must have been completed before this call; hence, this
00657  * function will copy exactly the specified number of frames.
00658  * 
00659  * Return value: Index of frame just after the segment copied, possibly wrapped
00660  * around to 0.
00661  */
00662 static int32
00663 buf_copy(cont_ad_t * r, int32 sf, int32 nf, int16 * buf)
00664 {
00665     int32 f, l;
00666 
00667     assert((sf >= 0) && (sf < CONT_AD_ADFRMSIZE));
00668     assert(nf >= 0);
00669 
00670     if (sf + nf > CONT_AD_ADFRMSIZE) {
00671         /* Amount to be copied wraps around adbuf; copy in two stages */
00672         f = CONT_AD_ADFRMSIZE - sf;
00673         l = (f * r->spf);
00674         memcpy(buf, r->adbuf + (sf * r->spf), l * sizeof(int16));
00675 
00676         if (r->logfp) {
00677             fprintf(r->logfp,
00678                     "return %d speech frames [%d..%d]; %d samples\n",
00679                     f, sf, sf + f - 1, l);
00680         }
00681 
00682         buf += l;
00683         sf = 0;
00684         nf -= f;
00685     }
00686 
00687     if (nf > 0) {
00688         l = (nf * r->spf);
00689         memcpy(buf, r->adbuf + (sf * r->spf), l * sizeof(int16));
00690 
00691         if (r->logfp) {
00692             fprintf(r->logfp,
00693                     "return %d speech frames [%d..%d]; %d samples\n",
00694                     nf, sf, sf + nf - 1, l);
00695         }
00696     }
00697 
00698     if ((sf + nf) >= CONT_AD_ADFRMSIZE) {
00699         assert((sf + nf) == CONT_AD_ADFRMSIZE);
00700         return 0;
00701     }
00702     else
00703         return (sf + nf);
00704 }
00705 
00706 int32
00707 cont_ad_buffer_space(cont_ad_t *r)
00708 {
00709     return r->adbufsize - r->n_sample;
00710 }
00711 
00712 /*
00713  * Read as much data as possible from r->adfunc into r->adbuf.
00714  */
00715 static int32
00716 cont_ad_read_internal(cont_ad_t *r, int16 *buf, int32 max)
00717 {
00718     int32 head, tail, len, l;
00719 
00720     /*
00721      * First read as much of raw A/D as possible and available.  adbuf is not
00722      * really a circular buffer, so may have to read in two steps for wrapping
00723      * around.
00724      */
00725     head = r->headfrm * r->spf;
00726     tail = head + r->n_sample;
00727     len = r->n_sample - (r->n_frm * r->spf);    /* #partial frame samples at the tail */
00728     assert((len >= 0) && (len < r->spf));
00729 
00730     if ((tail < r->adbufsize) && (!r->eof)) {
00731         if (r->adfunc) {
00732             if ((l =
00733                  (*(r->adfunc)) (r->ad, r->adbuf + tail,
00734                                  r->adbufsize - tail)) < 0) {
00735                 r->eof = 1;
00736                 l = 0;
00737             }
00738         }
00739         else {
00740             l = r->adbufsize - tail;
00741             if (l > max) {
00742                 l = max;
00743                 max = 0;
00744             }
00745             else {
00746                 max -= l;
00747             }
00748             memcpy(r->adbuf + tail, buf, l * sizeof(int16));
00749             buf += l;
00750         }
00751         if ((l > 0) && r->rawfp) {
00752             fwrite(r->adbuf + tail, sizeof(int16), l, r->rawfp);
00753             fflush(r->rawfp);
00754         }
00755 
00756         tail += l;
00757         len += l;
00758         r->n_sample += l;
00759     }
00760     if ((tail >= r->adbufsize) && (!r->eof)) {
00761         tail -= r->adbufsize;
00762         if (tail < head) {
00763             if (r->adfunc) {
00764                 if ((l =
00765                      (*(r->adfunc)) (r->ad,
00766                                      r->adbuf + tail, head - tail)) < 0) {
00767                     r->eof = 1;
00768                     l = 0;
00769                 }
00770             }
00771             else {
00772                 l = head - tail;
00773                 if (l > max)
00774                     l = max;
00775                 memcpy(r->adbuf + tail, buf, l * sizeof(int16));
00776             }
00777             if ((l > 0) && r->rawfp) {
00778                 fwrite(r->adbuf + tail, sizeof(int16), l, r->rawfp);
00779                 fflush(r->rawfp);
00780             }
00781 
00782             tail += l;
00783             len += l;
00784             r->n_sample += l;
00785         }
00786     }
00787 
00788     return len;
00789 }
00790 
00791 /*
00792  * Classify incoming frames as silence or speech.
00793  */
00794 int32
00795 cont_ad_classify(cont_ad_t *r, int32 len)
00796 {
00797     int32 tailfrm;
00798 
00799     tailfrm = (r->headfrm + r->n_frm);  /* Next free frame slot to be filled */
00800     if (tailfrm >= CONT_AD_ADFRMSIZE)
00801         tailfrm -= CONT_AD_ADFRMSIZE;
00802 
00803     for (; len >= r->spf; len -= r->spf) {
00804         compute_frame_pow(r, tailfrm);
00805         r->n_frm++;
00806         r->tot_frm++;
00807 
00808         /*
00809          * Find speech/sil state change, if any.  Also, if staying in speech state
00810          * add this frame to current speech segment.
00811          */
00812         boundary_detect(r, tailfrm);
00813 
00814         if (++tailfrm >= CONT_AD_ADFRMSIZE)
00815             tailfrm = 0;
00816 
00817         /* Update thresholds if time to do so */
00818         if (r->thresh_update <= 0) {
00819             int32 i, f;
00820             find_thresh(r);
00821             decay_hist(r);
00822             r->thresh_update = CONT_AD_THRESH_UPDATE;
00823 
00824 #if 1
00825             /*
00826              * Since threshold has been updated, recompute r->n_other.
00827              * (RKM: Is this really necessary?  Comment out??)
00828              */
00829             r->n_other = 0;
00830             if (r->tail_state == CONT_AD_STATE_SIL) {
00831                 for (i = r->win_validfrm, f = r->win_startfrm; i > 0; --i) {
00832                     if (r->frm_pow[f] >= r->thresh_speech)
00833                         r->n_other++;
00834 
00835                     f++;
00836                     if (f >= CONT_AD_ADFRMSIZE)
00837                         f = 0;
00838                 }
00839             }
00840             else {
00841                 for (i = r->win_validfrm, f = r->win_startfrm; i > 0; --i) {
00842                     if (r->frm_pow[f] <= r->thresh_sil)
00843                         r->n_other++;
00844 
00845                     f++;
00846                     if (f >= CONT_AD_ADFRMSIZE)
00847                         f = 0;
00848                 }
00849             }
00850 #endif
00851         }
00852     }
00853 
00854     return r->tail_state;
00855 }
00856 
00857 /*
00858  * Main function called by the application to filter out silence regions.
00859  * Maintains a linked list of speech segments pointing into r->adbuf and feeds
00860  * data to application from them.
00861  */
00862 int32
00863 cont_ad_read(cont_ad_t * r, int16 * buf, int32 max)
00864 {
00865     int32 flen, len, retval, newstate;
00866     spseg_t *seg;
00867 
00868     if ((r == NULL) || (buf == NULL))
00869         return -1;
00870 
00871     if (max < r->spf) {
00872         E_ERROR
00873             ("cont_ad_read requires buffer of at least %d samples\n",
00874              r->spf);
00875         return -1;
00876     }
00877 
00878     if (r->logfp) {
00879         fprintf(r->logfp, "cont_ad_read(,, %d)\n", max);
00880         fflush(r->logfp);
00881     }
00882 
00883     /* Read data from adfunc or from buf. */
00884     len = cont_ad_read_internal(r, buf, max);
00885 
00886     /* Compute frame power for unprocessed+new data and find speech/silence boundaries */
00887     cont_ad_classify(r, len);
00888 
00889     /*
00890      * If eof on input data source, cleanup the final segment.
00891      */
00892     if (r->eof) {
00893         if (r->tail_state == CONT_AD_STATE_SPEECH) {
00894             /*
00895              * Still inside a speech segment when input data got over.  Absort any
00896              * remaining frames into the final speech segment.
00897              */
00898             assert(r->spseg_tail != NULL);
00899 
00900             /* Absorb frames still in analysis window into final speech seg */
00901             assert((r->win_validfrm >= 0)
00902                    && (r->win_validfrm < r->winsize));
00903             r->spseg_tail->nfrm += r->win_validfrm;
00904 
00905             r->tail_state = CONT_AD_STATE_SIL;
00906         }
00907 
00908         r->win_startfrm += r->win_validfrm;
00909         if (r->win_startfrm >= CONT_AD_ADFRMSIZE)
00910             r->win_startfrm -= CONT_AD_ADFRMSIZE;
00911         r->win_validfrm = 0;
00912         r->n_other = 0;
00913     }
00914 
00915     /*
00916      * At last ready to copy speech data, if any, into caller's buffer.  Raw
00917      * speech data is segmented into alternating speech and silence segments.
00918      * But any single call to cont_ad_read will never cross a speech/silence
00919      * boundary.
00920      */
00921     seg = r->spseg_head;        /* first speech segment available, if any */
00922 
00923     if ((seg == NULL) || (r->headfrm != seg->startfrm)) {
00924         /*
00925          * Either no speech data available, or inside a silence segment.  Find
00926          * length of silence segment.
00927          */
00928         if (seg == NULL) {
00929             assert(r->tail_state == CONT_AD_STATE_SIL);
00930 
00931             flen =
00932                 (r->eof) ? r->n_frm : r->n_frm - (r->winsize +
00933                                                   r->leader - 1);
00934             if (flen < 0)
00935                 flen = 0;
00936         }
00937         else {
00938             flen = seg->startfrm - r->headfrm;
00939             if (flen < 0)
00940                 flen += CONT_AD_ADFRMSIZE;
00941         }
00942 
00943         if (r->rawmode) {
00944             /* Restrict silence segment to user buffer size, integral #frames */
00945             int32 f = max / r->spf;
00946             if (flen > f)
00947                 flen = f;
00948         }
00949 
00950         newstate = CONT_AD_STATE_SIL;
00951     }
00952     else {
00953         flen = max / r->spf;    /* truncate read-size to integral #frames */
00954         if (flen > seg->nfrm)
00955             flen = seg->nfrm;   /* truncate further to this segment size */
00956 
00957         newstate = CONT_AD_STATE_SPEECH;
00958     }
00959 
00960     len = flen * r->spf;        /* #samples being consumed */
00961 
00962     r->siglvl = max_siglvl(r, r->headfrm, flen);
00963 
00964     if ((newstate == CONT_AD_STATE_SIL) && (!r->rawmode)) {
00965         /* Skip silence data */
00966         r->headfrm += flen;
00967         if (r->headfrm >= CONT_AD_ADFRMSIZE)
00968             r->headfrm -= CONT_AD_ADFRMSIZE;
00969 
00970         retval = 0;             /* #samples being copied/returned */
00971     }
00972     else {
00973         /* Copy speech/silence(in rawmode) data */
00974         r->headfrm = buf_copy(r, r->headfrm, flen, buf);
00975 
00976         retval = len;           /* #samples being copied/returned */
00977     }
00978 
00979     r->n_frm -= flen;
00980     r->n_sample -= len;
00981     assert((r->n_frm >= 0) && (r->n_sample >= 0));
00982     assert(r->win_validfrm <= r->n_frm);
00983 
00984     if (r->state == newstate)
00985         r->seglen += len;
00986     else
00987         r->seglen = len;
00988     r->state = newstate;
00989 
00990     if (newstate == CONT_AD_STATE_SPEECH) {
00991         seg->startfrm = r->headfrm;
00992         assert(seg->startfrm >= 0);
00993         seg->nfrm -= flen;
00994 
00995         /* Free seg if empty and not recording into it */
00996         if ((seg->nfrm == 0)
00997             && (seg->next || (r->tail_state == CONT_AD_STATE_SIL))) {
00998             r->spseg_head = seg->next;
00999             if (seg->next == NULL)
01000                 r->spseg_tail = NULL;
01001             free(seg);
01002         }
01003     }
01004 
01005     /* Update timestamp.  Total raw A/D read - those remaining to be consumed */
01006     r->read_ts = (r->tot_frm - r->n_frm) * r->spf;
01007 
01008     if (retval == 0)
01009         retval = (r->eof && (r->spseg_head == NULL)) ? -1 : 0;
01010 
01011     if (r->logfp)
01012         cont_ad_read_log(r, retval);
01013 
01014     return retval;
01015 }
01016 
01017 
01018 /*
01019  * Calibrate input channel for silence threshold.
01020  */
01021 int32
01022 cont_ad_calib(cont_ad_t * r)
01023 {
01024     int32 i, s, k, len, tailfrm;
01025 
01026     if (r == NULL)
01027         return -1;
01028 
01029     /* clear histogram */
01030     for (i = 0; i < CONT_AD_POWHISTSIZE; i++)
01031         r->pow_hist[i] = 0;
01032     tailfrm = r->headfrm + r->n_frm;
01033     if (tailfrm >= CONT_AD_ADFRMSIZE)
01034         tailfrm -= CONT_AD_ADFRMSIZE;
01035     s = (tailfrm * r->spf);
01036 
01037     for (r->n_calib_frame = 0;
01038          r->n_calib_frame < CONT_AD_CALIB_FRAMES;
01039          ++r->n_calib_frame) {
01040         len = r->spf;
01041         while (len > 0) {
01042             /*Trouble */
01043             if ((k = (*(r->adfunc)) (r->ad, r->adbuf + s, len)) < 0)
01044                 return -1;
01045             len -= k;
01046             s += k;
01047         }
01048         s -= r->spf;
01049 
01050         compute_frame_pow(r, tailfrm);
01051     }
01052 
01053     r->thresh_update = CONT_AD_THRESH_UPDATE;
01054     return find_thresh(r);
01055 }
01056 
01057 int32
01058 cont_ad_calib_size(cont_ad_t *r)
01059 {
01060     return r->spf * CONT_AD_CALIB_FRAMES;
01061 }
01062 
01063 int32
01064 cont_ad_calib_loop(cont_ad_t * r, int16 * buf, int32 max)
01065 {
01066     int32 i, s, len, tailfrm;
01067 
01068     if (r->n_calib_frame == CONT_AD_CALIB_FRAMES) {
01069         /* If calibration previously succeeded, then this is a
01070          * recalibration, so start again. */
01071         r->n_calib_frame = 0;
01072         /* clear histogram */
01073         for (i = 0; i < CONT_AD_POWHISTSIZE; i++)
01074             r->pow_hist[i] = 0;
01075     }
01076 
01077     tailfrm = r->headfrm + r->n_frm;
01078     if (tailfrm >= CONT_AD_ADFRMSIZE)
01079         tailfrm -= CONT_AD_ADFRMSIZE;
01080     s = (tailfrm * r->spf);
01081 
01082     len = r->spf;
01083     for (; r->n_calib_frame < CONT_AD_CALIB_FRAMES;
01084          ++r->n_calib_frame) {
01085         if (max < len)
01086             return 1;
01087         memcpy(r->adbuf + s, buf, len * sizeof(int16));
01088         max -= len;
01089         buf += len;
01090         compute_frame_pow(r, tailfrm);
01091     }
01092 
01093     r->thresh_update = CONT_AD_THRESH_UPDATE;
01094     return find_thresh(r);
01095 }
01096 
01097 
01098 /* PWP 1/14/98 -- modified for compatibility with old code */
01099 int32
01100 cont_ad_set_thresh(cont_ad_t * r, int32 sil, int32 speech)
01101 {
01102     if (r == NULL)
01103         return -1;
01104 
01105     if ((sil < 0) || (speech < 0)) {
01106         fprintf(stderr,
01107                 "cont_ad_set_thresh: invalid threshold arguments: %d, %d\n",
01108                 sil, speech);
01109         return -1;
01110     }
01111     r->delta_sil = (3 * sil) / 2;
01112     r->delta_speech = (3 * speech) / 2;
01113 
01114     return 0;
01115 }
01116 
01117 
01118 /*
01119  * PWP 1/14/98 -- set the changable params.
01120  *
01121  *   delta_sil, delta_speech, min_noise, and max_noise are in dB,
01122  *   winsize, speech_onset, sil_onset, leader and trailer are in frames of
01123  *   16 ms length (256 samples @ 16kHz sampling).
01124  */
01125 int32
01126 cont_ad_set_params(cont_ad_t * r, int32 delta_sil,
01127                    int32 delta_speech, int32 min_noise,
01128                    int32 max_noise, int32 winsize,
01129                    int32 speech_onset, int32 sil_onset, int32 leader,
01130                    int32 trailer, float32 adapt_rate)
01131 {
01132     if ((delta_sil < 0) || (delta_speech < 0) || (min_noise < 0)
01133         || (max_noise < 0)) {
01134         E_ERROR("threshold arguments: "
01135                 "%d, %d, %d, %d must all be >=0\n", delta_sil,
01136                 delta_speech, min_noise, max_noise);
01137         return -1;
01138     }
01139 
01140     if ((speech_onset > winsize) || (speech_onset <= 0)
01141         || (winsize <= 0)) {
01142         E_ERROR
01143             ("speech_onset, %d, must be <= winsize, %d, and both >0\n",
01144              speech_onset, winsize);
01145         return -1;
01146     }
01147 
01148     if ((sil_onset > winsize) || (sil_onset <= 0) || (winsize <= 0)) {
01149         E_ERROR
01150             ("sil_onset, %d, must be <= winsize, %d, and both >0\n",
01151              sil_onset, winsize);
01152         return -1;
01153     }
01154 
01155     if (((leader + trailer) > winsize) || (leader <= 0)
01156         || (trailer <= 0)) {
01157         E_ERROR
01158             ("leader, %d, plus trailer, %d, must be <= winsize, %d, and both >0\n",
01159              leader, trailer, winsize);
01160         return -1;
01161     }
01162 
01163     if ((adapt_rate < 0.0) || (adapt_rate > 1.0)) {
01164         E_ERROR("adapt_rate, %e; must be in range 0..1\n", adapt_rate);
01165         return -1;
01166     }
01167 
01168     if (r == NULL)
01169         return -1;
01170 
01171     r->delta_sil = delta_sil;
01172     r->delta_speech = delta_speech;
01173     r->min_noise = min_noise;
01174     r->max_noise = max_noise;
01175 
01176     r->winsize = winsize;
01177     r->speech_onset = speech_onset;
01178     r->sil_onset = sil_onset;
01179     r->leader = leader;
01180     r->trailer = trailer;
01181 
01182     r->adapt_rate = adapt_rate;
01183 
01184     if (r->win_validfrm >= r->winsize)
01185         r->win_validfrm = r->winsize - 1;
01186 
01187     return 0;
01188 }
01189 
01190 
01191 /*
01192  * PWP 1/14/98 -- get the changable params.
01193  *
01194  *   delta_sil, delta_speech, min_noise, and max_noise are in dB,
01195  *   winsize, speech_onset, sil_onset, leader and trailer are in frames of
01196  *   16 ms length (256 samples @ 16kHz sampling).
01197  */
01198 int32
01199 cont_ad_get_params(cont_ad_t * r, int32 * delta_sil,
01200                    int32 * delta_speech, int32 * min_noise,
01201                    int32 * max_noise, int32 * winsize,
01202                    int32 * speech_onset, int32 * sil_onset,
01203                    int32 * leader, int32 * trailer, float32 * adapt_rate)
01204 {
01205     if (!delta_sil || !delta_speech || !min_noise || !max_noise
01206         || !winsize || !speech_onset || !sil_onset || !leader
01207         || !trailer || !adapt_rate) {
01208         fprintf(stderr, "cont_ad_get_params: some param slots are NULL\n");
01209         return (-1);
01210     }
01211 
01212     if (r == NULL)
01213         return -1;
01214 
01215     *delta_sil = r->delta_sil;
01216     *delta_speech = r->delta_speech;
01217     *min_noise = r->min_noise;
01218     *max_noise = r->max_noise;
01219 
01220     *winsize = r->winsize;
01221     *speech_onset = r->speech_onset;
01222     *sil_onset = r->sil_onset;
01223     *leader = r->leader;
01224     *trailer = r->trailer;
01225 
01226     *adapt_rate = r->adapt_rate;
01227 
01228     return 0;
01229 }
01230 
01231 
01232 /*
01233  * Reset, discarded any accumulated speech.
01234  */
01235 int32
01236 cont_ad_reset(cont_ad_t * r)
01237 {
01238     spseg_t *seg;
01239 
01240     if (r == NULL)
01241         return -1;
01242 
01243     while (r->spseg_head) {
01244         seg = r->spseg_head;
01245         r->spseg_head = seg->next;
01246         free(seg);
01247     }
01248     r->spseg_tail = NULL;
01249 
01250     r->headfrm = 0;
01251     r->n_frm = 0;
01252     r->n_sample = 0;
01253     r->win_startfrm = 0;
01254     r->win_validfrm = 0;
01255     r->n_other = 0;
01256 
01257     r->tail_state = CONT_AD_STATE_SIL;
01258 
01259     return 0;
01260 }
01261 
01262 
01263 int32
01264 cont_ad_close(cont_ad_t * cont)
01265 {
01266     if (cont == NULL)
01267         return -1;
01268 
01269     cont_ad_reset(cont);        /* Frees any remaining speech segments */
01270 
01271     free(cont->adbuf);
01272     free(cont->pow_hist);
01273     free(cont->frm_pow);
01274     free(cont);
01275 
01276     return 0;
01277 }
01278 
01279 
01280 int32
01281 cont_ad_detach(cont_ad_t * c)
01282 {
01283     if (c == NULL)
01284         return -1;
01285 
01286     c->ad = NULL;
01287     c->adfunc = NULL;
01288     return 0;
01289 }
01290 
01291 
01292 int32
01293 cont_ad_attach(cont_ad_t * c, ad_rec_t * a,
01294                int32(*func) (ad_rec_t *, int16 *, int32))
01295 {
01296     if (c == NULL)
01297         return -1;
01298 
01299     c->ad = a;
01300     c->adfunc = func;
01301     c->eof = 0;
01302 
01303     return 0;
01304 }
01305 
01306 
01307 int32
01308 cont_set_thresh(cont_ad_t * r, int32 silence, int32 speech)
01309 {
01310     int32 i, f;
01311 
01312     r->thresh_speech = speech;
01313     r->thresh_sil = silence;
01314 
01315     /* Since threshold has been updated, recompute r->n_other */
01316     r->n_other = 0;
01317     if (r->tail_state == CONT_AD_STATE_SIL) {
01318         for (i = r->win_validfrm, f = r->win_startfrm; i > 0; --i) {
01319             if (r->frm_pow[f] >= r->thresh_speech)
01320                 r->n_other++;
01321 
01322             f++;
01323             if (f >= CONT_AD_ADFRMSIZE)
01324                 f = 0;
01325         }
01326     }
01327     else if (r->tail_state == CONT_AD_STATE_SPEECH) {
01328         for (i = r->win_validfrm, f = r->win_startfrm; i > 0; --i) {
01329             if (r->frm_pow[f] <= r->thresh_sil)
01330                 r->n_other++;
01331 
01332             f++;
01333             if (f >= CONT_AD_ADFRMSIZE)
01334                 f = 0;
01335         }
01336     }
01337 
01338     return 0;
01339 }
01340 
01341 
01342 /*
01343  * Set the file pointer for dumping the raw input audio stream.
01344  */
01345 int32
01346 cont_ad_set_rawfp(cont_ad_t * r, FILE * fp)
01347 {
01348     if (r == NULL)
01349         return -1;
01350 
01351     r->rawfp = fp;
01352     return 0;
01353 }
01354 
01355 
01356 /*
01357  * Set the file pointer for logging cont_ad progress.
01358  */
01359 int32
01360 cont_ad_set_logfp(cont_ad_t * r, FILE * fp)
01361 {
01362     if (r == NULL)
01363         return -1;
01364 
01365     r->logfp = fp;
01366     return 0;
01367 }
01368 
01369 
01370 /*
01371  * One-time initialization.
01372  */
01373 cont_ad_t *
01374 cont_ad_init(ad_rec_t * a, int32(*func) (ad_rec_t *, int16 *, int32))
01375 {
01376     cont_ad_t *r;
01377 
01378     if ((r = malloc(sizeof(*r))) == NULL) {
01379                 E_ERROR_SYSTEM("allocation of cont_ad_t failed");
01380         return NULL;
01381     }
01382 
01383     r->ad = a;
01384     r->adfunc = func;
01385     r->eof = 0;
01386     r->rawmode = 0;
01387 
01388     if (a != NULL)
01389         r->sps = a->sps;
01390     else
01391         r->sps = CONT_AD_SPS;
01392 
01393     /* Set samples/frame such that when sps=16000, spf=256 */
01394     r->spf = (r->sps * 256) / CONT_AD_SPS;
01395     r->adbufsize = CONT_AD_ADFRMSIZE * r->spf;
01396 
01397     if ((r->adbuf = malloc(r->adbufsize * sizeof(*r->adbuf))) == NULL) {
01398         E_ERROR_SYSTEM("allocation of audio buffer failed");
01399         free(r);
01400         return NULL;
01401     }
01402     if ((r->pow_hist =
01403          calloc(CONT_AD_POWHISTSIZE, sizeof(*r->pow_hist))) == NULL) {
01404         E_ERROR_SYSTEM("allocation of power history buffer failed");
01405         free(r->adbuf);
01406         free(r);
01407         return NULL;
01408     }
01409     if ((r->frm_pow =
01410          calloc(CONT_AD_ADFRMSIZE, sizeof(*r->frm_pow))) == NULL) {
01411         E_ERROR_SYSTEM("allocation of frame power buffer failed");
01412         free(r->pow_hist);
01413         free(r->adbuf);
01414         free(r);
01415         return NULL;
01416     }
01417 
01418     r->state = CONT_AD_STATE_SIL;
01419     r->read_ts = 0;
01420     r->seglen = 0;
01421     r->siglvl = 0;
01422     r->prev_sample = 0;
01423     r->tot_frm = 0;
01424     r->noise_level = CONT_AD_DEFAULT_NOISE;
01425 
01426     r->auto_thresh = 1;
01427     r->delta_sil = CONT_AD_DELTA_SIL;
01428     r->delta_speech = CONT_AD_DELTA_SPEECH;
01429     r->min_noise = CONT_AD_MIN_NOISE;
01430     r->max_noise = CONT_AD_MAX_NOISE;
01431     r->winsize = CONT_AD_WINSIZE;
01432     r->speech_onset = CONT_AD_SPEECH_ONSET;
01433     r->sil_onset = CONT_AD_SIL_ONSET;
01434     r->leader = CONT_AD_LEADER;
01435     r->trailer = CONT_AD_TRAILER;
01436 
01437     r->thresh_sil = r->noise_level + r->delta_sil;
01438     r->thresh_speech = r->noise_level + r->delta_speech;
01439     r->thresh_update = CONT_AD_THRESH_UPDATE;
01440     r->adapt_rate = CONT_AD_ADAPT_RATE;
01441 
01442     r->tail_state = CONT_AD_STATE_SIL;
01443 
01444     r->spseg_head = NULL;
01445     r->spseg_tail = NULL;
01446 
01447     r->rawfp = NULL;
01448     r->logfp = NULL;
01449 
01450     r->n_calib_frame = 0;
01451 
01452     cont_ad_reset(r);
01453 
01454     return r;
01455 }
01456 
01457 
01458 cont_ad_t *
01459 cont_ad_init_rawmode(ad_rec_t * a,
01460                      int32(*func) (ad_rec_t *, int16 *, int32))
01461 {
01462     cont_ad_t *r;
01463 
01464     r = cont_ad_init(a, func);
01465     r->rawmode = 1;
01466 
01467     return r;
01468 }