/* SphinxBase 0.6 — source listing of ngram_model_arpa.c */
00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ 00002 /* ==================================================================== 00003 * Copyright (c) 1999-2007 Carnegie Mellon University. All rights 00004 * reserved. 00005 * 00006 * Redistribution and use in source and binary forms, with or without 00007 * modification, are permitted provided that the following conditions 00008 * are met: 00009 * 00010 * 1. Redistributions of source code must retain the above copyright 00011 * notice, this list of conditions and the following disclaimer. 00012 * 00013 * 2. Redistributions in binary form must reproduce the above copyright 00014 * notice, this list of conditions and the following disclaimer in 00015 * the documentation and/or other materials provided with the 00016 * distribution. 00017 * 00018 * This work was supported in part by funding from the Defense Advanced 00019 * Research Projects Agency and the National Science Foundation of the 00020 * United States of America, and the CMU Sphinx Speech Consortium. 00021 * 00022 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 00023 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 00024 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 00025 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY 00026 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00027 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 00028 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 00029 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 00030 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 00031 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 00032 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
00033 * 00034 * ==================================================================== 00035 * 00036 */ 00037 /* 00038 * \file ngram_model_arpa.c ARPA format language models 00039 * 00040 * Author: David Huggins-Daines <dhuggins@cs.cmu.edu> 00041 */ 00042 00043 #include "sphinxbase/ckd_alloc.h" 00044 #include <string.h> 00045 #include <limits.h> 00046 #include <assert.h> 00047 00048 #include "sphinxbase/err.h" 00049 #include "sphinxbase/pio.h" 00050 #include "sphinxbase/listelem_alloc.h" 00051 #include "sphinxbase/strfuncs.h" 00052 00053 #include "ngram_model_arpa.h" 00054 00055 static ngram_funcs_t ngram_model_arpa_funcs; 00056 00057 #define TSEG_BASE(m,b) ((m)->lm3g.tseg_base[(b)>>LOG_BG_SEG_SZ]) 00058 #define FIRST_BG(m,u) ((m)->lm3g.unigrams[u].bigrams) 00059 #define FIRST_TG(m,b) (TSEG_BASE((m),(b))+((m)->lm3g.bigrams[b].trigrams)) 00060 00061 /* 00062 * Read and return #unigrams, #bigrams, #trigrams as stated in input file. 00063 */ 00064 static int 00065 ReadNgramCounts(lineiter_t **li, int32 * n_ug, int32 * n_bg, int32 * n_tg) 00066 { 00067 int32 ngram, ngram_cnt; 00068 00069 /* skip file until past the '\data\' marker */ 00070 while (*li) { 00071 string_trim((*li)->buf, STRING_BOTH); 00072 if (strcmp((*li)->buf, "\\data\\") == 0) 00073 break; 00074 *li = lineiter_next(*li); 00075 } 00076 if (*li == NULL || strcmp((*li)->buf, "\\data\\") != 0) { 00077 E_INFO("No \\data\\ mark in LM file\n"); 00078 return -1; 00079 } 00080 00081 *n_ug = *n_bg = *n_tg = 0; 00082 while ((*li = lineiter_next(*li))) { 00083 if (sscanf((*li)->buf, "ngram %d=%d", &ngram, &ngram_cnt) != 2) 00084 break; 00085 switch (ngram) { 00086 case 1: 00087 *n_ug = ngram_cnt; 00088 break; 00089 case 2: 00090 *n_bg = ngram_cnt; 00091 break; 00092 case 3: 00093 *n_tg = ngram_cnt; 00094 break; 00095 default: 00096 E_ERROR("Unknown ngram (%d)\n", ngram); 00097 return -1; 00098 } 00099 } 00100 if (*li == NULL) { 00101 E_ERROR("EOF while reading ngram counts\n"); 00102 return -1; 00103 } 00104 00105 /* 
Position iterator to the unigrams header '\1-grams:\' */ 00106 while ((*li = lineiter_next(*li))) { 00107 string_trim((*li)->buf, STRING_BOTH); 00108 if (strcmp((*li)->buf, "\\1-grams:") == 0) 00109 break; 00110 } 00111 if (*li == NULL) { 00112 E_ERROR_SYSTEM("Failed to read \\1-grams: mark"); 00113 return -1; 00114 } 00115 00116 if ((*n_ug <= 0) || (*n_bg < 0) || (*n_tg < 0)) { 00117 E_ERROR("Bad or missing ngram count\n"); 00118 return -1; 00119 } 00120 return 0; 00121 } 00122 00123 /* 00124 * Read in the unigrams from given file into the LM structure model. 00125 * On entry to this procedure, the iterator is positioned to the 00126 * header line '\1-grams:'. 00127 */ 00128 static int 00129 ReadUnigrams(lineiter_t **li, ngram_model_arpa_t * model) 00130 { 00131 ngram_model_t *base = &model->base; 00132 int32 wcnt; 00133 float p1; 00134 00135 E_INFO("Reading unigrams\n"); 00136 00137 wcnt = 0; 00138 while ((*li = lineiter_next(*li))) { 00139 char *wptr[3], *name; 00140 float32 bo_wt = 0.0f; 00141 int n; 00142 00143 string_trim((*li)->buf, STRING_BOTH); 00144 if (strcmp((*li)->buf, "\\2-grams:") == 0 00145 || strcmp((*li)->buf, "\\end\\") == 0) 00146 break; 00147 00148 if ((n = str2words((*li)->buf, wptr, 3)) < 2) { 00149 if ((*li)->buf[0] != '\0') 00150 E_WARN("Format error; unigram ignored: %s\n", (*li)->buf); 00151 continue; 00152 } 00153 else { 00154 p1 = (float)atof_c(wptr[0]); 00155 name = wptr[1]; 00156 if (n == 3) 00157 bo_wt = (float)atof_c(wptr[2]); 00158 } 00159 00160 if (wcnt >= base->n_counts[0]) { 00161 E_ERROR("Too many unigrams\n"); 00162 return -1; 00163 } 00164 00165 /* Associate name with word id */ 00166 base->word_str[wcnt] = ckd_salloc(name); 00167 if ((hash_table_enter(base->wid, base->word_str[wcnt], (void *)(long)wcnt)) 00168 != (void *)(long)wcnt) { 00169 E_WARN("Duplicate word in dictionary: %s\n", base->word_str[wcnt]); 00170 } 00171 model->lm3g.unigrams[wcnt].prob1.l = logmath_log10_to_log(base->lmath, p1); 00172 
model->lm3g.unigrams[wcnt].bo_wt1.l = logmath_log10_to_log(base->lmath, bo_wt); 00173 wcnt++; 00174 } 00175 00176 if (base->n_counts[0] != wcnt) { 00177 E_WARN("lm_t.ucount(%d) != #unigrams read(%d)\n", 00178 base->n_counts[0], wcnt); 00179 base->n_counts[0] = wcnt; 00180 base->n_words = wcnt; 00181 } 00182 return 0; 00183 } 00184 00185 /* 00186 * Read bigrams from given file into given model structure. 00187 */ 00188 static int 00189 ReadBigrams(lineiter_t **li, ngram_model_arpa_t * model) 00190 { 00191 ngram_model_t *base = &model->base; 00192 int32 w1, w2, prev_w1, bgcount; 00193 bigram_t *bgptr; 00194 00195 E_INFO("Reading bigrams\n"); 00196 00197 bgcount = 0; 00198 bgptr = model->lm3g.bigrams; 00199 prev_w1 = -1; 00200 00201 while ((*li = lineiter_next(*li))) { 00202 float32 p, bo_wt = 0.0f; 00203 int32 p2, bo_wt2; 00204 char *wptr[4], *word1, *word2; 00205 int n; 00206 00207 string_trim((*li)->buf, STRING_BOTH); 00208 wptr[3] = NULL; 00209 if ((n = str2words((*li)->buf, wptr, 4)) < 3) { 00210 if ((*li)->buf[0] != '\0') 00211 break; 00212 continue; 00213 } 00214 else { 00215 p = (float32)atof_c(wptr[0]); 00216 word1 = wptr[1]; 00217 word2 = wptr[2]; 00218 if (wptr[3]) 00219 bo_wt = (float32)atof_c(wptr[3]); 00220 } 00221 00222 if ((w1 = ngram_wid(base, word1)) == NGRAM_INVALID_WID) { 00223 E_ERROR("Unknown word: %s, skipping bigram (%s %s)\n", 00224 word1, word1, word2); 00225 continue; 00226 } 00227 if ((w2 = ngram_wid(base, word2)) == NGRAM_INVALID_WID) { 00228 E_ERROR("Unknown word: %s, skipping bigram (%s %s)\n", 00229 word2, word1, word2); 00230 continue; 00231 } 00232 00233 /* FIXME: Should use logmath_t quantization here. */ 00234 /* HACK!! 
to quantize probs to 4 decimal digits */ 00235 p = (float32)((int32)(p * 10000)) / 10000; 00236 bo_wt = (float32)((int32)(bo_wt * 10000)) / 10000; 00237 00238 p2 = logmath_log10_to_log(base->lmath, p); 00239 bo_wt2 = logmath_log10_to_log(base->lmath, bo_wt); 00240 00241 if (bgcount >= base->n_counts[1]) { 00242 E_ERROR("Too many bigrams\n"); 00243 return -1; 00244 } 00245 00246 bgptr->wid = w2; 00247 bgptr->prob2 = sorted_id(&model->sorted_prob2, &p2); 00248 if (base->n_counts[2] > 0) 00249 bgptr->bo_wt2 = sorted_id(&model->sorted_bo_wt2, &bo_wt2); 00250 00251 if (w1 != prev_w1) { 00252 if (w1 < prev_w1) { 00253 E_ERROR("Bigrams not in unigram order\n"); 00254 return -1; 00255 } 00256 00257 for (prev_w1++; prev_w1 <= w1; prev_w1++) 00258 model->lm3g.unigrams[prev_w1].bigrams = bgcount; 00259 prev_w1 = w1; 00260 } 00261 bgcount++; 00262 bgptr++; 00263 00264 if ((bgcount & 0x0000ffff) == 0) { 00265 E_INFOCONT("."); 00266 } 00267 } 00268 if (*li == NULL || ((strcmp((*li)->buf, "\\end\\") != 0) 00269 && (strcmp((*li)->buf, "\\3-grams:") != 0))) { 00270 E_ERROR("Bad bigram: %s\n", (*li)->buf); 00271 return -1; 00272 } 00273 00274 for (prev_w1++; prev_w1 <= base->n_counts[0]; prev_w1++) 00275 model->lm3g.unigrams[prev_w1].bigrams = bgcount; 00276 00277 return 0; 00278 } 00279 00280 /* 00281 * Very similar to ReadBigrams. 
00282 */ 00283 static int 00284 ReadTrigrams(lineiter_t **li, ngram_model_arpa_t * model) 00285 { 00286 ngram_model_t *base = &model->base; 00287 int32 i, w1, w2, w3, prev_w1, prev_w2, tgcount, prev_bg, bg, endbg; 00288 int32 seg, prev_seg, prev_seg_lastbg; 00289 trigram_t *tgptr; 00290 bigram_t *bgptr; 00291 00292 E_INFO("Reading trigrams\n"); 00293 00294 tgcount = 0; 00295 tgptr = model->lm3g.trigrams; 00296 prev_w1 = -1; 00297 prev_w2 = -1; 00298 prev_bg = -1; 00299 prev_seg = -1; 00300 00301 while ((*li = lineiter_next(*li))) { 00302 float32 p; 00303 int32 p3; 00304 char *wptr[4], *word1, *word2, *word3; 00305 00306 string_trim((*li)->buf, STRING_BOTH); 00307 if (str2words((*li)->buf, wptr, 4) != 4) { 00308 if ((*li)->buf[0] != '\0') 00309 break; 00310 continue; 00311 } 00312 else { 00313 p = (float32)atof_c(wptr[0]); 00314 word1 = wptr[1]; 00315 word2 = wptr[2]; 00316 word3 = wptr[3]; 00317 } 00318 00319 if ((w1 = ngram_wid(base, word1)) == NGRAM_INVALID_WID) { 00320 E_ERROR("Unknown word: %s, skipping trigram (%s %s %s)\n", 00321 word1, word1, word2, word3); 00322 continue; 00323 } 00324 if ((w2 = ngram_wid(base, word2)) == NGRAM_INVALID_WID) { 00325 E_ERROR("Unknown word: %s, skipping trigram (%s %s %s)\n", 00326 word2, word1, word2, word3); 00327 continue; 00328 } 00329 if ((w3 = ngram_wid(base, word3)) == NGRAM_INVALID_WID) { 00330 E_ERROR("Unknown word: %s, skipping trigram (%s %s %s)\n", 00331 word3, word1, word2, word3); 00332 continue; 00333 } 00334 00335 /* FIXME: Should use logmath_t quantization here. */ 00336 /* HACK!! 
to quantize probs to 4 decimal digits */ 00337 p = (float32)((int32)(p * 10000)) / 10000; 00338 p3 = logmath_log10_to_log(base->lmath, p); 00339 00340 if (tgcount >= base->n_counts[2]) { 00341 E_ERROR("Too many trigrams\n"); 00342 return -1; 00343 } 00344 00345 tgptr->wid = w3; 00346 tgptr->prob3 = sorted_id(&model->sorted_prob3, &p3); 00347 00348 if ((w1 != prev_w1) || (w2 != prev_w2)) { 00349 /* Trigram for a new bigram; update tg info for all previous bigrams */ 00350 if ((w1 < prev_w1) || ((w1 == prev_w1) && (w2 < prev_w2))) { 00351 E_ERROR("Trigrams not in bigram order\n"); 00352 return -1; 00353 } 00354 00355 bg = (w1 != 00356 prev_w1) ? model->lm3g.unigrams[w1].bigrams : prev_bg + 1; 00357 endbg = model->lm3g.unigrams[w1 + 1].bigrams; 00358 bgptr = model->lm3g.bigrams + bg; 00359 for (; (bg < endbg) && (bgptr->wid != w2); bg++, bgptr++); 00360 if (bg >= endbg) { 00361 E_ERROR("Missing bigram for trigram: %s", (*li)->buf); 00362 return -1; 00363 } 00364 00365 /* bg = bigram entry index for <w1,w2>. 
Update tseg_base */ 00366 seg = bg >> LOG_BG_SEG_SZ; 00367 for (i = prev_seg + 1; i <= seg; i++) 00368 model->lm3g.tseg_base[i] = tgcount; 00369 00370 /* Update trigrams pointers for all bigrams until bg */ 00371 if (prev_seg < seg) { 00372 int32 tgoff = 0; 00373 00374 if (prev_seg >= 0) { 00375 tgoff = tgcount - model->lm3g.tseg_base[prev_seg]; 00376 if (tgoff > 65535) { 00377 E_ERROR("Offset from tseg_base > 65535\n"); 00378 return -1; 00379 } 00380 } 00381 00382 prev_seg_lastbg = ((prev_seg + 1) << LOG_BG_SEG_SZ) - 1; 00383 bgptr = model->lm3g.bigrams + prev_bg; 00384 for (++prev_bg, ++bgptr; prev_bg <= prev_seg_lastbg; 00385 prev_bg++, bgptr++) 00386 bgptr->trigrams = tgoff; 00387 00388 for (; prev_bg <= bg; prev_bg++, bgptr++) 00389 bgptr->trigrams = 0; 00390 } 00391 else { 00392 int32 tgoff; 00393 00394 tgoff = tgcount - model->lm3g.tseg_base[prev_seg]; 00395 if (tgoff > 65535) { 00396 E_ERROR("Offset from tseg_base > 65535\n"); 00397 return -1; 00398 } 00399 00400 bgptr = model->lm3g.bigrams + prev_bg; 00401 for (++prev_bg, ++bgptr; prev_bg <= bg; prev_bg++, bgptr++) 00402 bgptr->trigrams = tgoff; 00403 } 00404 00405 prev_w1 = w1; 00406 prev_w2 = w2; 00407 prev_bg = bg; 00408 prev_seg = seg; 00409 } 00410 00411 tgcount++; 00412 tgptr++; 00413 00414 if ((tgcount & 0x0000ffff) == 0) { 00415 E_INFOCONT("."); 00416 } 00417 } 00418 if (*li == NULL || strcmp((*li)->buf, "\\end\\") != 0) { 00419 E_ERROR("Bad trigram: %s\n", (*li)->buf); 00420 return -1; 00421 } 00422 00423 for (prev_bg++; prev_bg <= base->n_counts[1]; prev_bg++) { 00424 if ((prev_bg & (BG_SEG_SZ - 1)) == 0) 00425 model->lm3g.tseg_base[prev_bg >> LOG_BG_SEG_SZ] = tgcount; 00426 if ((tgcount - model->lm3g.tseg_base[prev_bg >> LOG_BG_SEG_SZ]) > 65535) { 00427 E_ERROR("Offset from tseg_base > 65535\n"); 00428 return -1; 00429 } 00430 model->lm3g.bigrams[prev_bg].trigrams = 00431 tgcount - model->lm3g.tseg_base[prev_bg >> LOG_BG_SEG_SZ]; 00432 } 00433 return 0; 00434 } 00435 00436 static unigram_t * 
00437 new_unigram_table(int32 n_ug) 00438 { 00439 unigram_t *table; 00440 int32 i; 00441 00442 table = ckd_calloc(n_ug, sizeof(unigram_t)); 00443 for (i = 0; i < n_ug; i++) { 00444 table[i].prob1.l = INT_MIN; 00445 table[i].bo_wt1.l = INT_MIN; 00446 } 00447 return table; 00448 } 00449 00450 ngram_model_t * 00451 ngram_model_arpa_read(cmd_ln_t *config, 00452 const char *file_name, 00453 logmath_t *lmath) 00454 { 00455 lineiter_t *li; 00456 FILE *fp; 00457 int32 is_pipe; 00458 int32 n_unigram; 00459 int32 n_bigram; 00460 int32 n_trigram; 00461 int32 n; 00462 ngram_model_arpa_t *model; 00463 ngram_model_t *base; 00464 00465 if ((fp = fopen_comp(file_name, "r", &is_pipe)) == NULL) { 00466 E_ERROR("File %s not found\n", file_name); 00467 return NULL; 00468 } 00469 li = lineiter_start(fp); 00470 00471 /* Read #unigrams, #bigrams, #trigrams from file */ 00472 if (ReadNgramCounts(&li, &n_unigram, &n_bigram, &n_trigram) == -1) { 00473 lineiter_free(li); 00474 fclose_comp(fp, is_pipe); 00475 return NULL; 00476 } 00477 E_INFO("ngrams 1=%d, 2=%d, 3=%d\n", n_unigram, n_bigram, n_trigram); 00478 00479 /* Allocate space for LM, including initial OOVs and placeholders; initialize it */ 00480 model = ckd_calloc(1, sizeof(*model)); 00481 base = &model->base; 00482 if (n_trigram > 0) 00483 n = 3; 00484 else if (n_bigram > 0) 00485 n = 2; 00486 else 00487 n = 1; 00488 /* Initialize base model. */ 00489 ngram_model_init(base, &ngram_model_arpa_funcs, lmath, n, n_unigram); 00490 base->n_counts[0] = n_unigram; 00491 base->n_counts[1] = n_bigram; 00492 base->n_counts[2] = n_trigram; 00493 base->writable = TRUE; 00494 00495 /* 00496 * Allocate one extra unigram and bigram entry: sentinels to terminate 00497 * followers (bigrams and trigrams, respectively) of previous entry. 
00498 */ 00499 model->lm3g.unigrams = new_unigram_table(n_unigram + 1); 00500 model->lm3g.bigrams = 00501 ckd_calloc(n_bigram + 1, sizeof(bigram_t)); 00502 if (n_trigram > 0) 00503 model->lm3g.trigrams = 00504 ckd_calloc(n_trigram, sizeof(trigram_t)); 00505 00506 if (n_trigram > 0) { 00507 model->lm3g.tseg_base = 00508 ckd_calloc((n_bigram + 1) / BG_SEG_SZ + 1, 00509 sizeof(int32)); 00510 } 00511 if (ReadUnigrams(&li, model) == -1) { 00512 fclose_comp(fp, is_pipe); 00513 ngram_model_free(base); 00514 return NULL; 00515 } 00516 E_INFO("%8d = #unigrams created\n", base->n_counts[0]); 00517 00518 init_sorted_list(&model->sorted_prob2); 00519 if (base->n_counts[2] > 0) 00520 init_sorted_list(&model->sorted_bo_wt2); 00521 00522 if (base->n_counts[1] > 0) { 00523 if (ReadBigrams(&li, model) == -1) { 00524 fclose_comp(fp, is_pipe); 00525 ngram_model_free(base); 00526 return NULL; 00527 } 00528 00529 base->n_counts[1] = FIRST_BG(model, base->n_counts[0]); 00530 model->lm3g.n_prob2 = model->sorted_prob2.free; 00531 model->lm3g.prob2 = vals_in_sorted_list(&model->sorted_prob2); 00532 free_sorted_list(&model->sorted_prob2); 00533 E_INFO("%8d = #bigrams created\n", base->n_counts[1]); 00534 E_INFO("%8d = #prob2 entries\n", model->lm3g.n_prob2); 00535 } 00536 00537 if (base->n_counts[2] > 0) { 00538 /* Create trigram bo-wts array */ 00539 model->lm3g.n_bo_wt2 = model->sorted_bo_wt2.free; 00540 model->lm3g.bo_wt2 = vals_in_sorted_list(&model->sorted_bo_wt2); 00541 free_sorted_list(&model->sorted_bo_wt2); 00542 E_INFO("%8d = #bo_wt2 entries\n", model->lm3g.n_bo_wt2); 00543 00544 init_sorted_list(&model->sorted_prob3); 00545 00546 if (ReadTrigrams(&li, model) == -1) { 00547 fclose_comp(fp, is_pipe); 00548 ngram_model_free(base); 00549 return NULL; 00550 } 00551 00552 base->n_counts[2] = FIRST_TG(model, base->n_counts[1]); 00553 model->lm3g.n_prob3 = model->sorted_prob3.free; 00554 model->lm3g.prob3 = vals_in_sorted_list(&model->sorted_prob3); 00555 E_INFO("%8d = #trigrams 
created\n", base->n_counts[2]); 00556 E_INFO("%8d = #prob3 entries\n", model->lm3g.n_prob3); 00557 00558 free_sorted_list(&model->sorted_prob3); 00559 00560 /* Initialize tginfo */ 00561 model->lm3g.tginfo = ckd_calloc(n_unigram, sizeof(tginfo_t *)); 00562 model->lm3g.le = listelem_alloc_init(sizeof(tginfo_t)); 00563 } 00564 00565 lineiter_free(li); 00566 fclose_comp(fp, is_pipe); 00567 return base; 00568 } 00569 00570 int 00571 ngram_model_arpa_write(ngram_model_t *model, 00572 const char *file_name) 00573 { 00574 ngram_iter_t *itor; 00575 FILE *fh; 00576 int i; 00577 00578 if ((fh = fopen(file_name, "w")) == NULL) { 00579 E_ERROR_SYSTEM("Failed to open %s for writing", file_name); 00580 return -1; 00581 } 00582 fprintf(fh, "This is an ARPA-format language model file, generated by CMU Sphinx\n"); 00583 00584 /* The ARPA format doesn't require any extra information that 00585 * N-Gram iterators can't give us, so this is very 00586 * straightforward compared with DMP writing. */ 00587 00588 /* Write N-gram counts. 
*/ 00589 fprintf(fh, "\\data\\\n"); 00590 for (i = 0; i < model->n; ++i) { 00591 fprintf(fh, "ngram %d=%d\n", i+1, model->n_counts[i]); 00592 } 00593 00594 /* Write N-grams */ 00595 for (i = 0; i < model->n; ++i) { 00596 fprintf(fh, "\n\\%d-grams:\n", i + 1); 00597 for (itor = ngram_model_mgrams(model, i); itor; itor = ngram_iter_next(itor)) { 00598 int32 const *wids; 00599 int32 score, bowt; 00600 int j; 00601 00602 wids = ngram_iter_get(itor, &score, &bowt); 00603 fprintf(fh, "%.4f ", logmath_log_to_log10(model->lmath, score)); 00604 for (j = 0; j <= i; ++j) { 00605 assert(wids[j] < model->n_counts[0]); 00606 fprintf(fh, "%s ", model->word_str[wids[j]]); 00607 } 00608 if (i < model->n-1) 00609 fprintf(fh, "%.4f", logmath_log_to_log10(model->lmath, bowt)); 00610 fprintf(fh, "\n"); 00611 } 00612 } 00613 fprintf(fh, "\n\\end\\\n"); 00614 return fclose(fh); 00615 } 00616 00617 static int 00618 ngram_model_arpa_apply_weights(ngram_model_t *base, float32 lw, 00619 float32 wip, float32 uw) 00620 { 00621 ngram_model_arpa_t *model = (ngram_model_arpa_t *)base; 00622 lm3g_apply_weights(base, &model->lm3g, lw, wip, uw); 00623 return 0; 00624 } 00625 00626 /* Lousy "templating" for things that are largely the same in DMP and 00627 * ARPA models, except for the bigram and trigram types and some 00628 * names. 
*/ 00629 #define NGRAM_MODEL_TYPE ngram_model_arpa_t 00630 #include "lm3g_templates.c" 00631 00632 static void 00633 ngram_model_arpa_free(ngram_model_t *base) 00634 { 00635 ngram_model_arpa_t *model = (ngram_model_arpa_t *)base; 00636 ckd_free(model->lm3g.unigrams); 00637 ckd_free(model->lm3g.bigrams); 00638 ckd_free(model->lm3g.trigrams); 00639 ckd_free(model->lm3g.prob2); 00640 ckd_free(model->lm3g.bo_wt2); 00641 ckd_free(model->lm3g.prob3); 00642 lm3g_tginfo_free(base, &model->lm3g); 00643 ckd_free(model->lm3g.tseg_base); 00644 } 00645 00646 static ngram_funcs_t ngram_model_arpa_funcs = { 00647 ngram_model_arpa_free, /* free */ 00648 ngram_model_arpa_apply_weights, /* apply_weights */ 00649 lm3g_template_score, /* score */ 00650 lm3g_template_raw_score, /* raw_score */ 00651 lm3g_template_add_ug, /* add_ug */ 00652 lm3g_template_flush, /* flush */ 00653 lm3g_template_iter, /* iter */ 00654 lm3g_template_mgrams, /* mgrams */ 00655 lm3g_template_successors, /* successors */ 00656 lm3g_template_iter_get, /* iter_get */ 00657 lm3g_template_iter_next, /* iter_next */ 00658 lm3g_template_iter_free /* iter_free */ 00659 };