44#include <sphinxbase/byteorder.h>
46#include "ngram_model_internal.h"
47#include "ngrams_raw.h"
/*
 * Comparator for raw n-gram records; it is the callback handed to qsort()
 * at the sort calls in this file.
 * NOTE(review): this listing is an excerpt -- the return type, the local
 * declarations (a, b, a_w_ptr, b_w_ptr) and several body lines are not
 * visible here.
 */
50ngram_ord_comparator(
const void *a_raw,
const void *b_raw)
/* Walk both word arrays in parallel until one of them runs out. */
56 while (a_w_ptr < a->order && b_w_ptr < b->order) {
/* Words match at this position; presumably both cursors advance (branch body not shown). */
57 if (a->words[a_w_ptr] == b->words[b_w_ptr]) {
/* First differing word decides the ordering (the returned values are not shown). */
62 if (a->words[a_w_ptr] < b->words[b_w_ptr])
/* Loop ended without an early return: fall back to the difference of the two orders. */
67 return a->order - b->order;
/*
 * Fragment of the routine that parses one text line into a raw n-gram.
 * NOTE(review): the function header is not visible in this excerpt; the
 * parameter list matches the ngrams_raw_read_line() call made by the
 * section reader, so that is presumably this function -- confirm.
 */
72 logmath_t *lmath,
int order,
int order_max,
/* Token pointers filled by the line splitter; sized for NGRAM_MAX_ORDER + 1 entries. */
77 char *wptr[NGRAM_MAX_ORDER + 1];
/* A usable line needs at least order + 1 tokens (wptr[0] is read as the probability below). */
80 words_expected = order + 1;
83 NGRAM_MAX_ORDER + 1)) < words_expected) {
/* Too few tokens: report the line and ignore this n-gram. */
84 E_ERROR(
"Format error; %d-gram ignored at line %d\n", order, li->lineno);
88 raw_ngram->order = order;
/* Highest-order n-grams: only a probability is read in this branch. */
90 if (order == order_max) {
91 raw_ngram->prob =
atof_c(wptr[0]);
/* A positive value is warned about and clamped to 0. */
92 if (raw_ngram->prob > 0) {
93 E_WARN(
"%d-gram '%s' has positive probability\n", order, wptr[1]);
94 raw_ngram->prob = 0.0f;
/* Lower orders: a weight and a backoff are handled (their assignments are only partly visible). */
100 float weight, backoff;
/* Same positive-probability warning and clamp as in the highest-order branch. */
104 E_WARN(
"%d-gram '%s' has positive probability\n", order, wptr[1]);
105 raw_ngram->prob = 0.0f;
/* Exactly order + 1 tokens means no backoff column is present: use 0. */
112 if (n == order + 1) {
113 raw_ngram->backoff = 0.0f;
/* Otherwise the backoff is parsed from the token at index order + 1. */
116 backoff =
atof_c(wptr[order + 1]);
/* Storage for the n-gram's `order` word ids. */
122 (uint32 *)
ckd_calloc(order,
sizeof(*raw_ngram->words));
/*
 * Fill words[] from the last slot down to the first while i counts up
 * from 1, i.e. the stored order is reversed relative to i.
 * NOTE(review): the loop body is not shown; presumably it looks up wptr[i].
 */
123 for (word_out = raw_ngram->words + order - 1, i = 1;
124 word_out >= raw_ngram->words; --word_out, i++) {
/*
 * Fragment of the routine that reads one "\N-grams:" section.
 * NOTE(review): the function header is not visible in this excerpt; the
 * name is inferred from the ngrams_raw_read_section() call site -- confirm.
 */
133 int order,
int order_max)
135 char expected_header[20];
/*
 * Build the section header to look for, e.g. "\2-grams:".
 * NOTE(review): sprintf() is unbounded; with a 32-bit int the longest
 * expansion is 19 characters plus the terminator, which exactly fills the
 * 20-byte buffer. snprintf() would make the bound explicit.
 */
138 sprintf(expected_header,
"\\%d-grams:", order);
/* Loop while lines remain and the current line is not the expected header. */
139 while (*li && strcmp((*li)->buf, expected_header) != 0) {
/* Reported when the header could not be found. */
144 E_ERROR(
"Failed to find '%s', language model file truncated\n", expected_header);
/* Visit up to *count lines; cur is the output slot inside *raw_ngrams. */
149 for (i = 0, cur = 0; i < *count && *li != NULL; i++) {
/* Reported when the input ends before all announced n-grams were read. */
152 E_ERROR(
"Unexpected end of ARPA file. Failed to read %d-gram\n",
/* Parse the line into slot cur; a 0 result is presumably success (branch body not shown). */
156 if (ngrams_raw_read_line(*li, wid, lmath, order, order_max,
157 *raw_ngrams + cur) == 0) {
/* Sort the section's n-grams with the word-order comparator. */
162 qsort(*raw_ngrams, *count,
sizeof(
ngram_raw_t), &ngram_ord_comparator);
/*
 * Fragment of the driver that reads every n-gram section from order 2 up
 * to `order`. NOTE(review): the enclosing function's header is not visible
 * in this excerpt.
 * Section N is stored in raw_ngrams[N - 2] and its count lives at
 * counts[N - 1].
 */
176 for (order_it = 2; order_it <= order; order_it++) {
/* A negative result from the section reader is treated as failure (handling not shown). */
177 if (ngrams_raw_read_section(&raw_ngrams[order_it - 2], li, wid, lmath,
178 counts + order_it - 1, order_it, order) < 0)
/* Input ended before the end mark: report it and release what was read. */
184 E_ERROR(
"ARPA file ends without end-mark\n");
185 ngrams_raw_free(raw_ngrams, counts, order);
/* The current line must be the literal end mark "\end\". */
189 if (strcmp((*li)->buf,
"\\end\\") != 0) {
/* NOTE(review): the name of the logging call on this line is missing from the excerpt. */
191 (
"Finished reading ARPA file. Expecting end mark but found '%s'\n",
/*
 * Reads a weight table from a DMP file and uses it to replace the table
 * indices stored in raw n-grams with the actual weights.
 * NOTE(review): this listing is an excerpt -- the return type, remaining
 * parameters and several body lines are not visible. None of the fread()
 * results visible here are checked.
 */
200read_dmp_weight_array(FILE * fp,
logmath_t * lmath, uint8 do_swap,
/* Number of entries in the table. */
207 fread(&k,
sizeof(k), 1, fp);
/* The k table entries themselves. */
212 fread(tmp_weight_arr,
sizeof(*tmp_weight_arr), k, fp);
213 for (i = 0; i < k; i++) {
/* Byte-swap the raw 32-bit value (presumably only when do_swap is set; the condition is not shown). */
215 SWAP_INT32(&tmp_weight_arr[i].l);
/* The float view of the entry is rewritten here (right-hand side not shown). */
217 tmp_weight_arr[i].f =
/* Resolve the indices of all `counts` n-grams against the table. */
221 for (i = 0; i < counts; i++) {
/* weight_idx == 0 selects the probability field; the other branch handles backoff. */
222 if (weight_idx == 0) {
/*
 * The field currently holds a table index stored as a float; the (int)
 * cast truncates it back to the index.
 * NOTE(review): no bounds check of the index against k is visible.
 */
224 tmp_weight_arr[(int) raw_ngrams[i].prob].f;
226 raw_ngrams[i].backoff =
227 tmp_weight_arr[(int) raw_ngrams[i].backoff].f;
/* Shift applied to a bigram index to select its tseg_base entry (2^9 = 512 bigrams per entry). */
233#define BIGRAM_SEGMENT_SIZE 9
/*
 * Reads bigrams and trigrams from a binary DMP file into raw n-gram arrays
 * (raw_ngrams[0] and raw_ngrams[1]) and sorts them.
 * NOTE(review): this listing is an excerpt -- the return type, local
 * declarations and many body lines are not visible. None of the fread()
 * results visible here are checked.
 */
236ngrams_raw_read_dmp(FILE * fp,
logmath_t * lmath, uint32 * counts,
237 int order, uint32 * unigram_next, uint8 do_swap)
240 uint16 *bigrams_next;
247 sizeof(*raw_ngrams[0]));
/* One 16-bit "next" value per bigram record, plus one for the extra trailing record. */
249 (uint16 *)
ckd_calloc((
size_t) (counts[1] + 1),
250 sizeof(*bigrams_next));
/* Note the <=: counts[1] + 1 records are read; the last one gets no n-gram data (see the j != counts[1] guards). */
252 for (j = 0; j <= (int32) counts[1]; j++) {
253 uint16 wid, prob_idx, bo_idx;
256 fread(&wid,
sizeof(wid), 1, fp);
259 raw_ngram->order = 2;
/* Loop while j equals unigram_next[ngram_idx]; presumably advances ngram_idx to the owning unigram (body not shown). */
260 while (ngram_idx < counts[0] && j == unigram_next[ngram_idx]) {
264 if (j != counts[1]) {
266 (uint32 *)
ckd_calloc(2,
sizeof(*raw_ngram->words));
/* words[0] is the word id read from the file; words[1] is derived from the unigram cursor. */
267 raw_ngram->words[0] = (uint32) wid;
268 raw_ngram->words[1] = (uint32) ngram_idx - 1;
271 fread(&prob_idx,
sizeof(prob_idx), 1, fp);
272 fread(&bo_idx,
sizeof(bo_idx), 1, fp);
273 fread(&bigrams_next[j],
sizeof(bigrams_next[j]), 1, fp);
/* Byte-swap the 16-bit fields (presumably only when do_swap is set; the condition is not shown). */
275 SWAP_INT16(&prob_idx);
277 SWAP_INT16(&bigrams_next[j]);
280 if (j != counts[1]) {
/*
 * Store the table indices as floats for now; the + 0.5f keeps the later
 * (int) truncation in read_dmp_weight_array() on the intended index.
 */
281 raw_ngram->prob = prob_idx + 0.5f;
282 raw_ngram->backoff = bo_idx + 0.5f;
/* The unigram cursor did not reach counts[0]: the file is treated as corrupted. */
286 if (ngram_idx < counts[0]) {
287 E_ERROR(
"Corrupted model, not enough unigrams %d %d\n", ngram_idx, counts[0]);
289 ngrams_raw_free(raw_ngrams, counts, order);
297 sizeof(*raw_ngrams[1]));
/* Trigram records: a 16-bit word id and a 16-bit probability index each. */
298 for (j = 0; j < (int32) counts[2]; j++) {
299 uint16 wid, prob_idx;
302 fread(&wid,
sizeof(wid), 1, fp);
303 fread(&prob_idx,
sizeof(prob_idx), 1, fp);
306 SWAP_INT16(&prob_idx);
309 raw_ngram->order = 3;
311 (uint32 *)
ckd_calloc(3,
sizeof(*raw_ngram->words));
/* Only words[0] is known here; words[1] and words[2] are filled in from the bigrams further down. */
312 raw_ngram->words[0] = (uint32) wid;
313 raw_ngram->prob = prob_idx + 0.5f;
/*
 * Resolve the stored indices into weights: two passes sized by counts[1]
 * and one sized by counts[2].
 * NOTE(review): the remaining arguments are not shown; presumably bigram
 * probability, bigram backoff and trigram probability -- confirm.
 */
318 read_dmp_weight_array(fp, lmath, do_swap, (int32) counts[1],
324 read_dmp_weight_array(fp, lmath, do_swap, (int32) counts[1],
327 read_dmp_weight_array(fp, lmath, do_swap, (int32) counts[2],
/* Number of tseg_base entries, followed by the entries themselves. */
330 fread(&k,
sizeof(k), 1, fp);
333 tseg_base = (int32 *)
ckd_calloc(k,
sizeof(int32));
334 fread(tseg_base,
sizeof(int32), k, fp);
336 for (j = 0; j < (uint32) k; j++) {
337 SWAP_INT32(&tseg_base[j]);
/* For each bigram (j is 1-based), compute the trigram index that ends its range. */
341 for (j = 1; j <= counts[1]; j++) {
342 uint32 next_ngram_idx =
/* Segment base for bigram j plus a second term that is not shown in this excerpt. */
343 (uint32) (tseg_base[j >> BIGRAM_SEGMENT_SIZE] +
/* Every trigram below that index takes its two remaining words from bigram j - 1. */
345 while (ngram_idx < next_ngram_idx) {
346 raw_ngrams[1][ngram_idx].words[1] =
347 raw_ngrams[0][j - 1].words[0];
348 raw_ngrams[1][ngram_idx].words[2] =
349 raw_ngrams[0][j - 1].words[1];
/* Trigrams left over after the last bigram: the file is treated as corrupted. */
355 if (ngram_idx < counts[2]) {
356 E_ERROR(
"Corrupted model, some trigrams have no corresponding bigram\n");
358 ngrams_raw_free(raw_ngrams, counts, order);
/* Sort bigrams, then trigrams, with the word-order comparator. */
365 qsort(raw_ngrams[0], (
size_t) counts[1],
sizeof(*raw_ngrams[0]),
366 &ngram_ord_comparator);
368 qsort(raw_ngrams[1], (
size_t) counts[2],
sizeof(*raw_ngrams[1]),
369 &ngram_ord_comparator);
/*
 * Releases the word arrays owned by the raw n-grams of every stored order.
 * NOTE(review): this listing is an excerpt -- the return type, local
 * declarations and whatever follows the inner loop (e.g. freeing the
 * per-order arrays themselves) are not visible here.
 */
375ngrams_raw_free(
ngram_raw_t ** raw_ngrams, uint32 * counts,
int order)
/* raw_ngrams has order - 1 slots; slot order_it pairs with counts[order_it + 1]. */
380 for (order_it = 0; order_it < order - 1; order_it++) {
381 for (num = 0; num < counts[order_it + 1]; num++) {
/* Free the word-id array of each n-gram. */
382 ckd_free(raw_ngrams[order_it][num].words);
Sphinx's memory allocation/deallocation routines.
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
#define ckd_calloc(n, sz)
Macros to simplify the use of above functions.
Implementation of logging routines.
#define E_ERROR(...)
Print error message to error log.
#define E_WARN(...)
Print warning message to error log.
SPHINXBASE_EXPORT int32 hash_table_lookup_int32(hash_table_t *h, const char *key, int32 *val)
Look up a 32-bit integer value in a hash table.
SPHINXBASE_EXPORT float logmath_log10_to_log_float(logmath_t *lmath, float64 log_p)
Convert base 10 log (in floating point) to float log in base B.
File I/O related operations.
SPHINXBASE_EXPORT lineiter_t * lineiter_next(lineiter_t *li)
Move to the next line in the file.
Miscellaneous useful string functions.
SPHINXBASE_EXPORT int32 str2words(char *line, char **wptr, int32 n_wptr)
Convert a line to an array of "words", based on whitespace separators.
SPHINXBASE_EXPORT double atof_c(char const *str)
Locale independent version of atof().