SphinxBase  5prealpha
ngrams_raw.c
1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3  * Copyright (c) 2015 Carnegie Mellon University. All rights
4  * reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  * notice, this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright
14  * notice, this list of conditions and the following disclaimer in
15  * the documentation and/or other materials provided with the
16  * distribution.
17  *
18  * This work was supported in part by funding from the Defense Advanced
19  * Research Projects Agency and the National Science Foundation of the
20  * United States of America, and the CMU Sphinx Speech Consortium.
21  *
22  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  *
34  * ====================================================================
35  *
36  */
37 
38 #include <string.h>
39 
40 #include <sphinxbase/err.h>
41 #include <sphinxbase/pio.h>
42 #include <sphinxbase/strfuncs.h>
43 #include <sphinxbase/ckd_alloc.h>
44 #include <sphinxbase/byteorder.h>
45 
46 #include "ngram_model_internal.h"
47 #include "ngrams_raw.h"
48 
49 int
50 ngram_ord_comparator(const void *a_raw, const void *b_raw)
51 {
52  ngram_raw_t *a = (ngram_raw_t *) a_raw;
53  ngram_raw_t *b = (ngram_raw_t *) b_raw;
54  int a_w_ptr = 0;
55  int b_w_ptr = 0;
56  while (a_w_ptr < a->order && b_w_ptr < b->order) {
57  if (a->words[a_w_ptr] == b->words[b_w_ptr]) {
58  a_w_ptr++;
59  b_w_ptr++;
60  continue;
61  }
62  if (a->words[a_w_ptr] < b->words[b_w_ptr])
63  return -1;
64  else
65  return 1;
66  }
67  return a->order - b->order;
68 }
69 
70 static int
71 ngrams_raw_read_line(lineiter_t *li, hash_table_t *wid,
72  logmath_t *lmath, int order, int order_max,
73  ngram_raw_t *raw_ngram)
74 {
75  int n, i;
76  int words_expected;
77  char *wptr[NGRAM_MAX_ORDER + 1];
78  uint32 *word_out;
79 
80  words_expected = order + 1;
81  if ((n =
82  str2words(li->buf, wptr,
83  NGRAM_MAX_ORDER + 1)) < words_expected) {
84  E_ERROR("Format error; %d-gram ignored at line %d\n", order, li->lineno);
85  return -1;
86  }
87 
88  raw_ngram->order = order;
89 
90  if (order == order_max) {
91  raw_ngram->prob = atof_c(wptr[0]);
92  if (raw_ngram->prob > 0) {
93  E_WARN("%d-gram '%s' has positive probability\n", order, wptr[1]);
94  raw_ngram->prob = 0.0f;
95  }
96  raw_ngram->prob =
97  logmath_log10_to_log_float(lmath, raw_ngram->prob);
98  }
99  else {
100  float weight, backoff;
101 
102  weight = atof_c(wptr[0]);
103  if (weight > 0) {
104  E_WARN("%d-gram '%s' has positive probability\n", order, wptr[1]);
105  raw_ngram->prob = 0.0f;
106  }
107  else {
108  raw_ngram->prob =
109  logmath_log10_to_log_float(lmath, weight);
110  }
111 
112  if (n == order + 1) {
113  raw_ngram->backoff = 0.0f;
114  }
115  else {
116  backoff = atof_c(wptr[order + 1]);
117  raw_ngram->backoff =
118  logmath_log10_to_log_float(lmath, backoff);
119  }
120  }
121  raw_ngram->words =
122  (uint32 *) ckd_calloc(order, sizeof(*raw_ngram->words));
123  for (word_out = raw_ngram->words + order - 1, i = 1;
124  word_out >= raw_ngram->words; --word_out, i++) {
125  hash_table_lookup_int32(wid, wptr[i], (int32 *) word_out);
126  }
127  return 0;
128 }
129 
130 static int
131 ngrams_raw_read_section(ngram_raw_t ** raw_ngrams, lineiter_t ** li,
132  hash_table_t * wid, logmath_t * lmath, uint32 *count,
133  int order, int order_max)
134 {
135  char expected_header[20];
136  uint32 i, cur;
137 
138  sprintf(expected_header, "\\%d-grams:", order);
139  while (*li && strcmp((*li)->buf, expected_header) != 0) {
140  *li = lineiter_next(*li);
141  }
142 
143  if (*li == NULL) {
144  E_ERROR("Failed to find '%s', language model file truncated\n", expected_header);
145  return -1;
146  }
147 
148  *raw_ngrams = (ngram_raw_t *) ckd_calloc(*count, sizeof(ngram_raw_t));
149  for (i = 0, cur = 0; i < *count && *li != NULL; i++) {
150  *li = lineiter_next(*li);
151  if (*li == NULL) {
152  E_ERROR("Unexpected end of ARPA file. Failed to read %d-gram\n",
153  order);
154  return -1;
155  }
156  if (ngrams_raw_read_line(*li, wid, lmath, order, order_max,
157  *raw_ngrams + cur) == 0) {
158  cur++;
159  }
160  }
161  *count = cur;
162  qsort(*raw_ngrams, *count, sizeof(ngram_raw_t), &ngram_ord_comparator);
163  return 0;
164 }
165 
166 ngram_raw_t **
167 ngrams_raw_read_arpa(lineiter_t ** li, logmath_t * lmath, uint32 * counts,
168  int order, hash_table_t * wid)
169 {
170  ngram_raw_t **raw_ngrams;
171  int order_it;
172 
173  raw_ngrams =
174  (ngram_raw_t **) ckd_calloc(order - 1, sizeof(*raw_ngrams));
175 
176  for (order_it = 2; order_it <= order; order_it++) {
177  if (ngrams_raw_read_section(&raw_ngrams[order_it - 2], li, wid, lmath,
178  counts + order_it - 1, order_it, order) < 0)
179  break;
180  }
181 
182  /* Check if we found ARPA end-mark */
183  if (*li == NULL) {
184  E_ERROR("ARPA file ends without end-mark\n");
185  ngrams_raw_free(raw_ngrams, counts, order);
186  return NULL;
187  } else {
188  *li = lineiter_next(*li);
189  if (strcmp((*li)->buf, "\\end\\") != 0) {
190  E_WARN
191  ("Finished reading ARPA file. Expecting end mark but found '%s'\n",
192  (*li)->buf);
193  }
194  }
195 
196  return raw_ngrams;
197 }
198 
199 static void
200 read_dmp_weight_array(FILE * fp, logmath_t * lmath, uint8 do_swap,
201  int32 counts, ngram_raw_t * raw_ngrams,
202  int weight_idx)
203 {
204  int32 i, k;
205  dmp_weight_t *tmp_weight_arr;
206 
207  fread(&k, sizeof(k), 1, fp);
208  if (do_swap)
209  SWAP_INT32(&k);
210  tmp_weight_arr =
211  (dmp_weight_t *) ckd_calloc(k, sizeof(*tmp_weight_arr));
212  fread(tmp_weight_arr, sizeof(*tmp_weight_arr), k, fp);
213  for (i = 0; i < k; i++) {
214  if (do_swap)
215  SWAP_INT32(&tmp_weight_arr[i].l);
216  /* Convert values to log. */
217  tmp_weight_arr[i].f =
218  logmath_log10_to_log_float(lmath, tmp_weight_arr[i].f);
219  }
220  /* replace indexes with real probs in raw bigrams */
221  for (i = 0; i < counts; i++) {
222  if (weight_idx == 0) {
223  raw_ngrams[i].prob =
224  tmp_weight_arr[(int) raw_ngrams[i].prob].f;
225  } else {
226  raw_ngrams[i].backoff =
227  tmp_weight_arr[(int) raw_ngrams[i].backoff].f;
228  }
229  }
230  ckd_free(tmp_weight_arr);
231 }
232 
233 #define BIGRAM_SEGMENT_SIZE 9
234 
235 ngram_raw_t **
236 ngrams_raw_read_dmp(FILE * fp, logmath_t * lmath, uint32 * counts,
237  int order, uint32 * unigram_next, uint8 do_swap)
238 {
239  uint32 j, ngram_idx;
240  uint16 *bigrams_next;
241  ngram_raw_t **raw_ngrams =
242  (ngram_raw_t **) ckd_calloc(order - 1, sizeof(*raw_ngrams));
243 
244  /* read bigrams */
245  raw_ngrams[0] =
246  (ngram_raw_t *) ckd_calloc((size_t) (counts[1] + 1),
247  sizeof(*raw_ngrams[0]));
248  bigrams_next =
249  (uint16 *) ckd_calloc((size_t) (counts[1] + 1),
250  sizeof(*bigrams_next));
251  ngram_idx = 1;
252  for (j = 0; j <= (int32) counts[1]; j++) {
253  uint16 wid, prob_idx, bo_idx;
254  ngram_raw_t *raw_ngram = &raw_ngrams[0][j];
255 
256  fread(&wid, sizeof(wid), 1, fp);
257  if (do_swap)
258  SWAP_INT16(&wid);
259  raw_ngram->order = 2;
260  while (ngram_idx < counts[0] && j == unigram_next[ngram_idx]) {
261  ngram_idx++;
262  }
263 
264  if (j != counts[1]) {
265  raw_ngram->words =
266  (uint32 *) ckd_calloc(2, sizeof(*raw_ngram->words));
267  raw_ngram->words[0] = (uint32) wid;
268  raw_ngram->words[1] = (uint32) ngram_idx - 1;
269  }
270 
271  fread(&prob_idx, sizeof(prob_idx), 1, fp);
272  fread(&bo_idx, sizeof(bo_idx), 1, fp);
273  fread(&bigrams_next[j], sizeof(bigrams_next[j]), 1, fp);
274  if (do_swap) {
275  SWAP_INT16(&prob_idx);
276  SWAP_INT16(&bo_idx);
277  SWAP_INT16(&bigrams_next[j]);
278  }
279 
280  if (j != counts[1]) {
281  raw_ngram->prob = prob_idx + 0.5f; /* keep index in float. ugly but avoiding using extra memory */
282  raw_ngram->backoff = bo_idx + 0.5f;
283  }
284  }
285 
286  if (ngram_idx < counts[0]) {
287  E_ERROR("Corrupted model, not enough unigrams %d %d\n", ngram_idx, counts[0]);
288  ckd_free(bigrams_next);
289  ngrams_raw_free(raw_ngrams, counts, order);
290  return NULL;
291  }
292 
293  /* read trigrams */
294  if (order > 2) {
295  raw_ngrams[1] =
296  (ngram_raw_t *) ckd_calloc((size_t) counts[2],
297  sizeof(*raw_ngrams[1]));
298  for (j = 0; j < (int32) counts[2]; j++) {
299  uint16 wid, prob_idx;
300  ngram_raw_t *raw_ngram = &raw_ngrams[1][j];
301 
302  fread(&wid, sizeof(wid), 1, fp);
303  fread(&prob_idx, sizeof(prob_idx), 1, fp);
304  if (do_swap) {
305  SWAP_INT16(&wid);
306  SWAP_INT16(&prob_idx);
307  }
308 
309  raw_ngram->order = 3;
310  raw_ngram->words =
311  (uint32 *) ckd_calloc(3, sizeof(*raw_ngram->words));
312  raw_ngram->words[0] = (uint32) wid;
313  raw_ngram->prob = prob_idx + 0.5f; /* keep index in float. ugly but avoiding using extra memory */
314  }
315  }
316 
317  /* read prob2 */
318  read_dmp_weight_array(fp, lmath, do_swap, (int32) counts[1],
319  raw_ngrams[0], 0);
320  /* read bo2 */
321  if (order > 2) {
322  int32 k;
323  int32 *tseg_base;
324  read_dmp_weight_array(fp, lmath, do_swap, (int32) counts[1],
325  raw_ngrams[0], 1);
326  /* read prob3 */
327  read_dmp_weight_array(fp, lmath, do_swap, (int32) counts[2],
328  raw_ngrams[1], 0);
329  /* Read tseg_base size and tseg_base to fill trigram's first words */
330  fread(&k, sizeof(k), 1, fp);
331  if (do_swap)
332  SWAP_INT32(&k);
333  tseg_base = (int32 *) ckd_calloc(k, sizeof(int32));
334  fread(tseg_base, sizeof(int32), k, fp);
335  if (do_swap) {
336  for (j = 0; j < (uint32) k; j++) {
337  SWAP_INT32(&tseg_base[j]);
338  }
339  }
340  ngram_idx = 0;
341  for (j = 1; j <= counts[1]; j++) {
342  uint32 next_ngram_idx =
343  (uint32) (tseg_base[j >> BIGRAM_SEGMENT_SIZE] +
344  bigrams_next[j]);
345  while (ngram_idx < next_ngram_idx) {
346  raw_ngrams[1][ngram_idx].words[1] =
347  raw_ngrams[0][j - 1].words[0];
348  raw_ngrams[1][ngram_idx].words[2] =
349  raw_ngrams[0][j - 1].words[1];
350  ngram_idx++;
351  }
352  }
353  ckd_free(tseg_base);
354 
355  if (ngram_idx < counts[2]) {
356  E_ERROR("Corrupted model, some trigrams have no corresponding bigram\n");
357  ckd_free(bigrams_next);
358  ngrams_raw_free(raw_ngrams, counts, order);
359  return NULL;
360  }
361  }
362  ckd_free(bigrams_next);
363 
364  /* sort raw ngrams for reverse trie */
365  qsort(raw_ngrams[0], (size_t) counts[1], sizeof(*raw_ngrams[0]),
366  &ngram_ord_comparator);
367  if (order > 2) {
368  qsort(raw_ngrams[1], (size_t) counts[2], sizeof(*raw_ngrams[1]),
369  &ngram_ord_comparator);
370  }
371  return raw_ngrams;
372 }
373 
374 void
375 ngrams_raw_free(ngram_raw_t ** raw_ngrams, uint32 * counts, int order)
376 {
377  uint32 num;
378  int order_it;
379 
380  for (order_it = 0; order_it < order - 1; order_it++) {
381  for (num = 0; num < counts[order_it + 1]; num++) {
382  ckd_free(raw_ngrams[order_it][num].words);
383  }
384  ckd_free(raw_ngrams[order_it]);
385  }
386  ckd_free(raw_ngrams);
387 }
Sphinx's memory allocation/deallocation routines.
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
Definition: ckd_alloc.c:244
#define ckd_calloc(n, sz)
Macros to simplify the use of above functions.
Definition: ckd_alloc.h:248
Implementation of logging routines.
#define E_ERROR(...)
Print error message to error log.
Definition: err.h:104
#define E_WARN(...)
Print warning message to error log.
Definition: err.h:109
SPHINXBASE_EXPORT int32 hash_table_lookup_int32(hash_table_t *h, const char *key, int32 *val)
Look up a 32-bit integer value in a hash table.
Definition: hash_table.c:322
SPHINXBASE_EXPORT float logmath_log10_to_log_float(logmath_t *lmath, float64 log_p)
Convert base 10 log (in floating point) to float log in base B.
Definition: logmath.c:480
file IO related operations.
SPHINXBASE_EXPORT lineiter_t * lineiter_next(lineiter_t *li)
Move to the next line in the file.
Definition: pio.c:347
Miscellaneous useful string functions.
SPHINXBASE_EXPORT int32 str2words(char *line, char **wptr, int32 n_wptr)
Convert a line to an array of "words", based on whitespace separators.
Definition: strfuncs.c:123
SPHINXBASE_EXPORT double atof_c(char const *str)
Locale independent version of atof().
Definition: strfuncs.c:55
Line iterator for files.
Definition: pio.h:177