SphinxBase  5prealpha
sphinx_fe.c
1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3  * Copyright (c) 1996-2004 Carnegie Mellon University. All rights
4  * reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  * notice, this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright
14  * notice, this list of conditions and the following disclaimer in
15  * the documentation and/or other materials provided with the
16  * distribution.
17  *
18  * This work was supported in part by funding from the Defense Advanced
19  * Research Projects Agency and the National Science Foundation of the
20  * United States of America, and the CMU Sphinx Speech Consortium.
21  *
22  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  *
34  * ====================================================================
35  *
36  */
37 #include <stdio.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include <time.h>
41 #include <assert.h>
42 
43 #ifdef HAVE_CONFIG_H
44 #include <config.h>
45 #endif
46 
47 #include <sphinxbase/fe.h>
48 #include <sphinxbase/strfuncs.h>
49 #include <sphinxbase/pio.h>
50 #include <sphinxbase/filename.h>
51 #include <sphinxbase/cmd_ln.h>
52 #include <sphinxbase/err.h>
53 #include <sphinxbase/ckd_alloc.h>
54 #include <sphinxbase/byteorder.h>
55 #include <sphinxbase/hash_table.h>
56 
57 #include "sphinx_wave2feat.h"
58 #include "cmd_ln_defn.h"
59 
60 typedef struct audio_type_s {
61  char const *name;
62  int (*detect)(sphinx_wave2feat_t *wtf);
63  int (*decode)(sphinx_wave2feat_t *wtf);
64 } audio_type_t;
65 
66 typedef struct output_type_s {
67  char const *name;
68  int (*output_header)(sphinx_wave2feat_t *wtf, int nfloat);
69  int (*output_frames)(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr);
71 
73  int refcount;
75  fe_t *fe;
76  char *infile;
77  char *outfile;
78  FILE *infh;
79  FILE *outfh;
80  short *audio;
81  mfcc_t **feat;
82  int blocksize;
83  int featsize;
84  int veclen;
85  int in_veclen;
86  int byteswap;
87  output_type_t const *ot;
88 };
89 
91 typedef struct RIFFHeader{
92  char rifftag[4]; /* "RIFF" string */
93  int32 TotalLength; /* Total length */
94  char wavefmttag[8]; /* "WAVEfmt " string (note space after 't') */
95  int32 RemainingLength; /* Remaining length */
96  int16 data_format; /* data format tag, 1 = PCM */
97  int16 numchannels; /* Number of channels in file */
98  int32 SamplingFreq; /* Sampling frequency */
99  int32 BytesPerSec; /* Average bytes/sec */
100  int16 BlockAlign; /* Block align */
101  int16 BitsPerSample; /* 8 or 16 bit */
102  char datatag[4]; /* "data" string */
103  int32 datalength; /* Raw data length */
104 } MSWAV_hdr;
105 
111 static int
112 detect_riff(sphinx_wave2feat_t *wtf)
113 {
114  FILE *fh;
115  MSWAV_hdr hdr;
116  double samprate;
117 
118  if ((fh = fopen(wtf->infile, "rb")) == NULL) {
119  E_ERROR_SYSTEM("Failed to open %s", wtf->infile);
120  return -1;
121  }
122  if (fread(&hdr, sizeof(hdr), 1, fh) != 1) {
123  E_ERROR("Failed to read RIFF header");
124  fclose(fh);
125  return -1;
126  }
127  /* Make sure it is actually a RIFF file. */
128  if (0 != memcmp(hdr.rifftag, "RIFF", 4)) {
129  fclose(fh);
130  return FALSE;
131  }
132  if (cmd_ln_int32_r(wtf->config, "-nchans") != hdr.numchannels) {
133  E_ERROR("Number of channels %d does not match configured value in file '%s'\n", hdr.numchannels, wtf->infile);
134  fclose(fh);
135  return -1;
136  }
137  samprate = cmd_ln_float32_r(wtf->config, "-samprate");
138  if (samprate != hdr.SamplingFreq) {
139  E_ERROR("Sample rate %d does not match configured value %.1f in file '%s'\n",
140  hdr.SamplingFreq, samprate, wtf->infile);
141  fclose(fh);
142  return -1;
143  }
144  wtf->infh = fh;
145 
146  return TRUE;
147 }
148 
149 static int
150 open_nist_file(sphinx_wave2feat_t *wtf, char const *infile, FILE **out_fh, int detect_endian)
151 {
152  char nist[7];
153  lineiter_t *li;
154  FILE *fh;
155 
156  if ((fh = fopen(infile, "rb")) == NULL) {
157  E_ERROR_SYSTEM("Failed to open %s", infile);
158  return -1;
159  }
160  if (fread(&nist, 1, 7, fh) != 7) {
161  E_ERROR_SYSTEM("Failed to read NIST header");
162  fclose(fh);
163  return -1;
164  }
165  /* Is this actually a NIST file? */
166  if (0 != strncmp(nist, "NIST_1A", 7)) {
167  fclose(fh);
168  return FALSE;
169  }
170  /* Rewind, parse lines. */
171  fseek(fh, 0, SEEK_SET);
172  for (li = lineiter_start(fh); li; li = lineiter_next(li)) {
173  char **words;
174  int nword;
175 
176  string_trim(li->buf, STRING_BOTH);
177  if (strlen(li->buf) == 0) {
178  lineiter_free(li);
179  break;
180  }
181  nword = str2words(li->buf, NULL, 0);
182  if (nword != 3)
183  continue;
184  words = (char **)ckd_calloc(nword, sizeof(*words));
185  str2words(li->buf, words, nword);
186  if (0 == strcmp(words[0], "sample_rate")) {
187  float samprate = atof_c(words[2]);
188  if (cmd_ln_float32_r(wtf->config, "-samprate") != samprate) {
189  E_ERROR("Sample rate %.1f does not match configured value in file '%s'\n", samprate, infile);
190  lineiter_free(li);
191  fclose(fh);
192  return -1;
193  }
194  }
195  if (0 == strcmp(words[0], "channel_count")) {
196  int nchans = atoi(words[2]);
197  if (cmd_ln_int32_r(wtf->config, "-nchans") != nchans) {
198  E_ERROR("Number of channels %d does not match configured value in file '%s'\n", nchans, infile);
199  lineiter_free(li);
200  fclose(fh);
201  return -1;
202  }
203  }
204  if (detect_endian && 0 == strcmp(words[0], "sample_byte_format")) {
205  const char *endian = (0 == strcmp(words[2], "10")) ? "big" : "little";
206  if (0 != strcmp(cmd_ln_str_r(wtf->config, "-input_endian"), endian)) {
207  E_ERROR("Input endian %s does not match configured value in file '%s'\n", endian, infile);
208  lineiter_free(li);
209  fclose(fh);
210  return -1;
211  }
212  }
213  ckd_free(words);
214  }
215 
216  fseek(fh, 1024, SEEK_SET);
217  if (out_fh)
218  *out_fh = fh;
219  else
220  fclose(fh);
221  return TRUE;
222 }
223 
224 #ifdef HAVE_POPEN
225 static int
226 detect_sph2pipe(sphinx_wave2feat_t *wtf)
227 {
228  FILE *fh;
229  char *cmdline;
230  int rv;
231 
232  /* Determine if it's NIST file and get parameters. */
233  if ((rv = open_nist_file(wtf, wtf->infile, NULL, FALSE)) != TRUE)
234  return rv;
235 
236  /* Now popen it with sph2pipe. */
237  cmdline = string_join("sph2pipe -f raw '", wtf->infile, "'", NULL);
238  if ((fh = popen(cmdline, "r")) == NULL) {
239  E_ERROR_SYSTEM("Failed to popen(\"sph2pipe -f raw '%s'\")", wtf->infile);
240  ckd_free(cmdline);
241  return -1;
242  }
243 
244  wtf->infh = fh;
245  return TRUE;
246 }
247 #else /* !HAVE_POPEN */
248 static int
249 detect_sph2pipe(sphinx_wave2feat_t *wtf)
250 {
251  E_ERROR("popen() not available, cannot run sph2pipe\n");
252  return -1;
253 }
254 #endif /* !HAVE_POPEN */
255 
261 static int
262 detect_nist(sphinx_wave2feat_t *wtf)
263 {
264  FILE *fh;
265  int rv;
266 
267  if ((rv = open_nist_file(wtf, wtf->infile, &fh, TRUE)) != TRUE)
268  return rv;
269  wtf->infh = fh;
270 
271  return TRUE;
272 }
273 
274 
281 static int
282 detect_raw(sphinx_wave2feat_t *wtf)
283 {
284  FILE *fh;
285 
286  if ((fh = fopen(wtf->infile, "rb")) == NULL) {
287  E_ERROR_SYSTEM("Failed to open %s", wtf->infile);
288  return -1;
289  }
290  wtf->infh = fh;
291  return TRUE;
292 }
293 
300 static int
301 detect_sphinx_mfc(sphinx_wave2feat_t *wtf)
302 {
303  FILE *fh;
304  int32 len;
305  long flen;
306 
307  if ((fh = fopen(wtf->infile, "rb")) == NULL) {
308  E_ERROR_SYSTEM("Failed to open %s", wtf->infile);
309  return -1;
310  }
311  if (fread(&len, 4, 1, fh) != 1) {
312  E_ERROR_SYSTEM("Failed to read header from %s\n", wtf->infile);
313  fclose(fh);
314  return -1;
315  }
316  fseek(fh, 0, SEEK_END);
317  flen = ftell(fh);
318 
319  /* figure out whether to byteswap */
320  flen = (flen / 4) - 1;
321  if (flen != len) {
322  /* First make sure this is an endianness problem, otherwise fail. */
323  SWAP_INT32(&len);
324  if (flen != len) {
325  SWAP_INT32(&len);
326  E_ERROR("Mismatch in header/file lengths: 0x%08x vs 0x%08x\n",
327  len, flen);
328  fclose(fh);
329  return -1;
330  }
331  /* Set the input endianness to the opposite of the machine endianness... */
332  cmd_ln_set_str_r(wtf->config, "-input_endian",
333  (0 == strcmp("big", cmd_ln_str_r(wtf->config, "-mach_endian"))
334  ? "little" : "big"));
335  }
336 
337  fseek(fh, 4, SEEK_SET);
338  wtf->infh = fh;
339  if (cmd_ln_boolean_r(wtf->config, "-spec2cep")) {
340  wtf->in_veclen = cmd_ln_int32_r(wtf->config, "-nfilt");
341  }
342  else if (cmd_ln_boolean_r(wtf->config, "-cep2spec")) {
343  wtf->in_veclen = cmd_ln_int32_r(wtf->config, "-ncep");
344  wtf->veclen = cmd_ln_int32_r(wtf->config, "-nfilt");
345  }
346  else {
347  /* Should not happen. */
348  E_ERROR("Sphinx MFCC file reading requested but -spec2cep/-cep2spec not given\n");
349  assert(FALSE);
350  }
351 
352  return TRUE;
353 }
354 
355 int
356 mixnpick_channels(int16 *buf, int32 nsamp, int32 nchans, int32 whichchan)
357 {
358  int i, j;
359 
360  if (whichchan > 0) {
361  for (i = whichchan - 1; i < nsamp; i += nchans)
362  buf[i/nchans] = buf[i];
363  }
364  else {
365  for (i = 0; i < nsamp; i += nchans) {
366  float64 tmp = 0.0;
367  for (j = 0; j < nchans && i + j < nsamp; ++j) {
368  tmp += buf[i + j];
369  }
370  buf[i/nchans] = (int16)(tmp / nchans);
371  }
372  }
373  return i/nchans;
374 }
375 
380 static int
381 decode_pcm(sphinx_wave2feat_t *wtf)
382 {
383  size_t nsamp;
384  int32 n, nfr, nchans, whichchan;
385  uint32 nfloat;
386 
387  nchans = cmd_ln_int32_r(wtf->config, "-nchans");
388  whichchan = cmd_ln_int32_r(wtf->config, "-whichchan");
389  fe_start_stream(wtf->fe);
390  fe_start_utt(wtf->fe);
391  nfloat = 0;
392  while ((nsamp = fread(wtf->audio, sizeof(int16), wtf->blocksize, wtf->infh)) != 0) {
393  size_t nvec;
394  int16 const *inspeech;
395 
396  /* Byteswap stuff here if necessary. */
397  if (wtf->byteswap) {
398  for (n = 0; n < nsamp; ++n)
399  SWAP_INT16(wtf->audio + n);
400  }
401 
402  /* Mix or pick channels. */
403  if (nchans > 1)
404  nsamp = mixnpick_channels(wtf->audio, nsamp, nchans, whichchan);
405 
406  inspeech = wtf->audio;
407  nvec = wtf->featsize;
408  /* Consume all samples. */
409  while (nsamp) {
410  nfr = nvec;
411  fe_process_frames(wtf->fe, &inspeech, &nsamp, wtf->feat, &nfr, NULL);
412  if (nfr) {
413  if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
414  return -1;
415  nfloat += n;
416  }
417  }
418  inspeech = wtf->audio;
419  }
420  /* Now process any leftover audio frames. */
421  fe_end_utt(wtf->fe, wtf->feat[0], &nfr);
422  if (nfr) {
423  if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
424  return -1;
425  nfloat += n;
426  }
427 
428  if (fclose(wtf->infh) == EOF)
429  E_ERROR_SYSTEM("Failed to close input file");
430  wtf->infh = NULL;
431  return nfloat;
432 }
433 
438 static int
439 decode_sphinx_mfc(sphinx_wave2feat_t *wtf)
440 {
441  int nfloat = 0, n;
442  int featsize = wtf->featsize;
443 
444  /* If the input vector length is less than the output length, we
445  * need to do this one frame at a time, because there's empty
446  * space at the end of each vector in wtf->feat. */
447  if (wtf->in_veclen < wtf->veclen)
448  featsize = 1;
449  while ((n = fread(wtf->feat[0], sizeof(**wtf->feat),
450  featsize * wtf->in_veclen, wtf->infh)) != 0) {
451  int i, nfr = n / wtf->in_veclen;
452  if (n % wtf->in_veclen) {
453  E_ERROR("Size of file %d not a multiple of veclen %d\n",
454  n, wtf->in_veclen);
455  return -1;
456  }
457  /* Byteswap stuff here if necessary. */
458  if (wtf->byteswap) {
459  for (i = 0; i < n; ++i)
460  SWAP_FLOAT32(wtf->feat[0] + i);
461  }
462  fe_float_to_mfcc(wtf->fe, (float32 **)wtf->feat, wtf->feat, nfr);
463  for (i = 0; i < nfr; ++i) {
464  if (cmd_ln_boolean_r(wtf->config, "-spec2cep")) {
465  if (0 == strcmp(cmd_ln_str_r(wtf->config, "-transform"), "legacy"))
466  fe_logspec_to_mfcc(wtf->fe, wtf->feat[i], wtf->feat[i]);
467  else
468  fe_logspec_dct2(wtf->fe, wtf->feat[i], wtf->feat[i]);
469  }
470  else if (cmd_ln_boolean_r(wtf->config, "-cep2spec")) {
471  fe_mfcc_dct3(wtf->fe, wtf->feat[i], wtf->feat[i]);
472  }
473  }
474  if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
475  return -1;
476  nfloat += n;
477  }
478 
479  if (fclose(wtf->infh) == EOF)
480  E_ERROR_SYSTEM("Failed to close input file");
481  wtf->infh = NULL;
482  return nfloat;
483 }
484 
485 static const audio_type_t types[] = {
486  { "-mswav", &detect_riff, &decode_pcm },
487  { "-nist", &detect_nist, &decode_pcm },
488  { "-raw", &detect_raw, &decode_pcm },
489  { "-sph2pipe", &detect_sph2pipe, &decode_pcm }
490 };
491 static const int ntypes = sizeof(types)/sizeof(types[0]);
492 static const audio_type_t mfcc_type = {
493  "sphinx_mfc", &detect_sphinx_mfc, &decode_sphinx_mfc
494 };
495 
501 static int
502 output_header_sphinx(sphinx_wave2feat_t *wtf, int32 nfloat)
503 {
504  if (fwrite(&nfloat, 4, 1, wtf->outfh) != 1) {
505  E_ERROR_SYSTEM("Failed to write to %s", wtf->outfile);
506  return -1;
507  }
508  return 0;
509 }
510 
516 static int
517 output_frames_sphinx(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr)
518 {
519  int i, nfloat = 0;
520 
521  fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr);
522  for (i = 0; i < nfr; ++i) {
523  if (fwrite(frames[i], sizeof(float32), wtf->veclen, wtf->outfh) != wtf->veclen) {
524  E_ERROR_SYSTEM("Writing %d values to %s failed",
525  wtf->veclen, wtf->outfile);
526  return -1;
527  }
528  nfloat += wtf->veclen;
529  }
530  return nfloat;
531 }
532 
/* HTK parameter kinds (base codes). The values count up from 0. */
typedef enum htk_feature_kind_e {
    WAVEFORM = 0,  /* PCM audio (rarely used) */
    LPC,           /* 1: LPC filter coefficients */
    LPCREFC,       /* 2: LPC reflection coefficients */
    LPCEPSTRA,     /* 3: LPC-based cepstral coefficients */
    LPCDELCEP,     /* 4: LPCC plus deltas */
    IREFC,         /* 5: 16-bit integer LPC reflection coefficients */
    MFCC,          /* 6: MFCCs */
    FBANK,         /* 7: log mel spectrum */
    MELSPEC,       /* 8: linear mel spectrum */
    USER,          /* 9: user defined */
    DISCRETE,      /* 10: vector quantized data */
    PLP            /* 11: PLP coefficients */
} htk_feature_kind_t;
547 
/* HTK parameter kind qualifier bits, OR-ed onto a base kind. */
typedef enum htk_feature_flag_e {
    _E = 1 << 6,   /* 0000100: has energy */
    _N = 1 << 7,   /* 0000200: absolute energy suppressed */
    _D = 1 << 8,   /* 0000400: has delta coefficients */
    _A = 1 << 9,   /* 0001000: has acceleration (delta-delta) coefficients */
    _C = 1 << 10,  /* 0002000: is compressed */
    _Z = 1 << 11,  /* 0004000: has zero mean static coefficients (i.e. CMN) */
    _K = 1 << 12,  /* 0010000: has CRC checksum */
    _O = 1 << 13,  /* 0020000: has 0th cepstral coefficient */
    _V = 1 << 14,  /* 0040000: has VQ data */
    _T = 1 << 15   /* 0100000: has third differential coefficients */
} htk_feature_flag_t;
560 
564 static int
565 output_header_htk(sphinx_wave2feat_t *wtf, int32 nfloat)
566 {
567  int32 samp_period;
568  int16 samp_size;
569  int16 param_kind;
570  int swap = FALSE;
571 
572  /* HTK files are big-endian. */
573  if (0 == strcmp("little", cmd_ln_str_r(wtf->config, "-mach_endian")))
574  swap = TRUE;
575  /* Same file size thing as in Sphinx files (I think) */
576  if (swap) SWAP_INT32(&nfloat);
577  if (fwrite(&nfloat, 4, 1, wtf->outfh) != 1)
578  return -1;
579  /* Sample period in 100ns units. */
580  samp_period = (int32)(1e+7 / cmd_ln_float32_r(wtf->config, "-frate"));
581  if (swap) SWAP_INT32(&samp_period);
582  if (fwrite(&samp_period, 4, 1, wtf->outfh) != 1)
583  return -1;
584  /* Sample size - veclen * sizeof each sample. */
585  samp_size = wtf->veclen * 4;
586  if (swap) SWAP_INT16(&samp_size);
587  if (fwrite(&samp_size, 2, 1, wtf->outfh) != 1)
588  return -1;
589  /* Format and flags. */
590  if (cmd_ln_boolean_r(wtf->config, "-logspec")
591  || cmd_ln_boolean_r(wtf->config, "-cep2spec"))
592  param_kind = FBANK; /* log mel-filter bank outputs */
593  else
594  param_kind = MFCC | _O; /* MFCC + CEP0 (note reordering...) */
595  if (swap) SWAP_INT16(&param_kind);
596  if (fwrite(&param_kind, 2, 1, wtf->outfh) != 1)
597  return -1;
598 
599  return 0;
600 }
601 
605 static int
606 output_frames_htk(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr)
607 {
608  int i, j, swap, htk_reorder, nfloat = 0;
609 
610  fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr);
611  /* This is possibly inefficient, but probably not a big deal. */
612  swap = (0 == strcmp("little", cmd_ln_str_r(wtf->config, "-mach_endian")));
613  htk_reorder = (0 == strcmp("htk", wtf->ot->name)
614  && !(cmd_ln_boolean_r(wtf->config, "-logspec")
615  || cmd_ln_boolean_r(wtf->config, "-cep2spec")));
616  for (i = 0; i < nfr; ++i) {
617  if (htk_reorder) {
618  mfcc_t c0 = frames[i][0];
619  memmove(frames[i] + 1, frames[i], (wtf->veclen - 1) * 4);
620  frames[i][wtf->veclen - 1] = c0;
621  }
622  if (swap)
623  for (j = 0; j < wtf->veclen; ++j)
624  SWAP_FLOAT32(frames[i] + j);
625  if (fwrite(frames[i], sizeof(float32), wtf->veclen, wtf->outfh) != wtf->veclen) {
626  E_ERROR_SYSTEM("Writing %d values to %s failed",
627  wtf->veclen, wtf->outfile);
628  return -1;
629  }
630  nfloat += wtf->veclen;
631  }
632  return nfloat;
633 }
634 
638 static int
639 output_frames_text(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr)
640 {
641  int i, j, nfloat = 0;
642 
643  fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr);
644  for (i = 0; i < nfr; ++i) {
645  for (j = 0; j < wtf->veclen; ++j) {
646  fprintf(wtf->outfh, "%.5g", MFCC2FLOAT(frames[i][j]));
647  if (j == wtf->veclen - 1)
648  fprintf(wtf->outfh, "\n");
649  else
650  fprintf(wtf->outfh, " ");
651  }
652  nfloat += wtf->veclen;
653  }
654  return nfloat;
655 }
656 
657 static const output_type_t outtypes[] = {
658  { "sphinx", &output_header_sphinx, &output_frames_sphinx },
659  { "htk", &output_header_htk, &output_frames_htk },
660  { "text", NULL, &output_frames_text }
661 };
662 static const int nouttypes = sizeof(outtypes)/sizeof(outtypes[0]);
663 
665 sphinx_wave2feat_init(cmd_ln_t *config)
666 {
667  sphinx_wave2feat_t *wtf;
668  int i;
669 
670  wtf = (sphinx_wave2feat_t *)ckd_calloc(1, sizeof(*wtf));
671  wtf->refcount = 1;
672  wtf->config = cmd_ln_retain(config);
673  wtf->fe = fe_init_auto_r(wtf->config);
674  if (!wtf->fe) {
675  E_FATAL("Failed to create feature extraction\n");
676  }
677 
678  wtf->ot = outtypes; /* Default (sphinx) type. */
679  for (i = 0; i < nouttypes; ++i) {
680  output_type_t const *otype = &outtypes[i];
681  if (0 == strcmp(cmd_ln_str_r(config, "-ofmt"), otype->name)) {
682  wtf->ot = otype;
683  break;
684  }
685  }
686  if (i == nouttypes) {
687  E_ERROR("Unknown output type: '%s'\n",
688  cmd_ln_str_r(config, "-ofmt"));
689  sphinx_wave2feat_free(wtf);
690  return NULL;
691  }
692 
693  return wtf;
694 }
695 
696 int
697 sphinx_wave2feat_free(sphinx_wave2feat_t *wtf)
698 {
699  if (wtf == NULL)
700  return 0;
701  if (--wtf->refcount > 0)
702  return wtf->refcount;
703 
704  if (wtf->audio)
705  ckd_free(wtf->audio);
706  if (wtf->feat)
707  ckd_free_2d(wtf->feat);
708  if (wtf->infile)
709  ckd_free(wtf->infile);
710  if (wtf->outfile)
711  ckd_free(wtf->outfile);
712  if (wtf->infh) {
713  if (fclose(wtf->infh) == EOF)
714  E_ERROR_SYSTEM("Failed to close input file");
715  }
716  if (wtf->outfh) {
717  if (fclose(wtf->outfh) == EOF)
718  E_ERROR_SYSTEM("Failed to close output file");
719  }
720  cmd_ln_free_r(wtf->config);
721  fe_free(wtf->fe);
722  ckd_free(wtf);
723 
724  return 0;
725 }
726 
728 sphinx_wave2feat_retain(sphinx_wave2feat_t *wtf)
729 {
730  ++wtf->refcount;
731  return wtf;
732 }
733 
734 static audio_type_t const *
735 detect_audio_type(sphinx_wave2feat_t *wtf)
736 {
737  audio_type_t const *atype = NULL;
738  int i;
739 
740  /* Special case audio type for Sphinx MFCC inputs. */
741  if (cmd_ln_boolean_r(wtf->config, "-spec2cep")
742  || cmd_ln_boolean_r(wtf->config, "-cep2spec")) {
743  int rv = mfcc_type.detect(wtf);
744  if (rv == -1)
745  goto error_out;
746  return &mfcc_type;
747  }
748 
749  /* Try to use the type of infile given on the command line. */
750  for (i = 0; i < ntypes; ++i) {
751  int rv;
752  atype = &types[i];
753  if (cmd_ln_boolean_r(wtf->config, atype->name)) {
754  rv = (*atype->detect)(wtf);
755  if (rv == -1)
756  goto error_out;
757  else if (rv == TRUE)
758  break;
759  }
760  }
761  if (i == ntypes) {
762  /* Detect file type of infile and get parameters. */
763  for (i = 0; i < ntypes; ++i) {
764  int rv;
765  atype = &types[i];
766  rv = (*atype->detect)(wtf);
767  if (rv == -1)
768  goto error_out;
769  else if (rv == TRUE)
770  break;
771  }
772  if (i == ntypes)
773  goto error_out;
774  }
775  return atype;
776  error_out:
777  if (wtf->infh)
778  fclose(wtf->infh);
779  wtf->infh = NULL;
780  return NULL;
781 }
782 
783 int
784 sphinx_wave2feat_convert_file(sphinx_wave2feat_t *wtf,
785  char const *infile, char const *outfile)
786 {
787  int nchans, nfloat, veclen;
788  audio_type_t const *atype = NULL;
789  int fshift, fsize;
790 
791  E_INFO("Converting %s to %s\n", infile, outfile);
792 
793  wtf->infile = ckd_salloc(infile);
794 
795  /* Detect input file type. */
796  if ((atype = detect_audio_type(wtf)) == NULL)
797  return -1;
798 
799  /* Determine whether to byteswap input. */
800  wtf->byteswap = strcmp(cmd_ln_str_r(wtf->config, "-mach_endian"),
801  cmd_ln_str_r(wtf->config, "-input_endian"));
802 
803  /* Get the output frame size (if not already set). */
804  if (wtf->veclen == 0)
805  wtf->veclen = fe_get_output_size(wtf->fe);
806 
807  /* Set up the input and output buffers. */
808  fe_get_input_size(wtf->fe, &fshift, &fsize);
809  /* Want to get at least a whole frame plus shift in here. Also we
810  will either pick or mix multiple channels so we need to read
811  them all at once. */
812  nchans = cmd_ln_int32_r(wtf->config, "-nchans");
813  wtf->blocksize = cmd_ln_int32_r(wtf->config, "-blocksize") * nchans;
814  if (wtf->blocksize < (fsize + fshift) * nchans) {
815  E_INFO("Block size of %d too small, increasing to %d\n",
816  wtf->blocksize,
817  (fsize + fshift) * nchans);
818  wtf->blocksize = (fsize + fshift) * nchans;
819  }
820  wtf->audio = (short *)ckd_calloc(wtf->blocksize, sizeof(*wtf->audio));
821  wtf->featsize = (wtf->blocksize / nchans - fsize) / fshift;
822 
823  /* Use the maximum of the input and output frame sizes to allocate this. */
824  veclen = wtf->veclen;
825  if (wtf->in_veclen > veclen) veclen = wtf->in_veclen;
826 
827  wtf->feat = (mfcc_t**)ckd_calloc_2d(wtf->featsize, veclen, sizeof(**wtf->feat));
828 
829  /* Let's go! */
830  if ((wtf->outfh = fopen(outfile, "wb")) == NULL) {
831  E_ERROR_SYSTEM("Failed to open %s for writing", outfile);
832  return -1;
833  }
834  /* Write an empty header, which we'll fill in later. */
835  if (wtf->ot->output_header &&
836  (*wtf->ot->output_header)(wtf, 0) < 0) {
837  E_ERROR_SYSTEM("Failed to write empty header to %s\n", outfile);
838  goto error_out;
839  }
840  wtf->outfile = ckd_salloc(outfile);
841 
842  if ((nfloat = (*atype->decode)(wtf)) < 0) {
843  E_ERROR("Failed to convert");
844  goto error_out;
845  }
846 
847  if (wtf->ot->output_header) {
848  if (fseek(wtf->outfh, 0, SEEK_SET) < 0) {
849  E_ERROR_SYSTEM("Failed to seek to beginning of %s\n", outfile);
850  goto error_out;
851  }
852  if ((*wtf->ot->output_header)(wtf, nfloat) < 0) {
853  E_ERROR_SYSTEM("Failed to write header to %s\n", outfile);
854  goto error_out;
855  }
856  }
857 
858 
859  if (wtf->audio)
860  ckd_free(wtf->audio);
861  if (wtf->feat)
862  ckd_free_2d(wtf->feat);
863  if (wtf->infile)
864  ckd_free(wtf->infile);
865  if (wtf->outfile)
866  ckd_free(wtf->outfile);
867 
868  wtf->audio = NULL;
869  wtf->infile = NULL;
870  wtf->feat = NULL;
871  wtf->outfile = NULL;
872 
873  if (wtf->outfh)
874  if (fclose(wtf->outfh) == EOF)
875  E_ERROR_SYSTEM("Failed to close output file");
876  wtf->outfh = NULL;
877 
878  return 0;
879 
880 error_out:
881 
882  if (wtf->audio)
883  ckd_free(wtf->audio);
884  if (wtf->feat)
885  ckd_free_2d(wtf->feat);
886  if (wtf->infile)
887  ckd_free(wtf->infile);
888  if (wtf->outfile)
889  ckd_free(wtf->outfile);
890 
891  wtf->audio = NULL;
892  wtf->infile = NULL;
893  wtf->feat = NULL;
894  wtf->outfile = NULL;
895 
896  if (wtf->outfh)
897  if (fclose(wtf->outfh) == EOF)
898  E_ERROR_SYSTEM("Failed to close output file");
899  wtf->outfh = NULL;
900 
901  return -1;
902 }
903 
904 void
905 build_filenames(cmd_ln_t *config, char const *basename,
906  char **out_infile, char **out_outfile)
907 {
908  char const *di, *do_, *ei, *eo;
909 
910  di = cmd_ln_str_r(config, "-di");
911  do_ = cmd_ln_str_r(config, "-do");
912  ei = cmd_ln_str_r(config, "-ei");
913  eo = cmd_ln_str_r(config, "-eo");
914 
915  *out_infile = string_join(di ? di : "",
916  di ? "/" : "",
917  basename,
918  ei ? "." : "",
919  ei ? ei : "",
920  NULL);
921  *out_outfile = string_join(do_ ? do_ : "",
922  do_ ? "/" : "",
923  basename,
924  eo ? "." : "",
925  eo ? eo : "",
926  NULL);
927  /* Build output directory structure if possible/requested (it is
928  * by default). */
929  if (cmd_ln_boolean_r(config, "-build_outdirs")) {
930  char *dirname = ckd_salloc(*out_outfile);
931  path2dirname(*out_outfile, dirname);
932  build_directory(dirname);
933  ckd_free(dirname);
934  }
935 }
936 
937 static int
938 run_control_file(sphinx_wave2feat_t *wtf, char const *ctlfile)
939 {
940  hash_table_t *files;
941  hash_iter_t *itor;
942  lineiter_t *li;
943  FILE *ctlfh;
944  int nskip, runlen, npart;
945 
946  if ((ctlfh = fopen(ctlfile, "r")) == NULL) {
947  E_ERROR_SYSTEM("Failed to open control file %s", ctlfile);
948  return -1;
949  }
950  nskip = cmd_ln_int32_r(wtf->config, "-nskip");
951  runlen = cmd_ln_int32_r(wtf->config, "-runlen");
952  if ((npart = cmd_ln_int32_r(wtf->config, "-npart"))) {
953  /* Count lines in the file. */
954  int partlen, part, nlines = 0;
955  part = cmd_ln_int32_r(wtf->config, "-part");
956  for (li = lineiter_start(ctlfh); li; li = lineiter_next(li))
957  ++nlines;
958  fseek(ctlfh, 0, SEEK_SET);
959  partlen = nlines / npart;
960  nskip = partlen * (part - 1);
961  if (part == npart)
962  runlen = -1;
963  else
964  runlen = partlen;
965  }
966  if (runlen != -1){
967  E_INFO("Processing %d utterances at position %d\n", runlen, nskip);
968  files = hash_table_new(runlen, HASH_CASE_YES);
969  }
970  else {
971  E_INFO("Processing all remaining utterances at position %d\n", nskip);
972  files = hash_table_new(1000, HASH_CASE_YES);
973  }
974  for (li = lineiter_start(ctlfh); li; li = lineiter_next(li)) {
975  char *c, *infile, *outfile;
976 
977  if (nskip-- > 0)
978  continue;
979  if (runlen == 0) {
980  lineiter_free(li);
981  break;
982  }
983  --runlen;
984 
985  string_trim(li->buf, STRING_BOTH);
986  /* Extract the file ID from the control line. */
987  if ((c = strchr(li->buf, ' ')) != NULL)
988  *c = '\0';
989  if (strlen(li->buf) == 0) {
990  E_WARN("Empty line %d in control file, skipping\n", li->lineno);
991  continue;
992  }
993  build_filenames(wtf->config, li->buf, &infile, &outfile);
994  if (hash_table_lookup(files, infile, NULL) == 0)
995  continue;
996  sphinx_wave2feat_convert_file(wtf, infile, outfile);
997  hash_table_enter(files, infile, outfile);
998  }
999  for (itor = hash_table_iter(files); itor;
1000  itor = hash_table_iter_next(itor)) {
1001  ckd_free((void *)hash_entry_key(itor->ent));
1002  ckd_free(hash_entry_val(itor->ent));
1003  }
1004  hash_table_free(files);
1005  fclose(ctlfh);
1006 
1007  return 0;
1008 }
1009 
1010 int
1011 main(int argc, char *argv[])
1012 {
1013  sphinx_wave2feat_t *wtf;
1014  cmd_ln_t *config;
1015  int rv;
1016 
1017  config = cmd_ln_parse_r(NULL, defn, argc, argv, TRUE);
1018 
1019  if (config && cmd_ln_str_r(config, "-argfile"))
1020  config = cmd_ln_parse_file_r(config, defn,
1021  cmd_ln_str_r(config, "-argfile"), FALSE);
1022  if (config == NULL) {
1023  E_ERROR("Command line parsing failed\n");
1024  return 1;
1025  }
1026 
1027  if ((wtf = sphinx_wave2feat_init(config)) == NULL) {
1028  E_ERROR("Failed to initialize wave2feat object\n");
1029  return 1;
1030  }
1031 
1032  /* If there's a control file run through it, otherwise we will do
1033  * a single file (which is what run_control_file will do
1034  * internally too) */
1035  if (cmd_ln_str_r(config, "-c"))
1036  rv = run_control_file(wtf, cmd_ln_str_r(config, "-c"));
1037  else
1038  rv = sphinx_wave2feat_convert_file(wtf, cmd_ln_str_r(config, "-i"),
1039  cmd_ln_str_r(config, "-o"));
1040 
1041  sphinx_wave2feat_free(wtf);
1042  cmd_ln_free_r(config);
1043  return rv;
1044 }
Sphinx's memory allocation/deallocation routines.
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
Definition: ckd_alloc.c:244
SPHINXBASE_EXPORT void ckd_free_2d(void *ptr)
Free a 2-D array (ptr) previously allocated by ckd_calloc_2d.
Definition: ckd_alloc.c:255
#define ckd_calloc_2d(d1, d2, sz)
Macro for ckd_calloc_2d
Definition: ckd_alloc.h:270
#define ckd_calloc(n, sz)
Macros to simplify the use of above functions.
Definition: ckd_alloc.h:248
#define ckd_salloc(ptr)
Macro for ckd_salloc
Definition: ckd_alloc.h:264
Command-line and other configuration parsing and handling.
#define cmd_ln_boolean_r(c, n)
Retrieve a boolean value from a command-line object.
Definition: cmd_ln.h:334
SPHINXBASE_EXPORT int cmd_ln_free_r(cmd_ln_t *cmdln)
Release a command-line argument set and all associated strings.
Definition: cmd_ln.c:1046
SPHINXBASE_EXPORT void cmd_ln_set_str_r(cmd_ln_t *cmdln, char const *name, char const *str)
Set a string in a command-line object.
Definition: cmd_ln.c:989
SPHINXBASE_EXPORT cmd_ln_t * cmd_ln_parse_file_r(cmd_ln_t *inout_cmdln, arg_t const *defn, char const *filename, int32 strict)
Parse an arguments file by delimiting on " \r\t\n" and putting each token into an argv[] for cmd_l...
Definition: cmd_ln.c:764
SPHINXBASE_EXPORT cmd_ln_t * cmd_ln_retain(cmd_ln_t *cmdln)
Retain ownership of a command-line argument set.
Definition: cmd_ln.c:1039
SPHINXBASE_EXPORT cmd_ln_t * cmd_ln_parse_r(cmd_ln_t *inout_cmdln, arg_t const *defn, int32 argc, char *argv[], int32 strict)
Parse a list of strings into arguments.
Definition: cmd_ln.c:556
SPHINXBASE_EXPORT char const * cmd_ln_str_r(cmd_ln_t *cmdln, char const *name)
Retrieve a string from a command-line object.
Definition: cmd_ln.c:949
Implementation of logging routines.
#define E_ERROR(...)
Print error message to error log.
Definition: err.h:104
#define E_INFO(...)
Print logging information to standard error stream.
Definition: err.h:114
#define E_FATAL(...)
Exit with non-zero status after error message.
Definition: err.h:81
#define E_ERROR_SYSTEM(...)
Print error text; Call perror("");.
Definition: err.h:99
#define E_WARN(...)
Print warning message to error log.
Definition: err.h:109
File names related operation.
SPHINXBASE_EXPORT void path2dirname(const char *path, char *dir)
Strip off filename from the given path and copy the directory name into dir Caller must have allocate...
Definition: filename.c:68
Hash table implementation.
SPHINXBASE_EXPORT void hash_table_free(hash_table_t *h)
Free the specified hash table; the caller is responsible for freeing the key strings pointed to by th...
Definition: hash_table.c:688
SPHINXBASE_EXPORT hash_table_t * hash_table_new(int32 size, int32 casearg)
Allocate a new hash table for a given expected size.
Definition: hash_table.c:158
SPHINXBASE_EXPORT int32 hash_table_lookup(hash_table_t *h, const char *key, void **val)
Look up a key in a hash table and optionally return the associated value.
Definition: hash_table.c:302
SPHINXBASE_EXPORT hash_iter_t * hash_table_iter(hash_table_t *h)
Start iterating over key-value pairs in a hash table.
Definition: hash_table.c:646
#define hash_entry_val(e)
Access macros.
Definition: hash_table.h:175
SPHINXBASE_EXPORT hash_iter_t * hash_table_iter_next(hash_iter_t *itor)
Get the next key-value pair in iteration.
Definition: hash_table.c:656
SPHINXBASE_EXPORT void * hash_table_enter(hash_table_t *h, const char *key, void *val)
Try to add a new entry with given key and associated value to hash table h.
Definition: hash_table.c:501
file IO related operations.
SPHINXBASE_EXPORT lineiter_t * lineiter_start(FILE *fh)
Start reading lines from a file.
Definition: pio.c:264
SPHINXBASE_EXPORT void lineiter_free(lineiter_t *li)
Stop reading lines from a file.
Definition: pio.c:368
SPHINXBASE_EXPORT int build_directory(const char *path)
Create a directory and all of its parent directories, as needed.
Definition: pio.c:621
SPHINXBASE_EXPORT lineiter_t * lineiter_next(lineiter_t *li)
Move to the next line in the file.
Definition: pio.c:347
Miscellaneous useful string functions.
SPHINXBASE_EXPORT char * string_join(const char *base,...)
Concatenate a NULL-terminated argument list of strings, returning a newly allocated string.
Definition: strfuncs.c:70
SPHINXBASE_EXPORT int32 str2words(char *line, char **wptr, int32 n_wptr)
Convert a line to an array of "words", based on whitespace separators.
Definition: strfuncs.c:123
@ STRING_BOTH
Both ends of string.
Definition: strfuncs.h:73
SPHINXBASE_EXPORT double atof_c(char const *str)
Locale independent version of atof().
Definition: strfuncs.c:55
SPHINXBASE_EXPORT char * string_trim(char *string, enum string_edge_e which)
Remove whitespace from a string, modifying it in-place.
Definition: strfuncs.c:97
RIFF 44-byte header structure for MS wav files.
Definition: sphinx_fe.c:91
Opaque structure used to hold the results of command-line parsing.
Structure for the front-end computation.
Definition: fe_internal.h:117
hash_entry_t * ent
Current entry in that table.
Definition: hash_table.h:170
Line iterator for files.
Definition: pio.h:177
int byteswap
Whether byteswapping is necessary.
Definition: sphinx_fe.c:86
int in_veclen
Length of each input vector (for cep<->spec).
Definition: sphinx_fe.c:85
cmd_ln_t * config
Configuration parameters.
Definition: sphinx_fe.c:74
fe_t * fe
Front end object.
Definition: sphinx_fe.c:75
char * infile
Path to input file.
Definition: sphinx_fe.c:76
short * audio
Audio buffer.
Definition: sphinx_fe.c:80
output_type_t const * ot
Output type object.
Definition: sphinx_fe.c:87
char * outfile
Path to output file.
Definition: sphinx_fe.c:77
mfcc_t ** feat
Feature buffer.
Definition: sphinx_fe.c:81
int featsize
Size of feature buffer.
Definition: sphinx_fe.c:83
int veclen
Length of each output vector.
Definition: sphinx_fe.c:84
FILE * outfh
Output file handle.
Definition: sphinx_fe.c:79
FILE * infh
Input file handle.
Definition: sphinx_fe.c:78
int refcount
Reference count.
Definition: sphinx_fe.c:73
int blocksize
Size of audio buffer.
Definition: sphinx_fe.c:82