SphinxBase 5prealpha
sphinx_fe.c
1/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2/* ====================================================================
3 * Copyright (c) 1996-2004 Carnegie Mellon University. All rights
4 * reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * This work was supported in part by funding from the Defense Advanced
19 * Research Projects Agency and the National Science Foundation of the
20 * United States of America, and the CMU Sphinx Speech Consortium.
21 *
22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 *
34 * ====================================================================
35 *
36 */
37#include <stdio.h>
38#include <stdlib.h>
39#include <string.h>
40#include <time.h>
41#include <assert.h>
42
43#ifdef HAVE_CONFIG_H
44#include <config.h>
45#endif
46
47#include <sphinxbase/fe.h>
48#include <sphinxbase/strfuncs.h>
49#include <sphinxbase/pio.h>
50#include <sphinxbase/filename.h>
51#include <sphinxbase/cmd_ln.h>
52#include <sphinxbase/err.h>
54#include <sphinxbase/byteorder.h>
56
57#include "sphinx_wave2feat.h"
58#include "cmd_ln_defn.h"
59
60typedef struct audio_type_s {
61 char const *name;
62 int (*detect)(sphinx_wave2feat_t *wtf);
63 int (*decode)(sphinx_wave2feat_t *wtf);
65
66typedef struct output_type_s {
67 char const *name;
68 int (*output_header)(sphinx_wave2feat_t *wtf, int nfloat);
69 int (*output_frames)(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr);
71
76 char *infile;
77 char *outfile;
78 FILE *infh;
79 FILE *outfh;
80 short *audio;
81 mfcc_t **feat;
84 int veclen;
88};
89
91typedef struct RIFFHeader{
92 char rifftag[4]; /* "RIFF" string */
93 int32 TotalLength; /* Total length */
94 char wavefmttag[8]; /* "WAVEfmt " string (note space after 't') */
95 int32 RemainingLength; /* Remaining length */
96 int16 data_format; /* data format tag, 1 = PCM */
97 int16 numchannels; /* Number of channels in file */
98 int32 SamplingFreq; /* Sampling frequency */
99 int32 BytesPerSec; /* Average bytes/sec */
100 int16 BlockAlign; /* Block align */
101 int16 BitsPerSample; /* 8 or 16 bit */
102 char datatag[4]; /* "data" string */
103 int32 datalength; /* Raw data length */
104} MSWAV_hdr;
105
111static int
112detect_riff(sphinx_wave2feat_t *wtf)
113{
114 FILE *fh;
115 MSWAV_hdr hdr;
116 double samprate;
117
118 if ((fh = fopen(wtf->infile, "rb")) == NULL) {
119 E_ERROR_SYSTEM("Failed to open %s", wtf->infile);
120 return -1;
121 }
122 if (fread(&hdr, sizeof(hdr), 1, fh) != 1) {
123 E_ERROR("Failed to read RIFF header");
124 fclose(fh);
125 return -1;
126 }
127 /* Make sure it is actually a RIFF file. */
128 if (0 != memcmp(hdr.rifftag, "RIFF", 4)) {
129 fclose(fh);
130 return FALSE;
131 }
132 if (cmd_ln_int32_r(wtf->config, "-nchans") != hdr.numchannels) {
133 E_ERROR("Number of channels %d does not match configured value in file '%s'\n", hdr.numchannels, wtf->infile);
134 fclose(fh);
135 return -1;
136 }
137 samprate = cmd_ln_float32_r(wtf->config, "-samprate");
138 if (samprate != hdr.SamplingFreq) {
139 E_ERROR("Sample rate %d does not match configured value %.1f in file '%s'\n",
140 hdr.SamplingFreq, samprate, wtf->infile);
141 fclose(fh);
142 return -1;
143 }
144 wtf->infh = fh;
145
146 return TRUE;
147}
148
149static int
150open_nist_file(sphinx_wave2feat_t *wtf, char const *infile, FILE **out_fh, int detect_endian)
151{
152 char nist[7];
153 lineiter_t *li;
154 FILE *fh;
155
156 if ((fh = fopen(infile, "rb")) == NULL) {
157 E_ERROR_SYSTEM("Failed to open %s", infile);
158 return -1;
159 }
160 if (fread(&nist, 1, 7, fh) != 7) {
161 E_ERROR_SYSTEM("Failed to read NIST header");
162 fclose(fh);
163 return -1;
164 }
165 /* Is this actually a NIST file? */
166 if (0 != strncmp(nist, "NIST_1A", 7)) {
167 fclose(fh);
168 return FALSE;
169 }
170 /* Rewind, parse lines. */
171 fseek(fh, 0, SEEK_SET);
172 for (li = lineiter_start(fh); li; li = lineiter_next(li)) {
173 char **words;
174 int nword;
175
176 string_trim(li->buf, STRING_BOTH);
177 if (strlen(li->buf) == 0) {
178 lineiter_free(li);
179 break;
180 }
181 nword = str2words(li->buf, NULL, 0);
182 if (nword != 3)
183 continue;
184 words = (char **)ckd_calloc(nword, sizeof(*words));
185 str2words(li->buf, words, nword);
186 if (0 == strcmp(words[0], "sample_rate")) {
187 float samprate = atof_c(words[2]);
188 if (cmd_ln_float32_r(wtf->config, "-samprate") != samprate) {
189 E_ERROR("Sample rate %.1f does not match configured value in file '%s'\n", samprate, infile);
190 lineiter_free(li);
191 fclose(fh);
192 return -1;
193 }
194 }
195 if (0 == strcmp(words[0], "channel_count")) {
196 int nchans = atoi(words[2]);
197 if (cmd_ln_int32_r(wtf->config, "-nchans") != nchans) {
198 E_ERROR("Number of channels %d does not match configured value in file '%s'\n", nchans, infile);
199 lineiter_free(li);
200 fclose(fh);
201 return -1;
202 }
203 }
204 if (detect_endian && 0 == strcmp(words[0], "sample_byte_format")) {
205 const char *endian = (0 == strcmp(words[2], "10")) ? "big" : "little";
206 if (0 != strcmp(cmd_ln_str_r(wtf->config, "-input_endian"), endian)) {
207 E_ERROR("Input endian %s does not match configured value in file '%s'\n", endian, infile);
208 lineiter_free(li);
209 fclose(fh);
210 return -1;
211 }
212 }
213 ckd_free(words);
214 }
215
216 fseek(fh, 1024, SEEK_SET);
217 if (out_fh)
218 *out_fh = fh;
219 else
220 fclose(fh);
221 return TRUE;
222}
223
224#ifdef HAVE_POPEN
225static int
226detect_sph2pipe(sphinx_wave2feat_t *wtf)
227{
228 FILE *fh;
229 char *cmdline;
230 int rv;
231
232 /* Determine if it's NIST file and get parameters. */
233 if ((rv = open_nist_file(wtf, wtf->infile, NULL, FALSE)) != TRUE)
234 return rv;
235
236 /* Now popen it with sph2pipe. */
237 cmdline = string_join("sph2pipe -f raw '", wtf->infile, "'", NULL);
238 if ((fh = popen(cmdline, "r")) == NULL) {
239 E_ERROR_SYSTEM("Failed to popen(\"sph2pipe -f raw '%s'\")", wtf->infile);
240 ckd_free(cmdline);
241 return -1;
242 }
243
244 wtf->infh = fh;
245 return TRUE;
246}
247#else /* !HAVE_POPEN */
248static int
249detect_sph2pipe(sphinx_wave2feat_t *wtf)
250{
251 E_ERROR("popen() not available, cannot run sph2pipe\n");
252 return -1;
253}
254#endif /* !HAVE_POPEN */
255
261static int
262detect_nist(sphinx_wave2feat_t *wtf)
263{
264 FILE *fh;
265 int rv;
266
267 if ((rv = open_nist_file(wtf, wtf->infile, &fh, TRUE)) != TRUE)
268 return rv;
269 wtf->infh = fh;
270
271 return TRUE;
272}
273
274
281static int
282detect_raw(sphinx_wave2feat_t *wtf)
283{
284 FILE *fh;
285
286 if ((fh = fopen(wtf->infile, "rb")) == NULL) {
287 E_ERROR_SYSTEM("Failed to open %s", wtf->infile);
288 return -1;
289 }
290 wtf->infh = fh;
291 return TRUE;
292}
293
300static int
301detect_sphinx_mfc(sphinx_wave2feat_t *wtf)
302{
303 FILE *fh;
304 int32 len;
305 long flen;
306
307 if ((fh = fopen(wtf->infile, "rb")) == NULL) {
308 E_ERROR_SYSTEM("Failed to open %s", wtf->infile);
309 return -1;
310 }
311 if (fread(&len, 4, 1, fh) != 1) {
312 E_ERROR_SYSTEM("Failed to read header from %s\n", wtf->infile);
313 fclose(fh);
314 return -1;
315 }
316 fseek(fh, 0, SEEK_END);
317 flen = ftell(fh);
318
319 /* figure out whether to byteswap */
320 flen = (flen / 4) - 1;
321 if (flen != len) {
322 /* First make sure this is an endianness problem, otherwise fail. */
323 SWAP_INT32(&len);
324 if (flen != len) {
325 SWAP_INT32(&len);
326 E_ERROR("Mismatch in header/file lengths: 0x%08x vs 0x%08x\n",
327 len, flen);
328 fclose(fh);
329 return -1;
330 }
331 /* Set the input endianness to the opposite of the machine endianness... */
332 cmd_ln_set_str_r(wtf->config, "-input_endian",
333 (0 == strcmp("big", cmd_ln_str_r(wtf->config, "-mach_endian"))
334 ? "little" : "big"));
335 }
336
337 fseek(fh, 4, SEEK_SET);
338 wtf->infh = fh;
339 if (cmd_ln_boolean_r(wtf->config, "-spec2cep")) {
340 wtf->in_veclen = cmd_ln_int32_r(wtf->config, "-nfilt");
341 }
342 else if (cmd_ln_boolean_r(wtf->config, "-cep2spec")) {
343 wtf->in_veclen = cmd_ln_int32_r(wtf->config, "-ncep");
344 wtf->veclen = cmd_ln_int32_r(wtf->config, "-nfilt");
345 }
346 else {
347 /* Should not happen. */
348 E_ERROR("Sphinx MFCC file reading requested but -spec2cep/-cep2spec not given\n");
349 assert(FALSE);
350 }
351
352 return TRUE;
353}
354
355int
356mixnpick_channels(int16 *buf, int32 nsamp, int32 nchans, int32 whichchan)
357{
358 int i, j;
359
360 if (whichchan > 0) {
361 for (i = whichchan - 1; i < nsamp; i += nchans)
362 buf[i/nchans] = buf[i];
363 }
364 else {
365 for (i = 0; i < nsamp; i += nchans) {
366 float64 tmp = 0.0;
367 for (j = 0; j < nchans && i + j < nsamp; ++j) {
368 tmp += buf[i + j];
369 }
370 buf[i/nchans] = (int16)(tmp / nchans);
371 }
372 }
373 return i/nchans;
374}
375
380static int
381decode_pcm(sphinx_wave2feat_t *wtf)
382{
383 size_t nsamp;
384 int32 n, nfr, nchans, whichchan;
385 uint32 nfloat;
386
387 nchans = cmd_ln_int32_r(wtf->config, "-nchans");
388 whichchan = cmd_ln_int32_r(wtf->config, "-whichchan");
389 fe_start_stream(wtf->fe);
390 fe_start_utt(wtf->fe);
391 nfloat = 0;
392 while ((nsamp = fread(wtf->audio, sizeof(int16), wtf->blocksize, wtf->infh)) != 0) {
393 size_t nvec;
394 int16 const *inspeech;
395
396 /* Byteswap stuff here if necessary. */
397 if (wtf->byteswap) {
398 for (n = 0; n < nsamp; ++n)
399 SWAP_INT16(wtf->audio + n);
400 }
401
402 /* Mix or pick channels. */
403 if (nchans > 1)
404 nsamp = mixnpick_channels(wtf->audio, nsamp, nchans, whichchan);
405
406 inspeech = wtf->audio;
407 nvec = wtf->featsize;
408 /* Consume all samples. */
409 while (nsamp) {
410 nfr = nvec;
411 fe_process_frames(wtf->fe, &inspeech, &nsamp, wtf->feat, &nfr, NULL);
412 if (nfr) {
413 if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
414 return -1;
415 nfloat += n;
416 }
417 }
418 inspeech = wtf->audio;
419 }
420 /* Now process any leftover audio frames. */
421 fe_end_utt(wtf->fe, wtf->feat[0], &nfr);
422 if (nfr) {
423 if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
424 return -1;
425 nfloat += n;
426 }
427
428 if (fclose(wtf->infh) == EOF)
429 E_ERROR_SYSTEM("Failed to close input file");
430 wtf->infh = NULL;
431 return nfloat;
432}
433
438static int
439decode_sphinx_mfc(sphinx_wave2feat_t *wtf)
440{
441 int nfloat = 0, n;
442 int featsize = wtf->featsize;
443
444 /* If the input vector length is less than the output length, we
445 * need to do this one frame at a time, because there's empty
446 * space at the end of each vector in wtf->feat. */
447 if (wtf->in_veclen < wtf->veclen)
448 featsize = 1;
449 while ((n = fread(wtf->feat[0], sizeof(**wtf->feat),
450 featsize * wtf->in_veclen, wtf->infh)) != 0) {
451 int i, nfr = n / wtf->in_veclen;
452 if (n % wtf->in_veclen) {
453 E_ERROR("Size of file %d not a multiple of veclen %d\n",
454 n, wtf->in_veclen);
455 return -1;
456 }
457 /* Byteswap stuff here if necessary. */
458 if (wtf->byteswap) {
459 for (i = 0; i < n; ++i)
460 SWAP_FLOAT32(wtf->feat[0] + i);
461 }
462 fe_float_to_mfcc(wtf->fe, (float32 **)wtf->feat, wtf->feat, nfr);
463 for (i = 0; i < nfr; ++i) {
464 if (cmd_ln_boolean_r(wtf->config, "-spec2cep")) {
465 if (0 == strcmp(cmd_ln_str_r(wtf->config, "-transform"), "legacy"))
466 fe_logspec_to_mfcc(wtf->fe, wtf->feat[i], wtf->feat[i]);
467 else
468 fe_logspec_dct2(wtf->fe, wtf->feat[i], wtf->feat[i]);
469 }
470 else if (cmd_ln_boolean_r(wtf->config, "-cep2spec")) {
471 fe_mfcc_dct3(wtf->fe, wtf->feat[i], wtf->feat[i]);
472 }
473 }
474 if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
475 return -1;
476 nfloat += n;
477 }
478
479 if (fclose(wtf->infh) == EOF)
480 E_ERROR_SYSTEM("Failed to close input file");
481 wtf->infh = NULL;
482 return nfloat;
483}
484
485static const audio_type_t types[] = {
486 { "-mswav", &detect_riff, &decode_pcm },
487 { "-nist", &detect_nist, &decode_pcm },
488 { "-raw", &detect_raw, &decode_pcm },
489 { "-sph2pipe", &detect_sph2pipe, &decode_pcm }
490};
491static const int ntypes = sizeof(types)/sizeof(types[0]);
492static const audio_type_t mfcc_type = {
493 "sphinx_mfc", &detect_sphinx_mfc, &decode_sphinx_mfc
494};
495
501static int
502output_header_sphinx(sphinx_wave2feat_t *wtf, int32 nfloat)
503{
504 if (fwrite(&nfloat, 4, 1, wtf->outfh) != 1) {
505 E_ERROR_SYSTEM("Failed to write to %s", wtf->outfile);
506 return -1;
507 }
508 return 0;
509}
510
516static int
517output_frames_sphinx(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr)
518{
519 int i, nfloat = 0;
520
521 fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr);
522 for (i = 0; i < nfr; ++i) {
523 if (fwrite(frames[i], sizeof(float32), wtf->veclen, wtf->outfh) != wtf->veclen) {
524 E_ERROR_SYSTEM("Writing %d values to %s failed",
525 wtf->veclen, wtf->outfile);
526 return -1;
527 }
528 nfloat += wtf->veclen;
529 }
530 return nfloat;
531}
532
/**
 * Base parameter kinds for the "parameter kind" field of an HTK header,
 * as written by output_header_htk().
 */
typedef enum htk_feature_kind_e {
    WAVEFORM  = 0,   /* PCM audio (rarely used) */
    LPC       = 1,   /* LPC filter coefficients */
    LPCREFC   = 2,   /* LPC reflection coefficients */
    LPCEPSTRA = 3,   /* LPC-based cepstral coefficients */
    LPCDELCEP = 4,   /* LPC cepstra plus deltas */
    IREFC     = 5,   /* LPC reflection coefficients as 16-bit integers */
    MFCC      = 6,   /* Mel-frequency cepstral coefficients */
    FBANK     = 7,   /* Log mel spectrum */
    MELSPEC   = 8,   /* Linear mel spectrum */
    USER      = 9,   /* User defined */
    DISCRETE  = 10,  /* Vector quantized data */
    PLP       = 11   /* PLP coefficients */
} htk_feature_kind_t;
547
/**
 * Qualifier bits that are OR-ed onto an htk_feature_kind_t value in the
 * "parameter kind" field of an HTK header.  The values are the same as
 * before, written in hexadecimal with the octal form in each comment.
 */
typedef enum htk_feature_flag_e {
    _E = 0x0040,  /* 0000100: has energy */
    _N = 0x0080,  /* 0000200: absolute energy suppressed */
    _D = 0x0100,  /* 0000400: has delta coefficients */
    _A = 0x0200,  /* 0001000: has acceleration (delta-delta) coefficients */
    _C = 0x0400,  /* 0002000: is compressed */
    _Z = 0x0800,  /* 0004000: has zero mean static coefficients (i.e. CMN) */
    _K = 0x1000,  /* 0010000: has CRC checksum */
    _O = 0x2000,  /* 0020000: has 0th cepstral coefficient */
    _V = 0x4000,  /* 0040000: has VQ data */
    _T = 0x8000   /* 0100000: has third differential coefficients */
} htk_feature_flag_t;
560
564static int
565output_header_htk(sphinx_wave2feat_t *wtf, int32 nfloat)
566{
567 int32 samp_period;
568 int16 samp_size;
569 int16 param_kind;
570 int swap = FALSE;
571
572 /* HTK files are big-endian. */
573 if (0 == strcmp("little", cmd_ln_str_r(wtf->config, "-mach_endian")))
574 swap = TRUE;
575 /* Same file size thing as in Sphinx files (I think) */
576 if (swap) SWAP_INT32(&nfloat);
577 if (fwrite(&nfloat, 4, 1, wtf->outfh) != 1)
578 return -1;
579 /* Sample period in 100ns units. */
580 samp_period = (int32)(1e+7 / cmd_ln_float32_r(wtf->config, "-frate"));
581 if (swap) SWAP_INT32(&samp_period);
582 if (fwrite(&samp_period, 4, 1, wtf->outfh) != 1)
583 return -1;
584 /* Sample size - veclen * sizeof each sample. */
585 samp_size = wtf->veclen * 4;
586 if (swap) SWAP_INT16(&samp_size);
587 if (fwrite(&samp_size, 2, 1, wtf->outfh) != 1)
588 return -1;
589 /* Format and flags. */
590 if (cmd_ln_boolean_r(wtf->config, "-logspec")
591 || cmd_ln_boolean_r(wtf->config, "-cep2spec"))
592 param_kind = FBANK; /* log mel-filter bank outputs */
593 else
594 param_kind = MFCC | _O; /* MFCC + CEP0 (note reordering...) */
595 if (swap) SWAP_INT16(&param_kind);
596 if (fwrite(&param_kind, 2, 1, wtf->outfh) != 1)
597 return -1;
598
599 return 0;
600}
601
605static int
606output_frames_htk(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr)
607{
608 int i, j, swap, htk_reorder, nfloat = 0;
609
610 fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr);
611 /* This is possibly inefficient, but probably not a big deal. */
612 swap = (0 == strcmp("little", cmd_ln_str_r(wtf->config, "-mach_endian")));
613 htk_reorder = (0 == strcmp("htk", wtf->ot->name)
614 && !(cmd_ln_boolean_r(wtf->config, "-logspec")
615 || cmd_ln_boolean_r(wtf->config, "-cep2spec")));
616 for (i = 0; i < nfr; ++i) {
617 if (htk_reorder) {
618 mfcc_t c0 = frames[i][0];
619 memmove(frames[i] + 1, frames[i], (wtf->veclen - 1) * 4);
620 frames[i][wtf->veclen - 1] = c0;
621 }
622 if (swap)
623 for (j = 0; j < wtf->veclen; ++j)
624 SWAP_FLOAT32(frames[i] + j);
625 if (fwrite(frames[i], sizeof(float32), wtf->veclen, wtf->outfh) != wtf->veclen) {
626 E_ERROR_SYSTEM("Writing %d values to %s failed",
627 wtf->veclen, wtf->outfile);
628 return -1;
629 }
630 nfloat += wtf->veclen;
631 }
632 return nfloat;
633}
634
638static int
639output_frames_text(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr)
640{
641 int i, j, nfloat = 0;
642
643 fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr);
644 for (i = 0; i < nfr; ++i) {
645 for (j = 0; j < wtf->veclen; ++j) {
646 fprintf(wtf->outfh, "%.5g", MFCC2FLOAT(frames[i][j]));
647 if (j == wtf->veclen - 1)
648 fprintf(wtf->outfh, "\n");
649 else
650 fprintf(wtf->outfh, " ");
651 }
652 nfloat += wtf->veclen;
653 }
654 return nfloat;
655}
656
657static const output_type_t outtypes[] = {
658 { "sphinx", &output_header_sphinx, &output_frames_sphinx },
659 { "htk", &output_header_htk, &output_frames_htk },
660 { "text", NULL, &output_frames_text }
661};
662static const int nouttypes = sizeof(outtypes)/sizeof(outtypes[0]);
663
665sphinx_wave2feat_init(cmd_ln_t *config)
666{
668 int i;
669
670 wtf = (sphinx_wave2feat_t *)ckd_calloc(1, sizeof(*wtf));
671 wtf->refcount = 1;
672 wtf->config = cmd_ln_retain(config);
673 wtf->fe = fe_init_auto_r(wtf->config);
674 if (!wtf->fe) {
675 E_FATAL("Failed to create feature extraction\n");
676 }
677
678 wtf->ot = outtypes; /* Default (sphinx) type. */
679 for (i = 0; i < nouttypes; ++i) {
680 output_type_t const *otype = &outtypes[i];
681 if (0 == strcmp(cmd_ln_str_r(config, "-ofmt"), otype->name)) {
682 wtf->ot = otype;
683 break;
684 }
685 }
686 if (i == nouttypes) {
687 E_ERROR("Unknown output type: '%s'\n",
688 cmd_ln_str_r(config, "-ofmt"));
689 sphinx_wave2feat_free(wtf);
690 return NULL;
691 }
692
693 return wtf;
694}
695
696int
697sphinx_wave2feat_free(sphinx_wave2feat_t *wtf)
698{
699 if (wtf == NULL)
700 return 0;
701 if (--wtf->refcount > 0)
702 return wtf->refcount;
703
704 if (wtf->audio)
705 ckd_free(wtf->audio);
706 if (wtf->feat)
707 ckd_free_2d(wtf->feat);
708 if (wtf->infile)
709 ckd_free(wtf->infile);
710 if (wtf->outfile)
711 ckd_free(wtf->outfile);
712 if (wtf->infh) {
713 if (fclose(wtf->infh) == EOF)
714 E_ERROR_SYSTEM("Failed to close input file");
715 }
716 if (wtf->outfh) {
717 if (fclose(wtf->outfh) == EOF)
718 E_ERROR_SYSTEM("Failed to close output file");
719 }
720 cmd_ln_free_r(wtf->config);
721 fe_free(wtf->fe);
722 ckd_free(wtf);
723
724 return 0;
725}
726
728sphinx_wave2feat_retain(sphinx_wave2feat_t *wtf)
729{
730 ++wtf->refcount;
731 return wtf;
732}
733
734static audio_type_t const *
735detect_audio_type(sphinx_wave2feat_t *wtf)
736{
737 audio_type_t const *atype = NULL;
738 int i;
739
740 /* Special case audio type for Sphinx MFCC inputs. */
741 if (cmd_ln_boolean_r(wtf->config, "-spec2cep")
742 || cmd_ln_boolean_r(wtf->config, "-cep2spec")) {
743 int rv = mfcc_type.detect(wtf);
744 if (rv == -1)
745 goto error_out;
746 return &mfcc_type;
747 }
748
749 /* Try to use the type of infile given on the command line. */
750 for (i = 0; i < ntypes; ++i) {
751 int rv;
752 atype = &types[i];
753 if (cmd_ln_boolean_r(wtf->config, atype->name)) {
754 rv = (*atype->detect)(wtf);
755 if (rv == -1)
756 goto error_out;
757 else if (rv == TRUE)
758 break;
759 }
760 }
761 if (i == ntypes) {
762 /* Detect file type of infile and get parameters. */
763 for (i = 0; i < ntypes; ++i) {
764 int rv;
765 atype = &types[i];
766 rv = (*atype->detect)(wtf);
767 if (rv == -1)
768 goto error_out;
769 else if (rv == TRUE)
770 break;
771 }
772 if (i == ntypes)
773 goto error_out;
774 }
775 return atype;
776 error_out:
777 if (wtf->infh)
778 fclose(wtf->infh);
779 wtf->infh = NULL;
780 return NULL;
781}
782
783int
784sphinx_wave2feat_convert_file(sphinx_wave2feat_t *wtf,
785 char const *infile, char const *outfile)
786{
787 int nchans, nfloat, veclen;
788 audio_type_t const *atype = NULL;
789 int fshift, fsize;
790
791 E_INFO("Converting %s to %s\n", infile, outfile);
792
793 wtf->infile = ckd_salloc(infile);
794
795 /* Detect input file type. */
796 if ((atype = detect_audio_type(wtf)) == NULL)
797 return -1;
798
799 /* Determine whether to byteswap input. */
800 wtf->byteswap = strcmp(cmd_ln_str_r(wtf->config, "-mach_endian"),
801 cmd_ln_str_r(wtf->config, "-input_endian"));
802
803 /* Get the output frame size (if not already set). */
804 if (wtf->veclen == 0)
805 wtf->veclen = fe_get_output_size(wtf->fe);
806
807 /* Set up the input and output buffers. */
808 fe_get_input_size(wtf->fe, &fshift, &fsize);
809 /* Want to get at least a whole frame plus shift in here. Also we
810 will either pick or mix multiple channels so we need to read
811 them all at once. */
812 nchans = cmd_ln_int32_r(wtf->config, "-nchans");
813 wtf->blocksize = cmd_ln_int32_r(wtf->config, "-blocksize") * nchans;
814 if (wtf->blocksize < (fsize + fshift) * nchans) {
815 E_INFO("Block size of %d too small, increasing to %d\n",
816 wtf->blocksize,
817 (fsize + fshift) * nchans);
818 wtf->blocksize = (fsize + fshift) * nchans;
819 }
820 wtf->audio = (short *)ckd_calloc(wtf->blocksize, sizeof(*wtf->audio));
821 wtf->featsize = (wtf->blocksize / nchans - fsize) / fshift;
822
823 /* Use the maximum of the input and output frame sizes to allocate this. */
824 veclen = wtf->veclen;
825 if (wtf->in_veclen > veclen) veclen = wtf->in_veclen;
826
827 wtf->feat = (mfcc_t**)ckd_calloc_2d(wtf->featsize, veclen, sizeof(**wtf->feat));
828
829 /* Let's go! */
830 if ((wtf->outfh = fopen(outfile, "wb")) == NULL) {
831 E_ERROR_SYSTEM("Failed to open %s for writing", outfile);
832 return -1;
833 }
834 /* Write an empty header, which we'll fill in later. */
835 if (wtf->ot->output_header &&
836 (*wtf->ot->output_header)(wtf, 0) < 0) {
837 E_ERROR_SYSTEM("Failed to write empty header to %s\n", outfile);
838 goto error_out;
839 }
840 wtf->outfile = ckd_salloc(outfile);
841
842 if ((nfloat = (*atype->decode)(wtf)) < 0) {
843 E_ERROR("Failed to convert");
844 goto error_out;
845 }
846
847 if (wtf->ot->output_header) {
848 if (fseek(wtf->outfh, 0, SEEK_SET) < 0) {
849 E_ERROR_SYSTEM("Failed to seek to beginning of %s\n", outfile);
850 goto error_out;
851 }
852 if ((*wtf->ot->output_header)(wtf, nfloat) < 0) {
853 E_ERROR_SYSTEM("Failed to write header to %s\n", outfile);
854 goto error_out;
855 }
856 }
857
858
859 if (wtf->audio)
860 ckd_free(wtf->audio);
861 if (wtf->feat)
862 ckd_free_2d(wtf->feat);
863 if (wtf->infile)
864 ckd_free(wtf->infile);
865 if (wtf->outfile)
866 ckd_free(wtf->outfile);
867
868 wtf->audio = NULL;
869 wtf->infile = NULL;
870 wtf->feat = NULL;
871 wtf->outfile = NULL;
872
873 if (wtf->outfh)
874 if (fclose(wtf->outfh) == EOF)
875 E_ERROR_SYSTEM("Failed to close output file");
876 wtf->outfh = NULL;
877
878 return 0;
879
880error_out:
881
882 if (wtf->audio)
883 ckd_free(wtf->audio);
884 if (wtf->feat)
885 ckd_free_2d(wtf->feat);
886 if (wtf->infile)
887 ckd_free(wtf->infile);
888 if (wtf->outfile)
889 ckd_free(wtf->outfile);
890
891 wtf->audio = NULL;
892 wtf->infile = NULL;
893 wtf->feat = NULL;
894 wtf->outfile = NULL;
895
896 if (wtf->outfh)
897 if (fclose(wtf->outfh) == EOF)
898 E_ERROR_SYSTEM("Failed to close output file");
899 wtf->outfh = NULL;
900
901 return -1;
902}
903
904void
905build_filenames(cmd_ln_t *config, char const *basename,
906 char **out_infile, char **out_outfile)
907{
908 char const *di, *do_, *ei, *eo;
909
910 di = cmd_ln_str_r(config, "-di");
911 do_ = cmd_ln_str_r(config, "-do");
912 ei = cmd_ln_str_r(config, "-ei");
913 eo = cmd_ln_str_r(config, "-eo");
914
915 *out_infile = string_join(di ? di : "",
916 di ? "/" : "",
917 basename,
918 ei ? "." : "",
919 ei ? ei : "",
920 NULL);
921 *out_outfile = string_join(do_ ? do_ : "",
922 do_ ? "/" : "",
923 basename,
924 eo ? "." : "",
925 eo ? eo : "",
926 NULL);
927 /* Build output directory structure if possible/requested (it is
928 * by default). */
929 if (cmd_ln_boolean_r(config, "-build_outdirs")) {
930 char *dirname = ckd_salloc(*out_outfile);
931 path2dirname(*out_outfile, dirname);
932 build_directory(dirname);
933 ckd_free(dirname);
934 }
935}
936
937static int
938run_control_file(sphinx_wave2feat_t *wtf, char const *ctlfile)
939{
940 hash_table_t *files;
941 hash_iter_t *itor;
942 lineiter_t *li;
943 FILE *ctlfh;
944 int nskip, runlen, npart;
945
946 if ((ctlfh = fopen(ctlfile, "r")) == NULL) {
947 E_ERROR_SYSTEM("Failed to open control file %s", ctlfile);
948 return -1;
949 }
950 nskip = cmd_ln_int32_r(wtf->config, "-nskip");
951 runlen = cmd_ln_int32_r(wtf->config, "-runlen");
952 if ((npart = cmd_ln_int32_r(wtf->config, "-npart"))) {
953 /* Count lines in the file. */
954 int partlen, part, nlines = 0;
955 part = cmd_ln_int32_r(wtf->config, "-part");
956 for (li = lineiter_start(ctlfh); li; li = lineiter_next(li))
957 ++nlines;
958 fseek(ctlfh, 0, SEEK_SET);
959 partlen = nlines / npart;
960 nskip = partlen * (part - 1);
961 if (part == npart)
962 runlen = -1;
963 else
964 runlen = partlen;
965 }
966 if (runlen != -1){
967 E_INFO("Processing %d utterances at position %d\n", runlen, nskip);
968 files = hash_table_new(runlen, HASH_CASE_YES);
969 }
970 else {
971 E_INFO("Processing all remaining utterances at position %d\n", nskip);
972 files = hash_table_new(1000, HASH_CASE_YES);
973 }
974 for (li = lineiter_start(ctlfh); li; li = lineiter_next(li)) {
975 char *c, *infile, *outfile;
976
977 if (nskip-- > 0)
978 continue;
979 if (runlen == 0) {
980 lineiter_free(li);
981 break;
982 }
983 --runlen;
984
985 string_trim(li->buf, STRING_BOTH);
986 /* Extract the file ID from the control line. */
987 if ((c = strchr(li->buf, ' ')) != NULL)
988 *c = '\0';
989 if (strlen(li->buf) == 0) {
990 E_WARN("Empty line %d in control file, skipping\n", li->lineno);
991 continue;
992 }
993 build_filenames(wtf->config, li->buf, &infile, &outfile);
994 if (hash_table_lookup(files, infile, NULL) == 0)
995 continue;
996 sphinx_wave2feat_convert_file(wtf, infile, outfile);
997 hash_table_enter(files, infile, outfile);
998 }
999 for (itor = hash_table_iter(files); itor;
1000 itor = hash_table_iter_next(itor)) {
1001 ckd_free((void *)hash_entry_key(itor->ent));
1002 ckd_free(hash_entry_val(itor->ent));
1003 }
1004 hash_table_free(files);
1005 fclose(ctlfh);
1006
1007 return 0;
1008}
1009
1010int
1011main(int argc, char *argv[])
1012{
1013 sphinx_wave2feat_t *wtf;
1014 cmd_ln_t *config;
1015 int rv;
1016
1017 config = cmd_ln_parse_r(NULL, defn, argc, argv, TRUE);
1018
1019 if (config && cmd_ln_str_r(config, "-argfile"))
1020 config = cmd_ln_parse_file_r(config, defn,
1021 cmd_ln_str_r(config, "-argfile"), FALSE);
1022 if (config == NULL) {
1023 E_ERROR("Command line parsing failed\n");
1024 return 1;
1025 }
1026
1027 if ((wtf = sphinx_wave2feat_init(config)) == NULL) {
1028 E_ERROR("Failed to initialize wave2feat object\n");
1029 return 1;
1030 }
1031
1032 /* If there's a control file run through it, otherwise we will do
1033 * a single file (which is what run_control_file will do
1034 * internally too) */
1035 if (cmd_ln_str_r(config, "-c"))
1036 rv = run_control_file(wtf, cmd_ln_str_r(config, "-c"));
1037 else
1038 rv = sphinx_wave2feat_convert_file(wtf, cmd_ln_str_r(config, "-i"),
1039 cmd_ln_str_r(config, "-o"));
1040
1041 sphinx_wave2feat_free(wtf);
1042 cmd_ln_free_r(config);
1043 return rv;
1044}
Sphinx's memory allocation/deallocation routines.
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
Definition: ckd_alloc.c:244
SPHINXBASE_EXPORT void ckd_free_2d(void *ptr)
Free a 2-D array (ptr) previously allocated by ckd_calloc_2d.
Definition: ckd_alloc.c:255
#define ckd_calloc_2d(d1, d2, sz)
Macro for ckd_calloc_2d
Definition: ckd_alloc.h:270
#define ckd_calloc(n, sz)
Macros to simplify the use of above functions.
Definition: ckd_alloc.h:248
#define ckd_salloc(ptr)
Macro for ckd_salloc
Definition: ckd_alloc.h:264
Command-line and other configuration parsing and handling.
#define cmd_ln_boolean_r(c, n)
Retrieve a boolean value from a command-line object.
Definition: cmd_ln.h:334
SPHINXBASE_EXPORT int cmd_ln_free_r(cmd_ln_t *cmdln)
Release a command-line argument set and all associated strings.
Definition: cmd_ln.c:1046
SPHINXBASE_EXPORT void cmd_ln_set_str_r(cmd_ln_t *cmdln, char const *name, char const *str)
Set a string in a command-line object.
Definition: cmd_ln.c:989
SPHINXBASE_EXPORT cmd_ln_t * cmd_ln_parse_file_r(cmd_ln_t *inout_cmdln, arg_t const *defn, char const *filename, int32 strict)
Parse an arguments file by deliminating on " \r\t\n" and putting each tokens into an argv[] for cmd_l...
Definition: cmd_ln.c:764
SPHINXBASE_EXPORT char const * cmd_ln_str_r(cmd_ln_t *cmdln, char const *name)
Retrieve a string from a command-line object.
Definition: cmd_ln.c:949
SPHINXBASE_EXPORT cmd_ln_t * cmd_ln_retain(cmd_ln_t *cmdln)
Retain ownership of a command-line argument set.
Definition: cmd_ln.c:1039
SPHINXBASE_EXPORT cmd_ln_t * cmd_ln_parse_r(cmd_ln_t *inout_cmdln, arg_t const *defn, int32 argc, char *argv[], int32 strict)
Parse a list of strings into arguments.
Definition: cmd_ln.c:556
Implementation of logging routines.
#define E_ERROR(...)
Print error message to error log.
Definition: err.h:104
#define E_INFO(...)
Print logging information to standard error stream.
Definition: err.h:114
#define E_FATAL(...)
Exit with non-zero status after error message.
Definition: err.h:81
#define E_ERROR_SYSTEM(...)
Print error text; Call perror("");.
Definition: err.h:99
#define E_WARN(...)
Print warning message to error log.
Definition: err.h:109
File names related operation.
SPHINXBASE_EXPORT void path2dirname(const char *path, char *dir)
Strip off filename from the given path and copy the directory name into dir Caller must have allocate...
Definition: filename.c:68
Hash table implementation.
SPHINXBASE_EXPORT void hash_table_free(hash_table_t *h)
Free the specified hash table; the caller is responsible for freeing the key strings pointed to by th...
Definition: hash_table.c:688
SPHINXBASE_EXPORT hash_iter_t * hash_table_iter_next(hash_iter_t *itor)
Get the next key-value pair in iteration.
Definition: hash_table.c:656
SPHINXBASE_EXPORT int32 hash_table_lookup(hash_table_t *h, const char *key, void **val)
Look up a key in a hash table and optionally return the associated value.
Definition: hash_table.c:302
#define hash_entry_val(e)
Access macros.
Definition: hash_table.h:175
SPHINXBASE_EXPORT void * hash_table_enter(hash_table_t *h, const char *key, void *val)
Try to add a new entry with given key and associated value to hash table h.
Definition: hash_table.c:501
SPHINXBASE_EXPORT hash_iter_t * hash_table_iter(hash_table_t *h)
Start iterating over key-value pairs in a hash table.
Definition: hash_table.c:646
SPHINXBASE_EXPORT hash_table_t * hash_table_new(int32 size, int32 casearg)
Allocate a new hash table for a given expected size.
Definition: hash_table.c:158
file IO related operations.
SPHINXBASE_EXPORT void lineiter_free(lineiter_t *li)
Stop reading lines from a file.
Definition: pio.c:368
SPHINXBASE_EXPORT int build_directory(const char *path)
Create a directory and all of its parent directories, as needed.
Definition: pio.c:621
SPHINXBASE_EXPORT lineiter_t * lineiter_start(FILE *fh)
Start reading lines from a file.
Definition: pio.c:264
SPHINXBASE_EXPORT lineiter_t * lineiter_next(lineiter_t *li)
Move to the next line in the file.
Definition: pio.c:347
Miscellaneous useful string functions.
SPHINXBASE_EXPORT char * string_trim(char *string, enum string_edge_e which)
Remove whitespace from a string, modifying it in-place.
Definition: strfuncs.c:97
SPHINXBASE_EXPORT char * string_join(const char *base,...)
Concatenate a NULL-terminated argument list of strings, returning a newly allocated string.
Definition: strfuncs.c:70
SPHINXBASE_EXPORT int32 str2words(char *line, char **wptr, int32 n_wptr)
Convert a line to an array of "words", based on whitespace separators.
Definition: strfuncs.c:123
@ STRING_BOTH
Both ends of string.
Definition: strfuncs.h:73
SPHINXBASE_EXPORT double atof_c(char const *str)
Locale independent version of atof().
Definition: strfuncs.c:55
RIFF 44-byte header structure for MS wav files.
Definition: sphinx_fe.c:91
Opaque structure used to hold the results of command-line parsing.
Structure for the front-end computation.
Definition: fe_internal.h:117
hash_entry_t * ent
Current entry in that table.
Definition: hash_table.h:170
Line iterator for files.
Definition: pio.h:177
int byteswap
Whether byteswapping is necessary.
Definition: sphinx_fe.c:86
int in_veclen
Length of each input vector (for cep<->spec).
Definition: sphinx_fe.c:85
cmd_ln_t * config
Configuration parameters.
Definition: sphinx_fe.c:74
fe_t * fe
Front end object.
Definition: sphinx_fe.c:75
char * infile
Path to input file.
Definition: sphinx_fe.c:76
short * audio
Audio buffer.
Definition: sphinx_fe.c:80
output_type_t const * ot
Output type object.
Definition: sphinx_fe.c:87
char * outfile
Path to output file.
Definition: sphinx_fe.c:77
mfcc_t ** feat
Feature buffer.
Definition: sphinx_fe.c:81
int featsize
Size of feature buffer.
Definition: sphinx_fe.c:83
int veclen
Length of each output vector.
Definition: sphinx_fe.c:84
FILE * outfh
Output file handle.
Definition: sphinx_fe.c:79
FILE * infh
Input file handle.
Definition: sphinx_fe.c:78
int refcount
Reference count.
Definition: sphinx_fe.c:73
int blocksize
Size of audio buffer.
Definition: sphinx_fe.c:82