| Home | Trees | Indices | Help |
|
|---|
|
|
1 # Natural Language Toolkit: CMU Pronouncing Dictionary Corpus Reader
2 #
3 # Copyright (C) 2001-2007 University of Pennsylvania
4 # Author: Steven Bird <sb@ldc.upenn.edu>
5 # URL: <http://nltk.sf.net>
6 # For license information, see LICENSE.TXT
7
8 """
9 The Carnegie Mellon Pronouncing Dictionary [cmudict.0.6]
10 ftp://ftp.cs.cmu.edu/project/speech/dict/
11 Copyright 1998 Carnegie Mellon University
12
13 File Format: Each line consists of an uppercased word, a counter
14 (for alternative pronunciations), and a transcription. Vowels are
15 marked for stress (1=primary, 2=secondary, 0=no stress). E.g.:
16 NATURAL 1 N AE1 CH ER0 AH0 L
17
18 The dictionary contains 127069 entries. Of these, 119400 words are assigned
19 a unique pronunciation, 6830 words have two pronunciations, and 839 words have
20 three or more pronunciations. Many of these are fast-speech variants.
21
22 Phonemes: There are 39 phonemes, as shown below:
23
24 Phoneme Example Translation    Phoneme Example Translation
25 ------- ------- -----------    ------- ------- -----------
26 AA      odd     AA D           AE      at      AE T
27 AH      hut     HH AH T        AO      ought   AO T
28 AW      cow     K AW           AY      hide    HH AY D
29 B       be      B IY           CH      cheese  CH IY Z
30 D       dee     D IY           DH      thee    DH IY
31 EH      Ed      EH D           ER      hurt    HH ER T
32 EY      ate     EY T           F       fee     F IY
33 G       green   G R IY N       HH      he      HH IY
34 IH      it      IH T           IY      eat     IY T
35 JH      gee     JH IY          K       key     K IY
36 L       lee     L IY           M       me      M IY
37 N       knee    N IY           NG      ping    P IH NG
38 OW      oat     OW T           OY      toy     T OY
39 P       pee     P IY           R       read    R IY D
40 S       sea     S IY           SH      she     SH IY
41 T       tea     T IY           TH      theta   TH EY T AH
42 UH      hood    HH UH D        UW      two     T UW
43 V       vee     V IY           W       we      W IY
44 Y       yield   Y IY L D       Z       zee     Z IY
45 ZH      seizure S IY ZH ER
46 """
47
48 from nltk_lite.corpora import get_basedir
49 import os
50
51 items = [
52 'cmudict']
53
54 item_name = {
55 'cmudict': 'CMU Pronunciation Dictionary, Version 0.6, 1998',
56 }
57
59 """
60 @param files: One or more cmudict files to be processed
61 @type files: L{string} or L{tuple(string)}
62 @rtype: iterator over L{tuple} of (word, pronunciation number, tuple of phonemes)
63 """
64
65 # Just one file to process? If so convert to a tuple so we can iterate
66 if type(files) is str: files = (files,)
67
68 for file in files:
69 path = os.path.join(get_basedir(), "cmudict", file)
70 for line in open(path).readlines():
71 fields = line.strip().split(' ')
72 yield (fields[0], int(fields[1]), tuple(fields[2:]))
73
75 d = {}
76 for word, num, pron in raw(files):
77 if num == 1:
78 d[word] = (pron,)
79 else:
80 d[word] += (pron,)
81 return d
82
84 from nltk_lite.corpora import cmudict
85 from itertools import islice
86
87 print "raw method:"
88 for entry in islice(cmudict.raw(), 40000, 40025):
89 print entry
90 print
91
92 print "dictionary method:"
93 cmudict = cmudict.dictionary()
94 print 'NATURAL', cmudict['NATURAL']
95 print 'LANGUAGE', cmudict['LANGUAGE']
96 print 'TOOL', cmudict['TOOL']
97 print 'KIT', cmudict['KIT']
98
99 if __name__ == '__main__':
100 demo()
101
| Home | Trees | Indices | Help |
|
|---|
| Generated by Epydoc 3.0beta1 on Wed May 16 22:47:49 2007 | http://epydoc.sourceforge.net |