| Home | Trees | Indices | Help |
|
|---|
|
|
1 # Natural Language Toolkit - Discretise
2 # The command line entry point to discretisers
3 #
4 # Author: Sumukh Ghodke <sumukh dot ghodke at gmail dot com>
5 #
6 # URL: <http://nltk.sf.net>
7 # This software is distributed under GPL, for license information see LICENSE.TXT
8 from nltk_lite.contrib.classifier import split_ignore_space
9 from nltk_lite.contrib.classifier import instances as ins, discretisedattribute as da, cfile as f, numrange as r, format, commandline as cl
10 from nltk_lite.contrib.classifier.exceptions import filenotfounderror as fnf, invaliddataerror as inv
11 import sys
12
# Help texts shown by the command line interface. a_help, f_help, t_help,
# T_help and g_help are handed to the base CommandLineInterface constructor;
# A_help and o_help describe the -A and -o options added by this module.
a_help = ("Selects the discretisation algorithm "
          "Options: UEW for Unsupervised Equal Width "
          " UEF for Unsupervised Equal Frequency "
          " NS for Naive Supervised "
          " NS1 for Naive Supervised version 1 "
          " NS2 for Naive Supervised version 2 "
          " ES for Entropy Based Supervised "
          "Default: UEW.")

f_help = ("Base name of attribute, klass, training, test and gold"
          " files. ")

t_help = "Base name of training file for discretisation. "

# Fixed the "discterised" typo that used to appear in the user-facing help.
T_help = "Base name of test file to be discretised. "

g_help = "Base name of gold file to be discretised. "

A_help = "Comma separated list of attribute indices. "

o_help = ("Algorithm specific options "
          "UEW: Comma separated list of number of parts in which"
          " each attribute should be split. ")

# Codes accepted on the command line to select a discretisation algorithm.
UNSUPERVISED_EQUAL_WIDTH = 'UEW'
UNSUPERVISED_EQUAL_FREQUENCY = 'UEF'
NAIVE_SUPERVISED = 'NS'
NAIVE_SUPERVISED_V1 = 'NS1'
NAIVE_SUPERVISED_V2 = 'NS2'
ENTROPY_BASED_SUPERVISED = 'ES'

# Maps each algorithm code to the name of the method that implements it;
# the method is looked up on the discretiser with getattr() and then called.
ALGORITHM_MAPPINGS = {
    UNSUPERVISED_EQUAL_WIDTH: 'unsupervised_equal_width',
    UNSUPERVISED_EQUAL_FREQUENCY: 'unsupervised_equal_frequency',
    NAIVE_SUPERVISED: 'naive_supervised',
    NAIVE_SUPERVISED_V1: 'naive_supervised_v1',
    NAIVE_SUPERVISED_V2: 'naive_supervised_v2',
    ENTROPY_BASED_SUPERVISED: 'entropy_based_supervised',
}
50
51
# Constructor body of the command line class (its class/def header lines are
# missing from this listing). Hands the algorithm codes, the default code
# (UEW) and the shared help texts to the base CommandLineInterface --
# presumably as the allowed choices and default; confirm against commandline.py.
54 cl.CommandLineInterface.__init__(self, ALGORITHM_MAPPINGS.keys(), UNSUPERVISED_EQUAL_WIDTH, a_help, f_help, t_help, T_help, g_help)
# Discretiser-specific options: -A is stored under dest "attributes" and
# -o under dest "options"; both are taken as plain strings.
55 self.add_option("-A", "--attributes", dest="attributes", type="string", help=A_help)
56 self.add_option("-o", "--options", dest="options", type="string", help=o_help)
57
# Body of what appears to be the `execute` override (def line missing from
# this listing): runs the base class's execute, reads the extra options,
# validates the arguments and starts the discretisation.
59 cl.CommandLineInterface.execute(self)
# Keep the raw string values of the -A and -o options on the instance.
60 self.attributes_indices = self.get_value('attributes')
61 self.options = self.get_value('options')
62 self.validate_basic_arguments_are_present()
63 self.validate_files_arg_is_exclusive()
64
# Every algorithm except NS requires the -o option to have been supplied.
65 if not self.algorithm == NAIVE_SUPERVISED and self.options is None:
66 self.error("Invalid arguments. One or more required arguments are not present.")
67 self.discretise_and_write_to_file()
68
# Body of what appears to be `discretise_and_write_to_file` (def line missing
# from this listing): loads the instances, runs the selected algorithm and
# writes the discretised data back out, printing the created file names.
70 ignore_missing = False
71 #duplicate code and not tested!!
# When a single base name was given, use it for the training, test and gold
# paths alike and pass ignore_missing=True to get_instances.
72 if self.files is not None:
73 self.training_path, self.test_path, self.gold_path = [self.files] * 3
74 ignore_missing = True
75 training, attributes, klass, test, gold = self.get_instances(self.training_path, self.test_path, self.gold_path, ignore_missing)
# The -A and -o strings are converted with cl.as_integers before being given
# to the Discretiser (presumably comma separated integers -- confirm in
# commandline.py).
76 disc = Discretiser(training, attributes, klass, test, gold, cl.as_integers('Attribute indices', self.attributes_indices), cl.as_integers('Options', self.options))
# Dispatch: look up the method named in ALGORITHM_MAPPINGS and call it.
77 getattr(disc, ALGORITHM_MAPPINGS[self.algorithm])()
78 files_written = self.write_to_file(self.get_suffix(), training, attributes, klass, test, gold)
79 print 'The following files were created with discretised values...'
80 for file_name in files_written:
81 print file_name
82
# Body of what appears to be `get_suffix` (def line missing from this
# listing): returns '-d' followed by '_<index>' for every comma separated
# entry of self.attributes_indices, e.g. '0, 3' gives '-d_0_3'.
84 indices_str = ''
85 indices = self.attributes_indices.split(',')
86 for index in indices:
# Surrounding whitespace of each entry is stripped before it is appended.
87 indices_str += '_' + str(index.strip())
88 return '-d' + indices_str
89
def __init__(self, training, attributes, klass, test, gold, attribute_indices, options=None):
    """
    Keep references to the data sets and settings, validate the settings
    and pick out the attributes that are to be discretised.

    training, attributes, klass, test, gold -- stored on the instance as
        given.
    attribute_indices -- indices of the attributes to discretise; passed
        to attributes.subset() to build self.subset.
    options -- algorithm specific values, or None (the default).
    """
    self.training = training
    self.attributes = attributes
    self.klass = klass
    self.test = test
    self.gold = gold
    self.attribute_indices = attribute_indices
    self.options = options

    # The indices are checked before the options, and both before the
    # subset is built from the indices.
    self.__validate_attribute_indices()
    self.__validate_options()

    self.subset = self.attributes.subset(self.attribute_indices)
98
# Body of what appears to be `__validate_options` (def line missing from this
# listing): nothing to check when no options were given; otherwise raises
# InvalidDataError as soon as an option equal to zero is found.
100 if self.options is None: return
101 for option in self.options:
102 if option == 0:
103 raise inv.InvalidDataError('Option cannot be equal to zero.')
104
# Body of what appears to be `__validate_attribute_indices` (def line missing
# from this listing): raises InvalidDataError for the first index that is
# negative or not smaller than len(self.attributes).
106 for index in self.attribute_indices:
107 if index < 0 or index >= len(self.attributes):
108 raise inv.InvalidDataError('Attribute indices should be between 0 and ' + str(len(self.attributes) - 1) + ' both inclusive, but found ' + str(index))
109
# Body of what appears to be `unsupervised_equal_width` (def line missing
# from this listing): asks the training data for the value range of each
# selected attribute, turns the ranges into discretised attributes and
# applies them to the data sets.
111 ranges = self.training.value_ranges(self.subset)
112 disc_attrs = self.discretised_attributes(ranges)
113 self.__discretise(disc_attrs)
114
# Body of what appears to be `__discretise` (def line missing from this
# listing; disc_attrs is presumably its parameter): applies the discretised
# attributes to the training data, to the test and gold data when they are
# present, and finally to the attributes themselves.
116 self.training.discretise(disc_attrs)
117 if self.test is not None: self.test.discretise(disc_attrs)
118 if self.gold is not None: self.gold.discretise(disc_attrs)
119 self.attributes.discretise(disc_attrs)
120
# Body of what appears to be `unsupervised_equal_frequency` (def line missing
# from this listing): for every selected attribute, sorts its training
# values, chunks them using the matching option, converts the chunks into
# ranges and builds a DiscretisedAttribute; then applies all of them.
122 values_array = self.training.values_grouped_by_attribute(self.subset)
123 disc_attrs = []
124 for index in range(len(self.subset)):
125 values = values_array[index]
# In-place sort: the list held in values_array is reordered as well.
126 values.sort()
127 attribute = self.subset[index]
# self.options[index] is passed as the second argument of
# get_chunks_with_frequency (presumably the chunk frequency -- confirm
# against that helper, which is not visible in this listing).
128 ranges = ranges_from_chunks(get_chunks_with_frequency(values, self.options[index]))
129 disc_attrs.append(da.DiscretisedAttribute(attribute.name, ranges, attribute.index))
130 self.__discretise(disc_attrs)
131
134
# Body of what appears to be `naive_supervised_v1` (def line missing from
# this listing): runs the shared supervised routine with a callback that
# calls find_naive_v1 on the breakpoints with the attribute's option.
136 self.__supervised_discretisation(lambda breakpoints, index: breakpoints.find_naive_v1(self.options[index]))
137
# Body of what appears to be `naive_supervised_v2` (def line missing from
# this listing): runs the shared supervised routine with a callback that
# calls find_naive_v2 on the breakpoints with the attribute's option.
139 self.__supervised_discretisation(lambda breakpoints, index: breakpoints.find_naive_v2(self.options[index]))
140
# Body of what appears to be `entropy_based_supervised` (def line missing
# from this listing): runs the shared supervised routine with a callback that
# calls find_entropy_based_max_depth on the breakpoints with the attribute's
# option.
142 self.__supervised_discretisation(lambda breakpoints, index: breakpoints.find_entropy_based_max_depth(self.options[index]))
143
# Body of what appears to be `__supervised_discretisation` (def line missing
# from this listing; `action` is presumably its parameter): for every
# selected attribute, obtains the supervised breakpoints from the training
# data, lets `action` process them, converts them into ranges and builds a
# DiscretisedAttribute; then applies all of them.
145 disc_attrs = []
146 for index in range(len(self.subset)):
147 attribute = self.subset[index]
148 breakpoints = self.training.supervised_breakpoints(attribute)
# The return value of action is discarded, so it presumably updates
# `breakpoints` in place -- confirm against the breakpoints class.
149 action(breakpoints, index)
150 disc_attrs.append(da.DiscretisedAttribute(attribute.name, breakpoints.as_ranges(), attribute.index))
151 self.__discretise(disc_attrs)
152
# Body of what appears to be `discretised_attributes` (def line missing from
# this listing; `ranges` is presumably its parameter): builds one
# DiscretisedAttribute per option by splitting the matching range with that
# option, and returns them as a list.
154 discretised_attributes = []
# NOTE(review): the loop is bounded by len(self.options), not by
# len(self.subset) -- more options than ranges/selected attributes would
# raise IndexError, fewer would leave attributes undiscretised.
155 for index in range(len(self.options)):
156 _range, width, attribute = ranges[index], self.options[index], self.subset[index]
157 discretised_attributes.append(da.DiscretisedAttribute(attribute.name, _range.split(width), attribute.index))
158 return discretised_attributes
159
169
176
if __name__ == "__main__":
    # Script entry point: hand every argument after the script name to the
    # command line interface.
    command_line_arguments = sys.argv[1:]
    Discretise().run(command_line_arguments)
179
| Home | Trees | Indices | Help |
|
|---|
| Generated by Epydoc 3.0beta1 on Wed May 16 22:47:41 2007 | http://epydoc.sourceforge.net |