| Home | Trees | Indices | Help |
|
|---|
|
|
1 # Natural Language Toolkit: CONLL Corpus Reader
2 #
3 # Copyright (C) 2001-2007 University of Pennsylvania
4 # Author: Steven Bird <sb@ldc.upenn.edu>
5 # Edward Loper <edloper@gradient.cis.upenn.edu>
6 # URL: <http://nltk.sf.net>
7 # For license information, see LICENSE.TXT
8
9 """
10 Read chunk structures from the CONLL-2000 Corpus
11 """
12
13 from nltk_lite.corpora import get_basedir
14 from nltk_lite import tokenize, chunk
15 from nltk_lite.parse import tree
16 import os
17
# Corpus sections shipped with the CONLL-2000 distribution.
items = ['train', 'test']

# Human-readable description for each section, keyed by section name.
item_name = {
    'train': 'training set',
    'test': 'test set',
}
def raw(files = items):
    """
    Read sentences from the CONLL-2000 corpus as plain word lists.

    @param files: section name(s) to read ('train' and/or 'test');
        a bare string is treated as a single section name.
    @return: yields one list of words per blank-line-separated
        sentence; the POS tag and chunk label columns are discarded.
    """
    if isinstance(files, str): files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "conll2000", file + ".txt")
        # Close the handle explicitly rather than leaking it until GC.
        f = open(path)
        try:
            s = f.read()
        finally:
            f.close()
        for sent in tokenize.blankline(s):
            yield [word for (word, tag, chunk) in _list_sent(sent)]

def tagged(files = items):
    """
    Read sentences from the CONLL-2000 corpus as (word, tag) lists.

    @param files: section name(s) to read ('train' and/or 'test');
        a bare string is treated as a single section name.
    @return: yields one list of (word, tag) pairs per
        blank-line-separated sentence; the chunk label column is
        discarded.
    """
    if isinstance(files, str): files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "conll2000", file + ".txt")
        # Close the handle explicitly rather than leaking it until GC.
        f = open(path)
        try:
            s = f.read()
        finally:
            f.close()
        for sent in tokenize.blankline(s):
            yield [(word, tag) for (word, tag, chunk) in _list_sent(sent)]

def chunked(files = items, chunk_types=('NP',)):
    """
    Read sentences from the CONLL-2000 corpus as chunk trees.

    NOTE(review): the `def` line was lost in this extraction; the
    default for chunk_types is reconstructed as ('NP',) to match the
    historical nltk_lite reader -- confirm against the original source.

    @param files: section name(s) to read ('train' and/or 'test');
        a bare string is treated as a single section name.
    @param chunk_types: chunk types to include in the trees (passed
        through to chunk.conllstr2tree).
    @return: yields one chunk tree per blank-line-separated sentence.
    """
    if isinstance(files, str): files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "conll2000", file + ".txt")
        # Close the handle explicitly rather than leaking it until GC.
        f = open(path)
        try:
            s = f.read()
        finally:
            f.close()
        for sent in tokenize.blankline(s):
            yield chunk.conllstr2tree(sent, chunk_types)

53 from nltk_lite.corpora import conll2000
54 from itertools import islice
55
56 print "CONLL Chunked data\n"
57
58 print "Raw text:"
59 for sent in islice(conll2000.raw(), 0, 5):
60 print sent
61 print
62
63 print "Tagged text:"
64 for sent in islice(conll2000.tagged(), 0, 5):
65 print sent
66 print
67
68 print "Chunked text:"
69 for tree in islice(conll2000.chunked(chunk_types=('NP','PP')), 0, 5):
70 print tree.pp()
71 print
72
73
74 if __name__ == '__main__':
75 demo()
76
| Home | Trees | Indices | Help |
|
|---|
| Generated by Epydoc 3.0beta1 on Wed May 16 22:47:52 2007 | http://epydoc.sourceforge.net |