| Home | Trees | Indices | Help |
|------|-------|---------|------|
|
# Natural Language Toolkit: CONLL 2002 Corpus Reader
#
# Copyright (C) 2001-2007 University of Pennsylvania
# Author: Steven Bird <sb@ldc.upenn.edu>
#         Edward Loper <edloper@gradient.cis.upenn.edu>
#         Ewan Klein <ewan@inf.ed.ac.uk>
# URL: <http://nltk.sf.net>
# For license information, see LICENSE.TXT

"""
Read Named Entity tagged data as chunk structures from the CONLL-2002 Corpus
"""
13
import os

from nltk_lite import tokenize, chunk
from nltk_lite.corpora import get_basedir
from nltk_lite.parse import tree
18
# Data files for each language: one training set plus two test sets.
esp = ['esp.' + part for part in ('train', 'testa', 'testb')]
ned = ['ned.' + part for part in ('train', 'testa', 'testb')]

# All available corpus items, Spanish first.
items = esp + ned
22
# Human-readable description for each corpus item.
item_name = {
    'ned.train': 'Dutch training set',
    'ned.testa': 'Dutch test set a',
    'ned.testb': 'Dutch test set b',
    'esp.train': 'Spanish training set',
    'esp.testa': 'Spanish test set a',
    # BUG FIX: this key was 'ned.testb', which silently overwrote the
    # Dutch entry and left 'esp.testb' with no description at all.
    'esp.testb': 'Spanish test set b',
}
31
def raw(files = items):
    # NOTE(review): the 'def' line was lost in extraction; reconstructed from
    # the demo's call sites (conll2002.raw(files=[...])) -- confirm signature.
    """
    Yield each sentence of the given corpus files as a list of words
    (POS and NE columns discarded).

    @param files: one file name or a sequence of file names from C{items}.
    """
    if type(files) is str: files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "conll2002", file)
        s = open(path).read()
        # Remove the initial "-DOCSTART- -DOCSTART- O" marker line
        # (23 characters including the trailing newline).
        # Consistency fix: test for the full '-DOCSTART-' token as the
        # pos_tagged() reader does.
        if s.startswith('-DOCSTART-'):
            s = s[23:]
        for sent in tokenize.blankline(s):
            yield [word for (word, tag, ner) in _list_sent(sent)]
45
def pos_tagged(files = items):
    # NOTE(review): the 'def' line was lost in extraction; reconstructed from
    # the demo's call sites (conll2002.pos_tagged(files=[...])) -- confirm.
    """
    Yield each sentence of the given corpus files as a list of
    (word, POS-tag) pairs (NE column discarded).

    @param files: one file name or a sequence of file names from C{items}.
    """
    if type(files) is str: files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "conll2002", file)
        s = open(path).read()
        # Remove the initial "-DOCSTART- -DOCSTART- O" marker line
        # (23 characters including the trailing newline).
        if s.startswith('-DOCSTART-'):
            s = s[23:]
        for sent in tokenize.blankline(s):
            yield [(word, tag) for (word, tag, ner) in _list_sent(sent)]
56
def ne_chunked(files = items, chunk_types = ('LOC', 'PER', 'ORG')):
    # NOTE(review): the 'def' line was lost in extraction; 'files' is grounded
    # by the demo's call sites and 'chunk_types' by the body below.  The
    # default chunk_types is a reconstruction (docstring says MISC is
    # omitted) -- TODO confirm against the original module.
    """
    Yield each sentence of the given corpus files as a chunk tree of
    named entities.  MISC has been omitted.

    @param files: one file name or a sequence of file names from C{items}.
    @param chunk_types: the NE chunk types to include in the trees.
    """
    if type(files) is str: files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "conll2002", file)
        s = open(path).read()
        # Remove the initial "-DOCSTART- -DOCSTART- O" marker line
        # (23 characters including the trailing newline).
        # Consistency fix: test for the full '-DOCSTART-' token as the
        # pos_tagged() reader does.
        if s.startswith('-DOCSTART-'):
            s = s[23:]
        for sent in tokenize.blankline(s):
            yield chunk.conllstr2tree(sent, chunk_types)
70
72 from nltk_lite.corpora import conll2002
73 from itertools import islice
74
75 print "CONLL2002 NE data\n"
76
77 print "Raw text -- Dutch:"
78 for sent in islice(conll2002.raw(files = ['ned.train']), 0, 5):
79 print sent
80 print
81
82 print "Raw text --Spanish:"
83 for sent in islice(conll2002.raw(files = ['esp.train']), 0, 5):
84 print sent
85 print
86
87 print "POS Tagged text -- Dutch:"
88 for sent in islice(conll2002.pos_tagged(files = ['ned.train']), 0, 5):
89 print sent
90 print
91
92 print "POS Tagged text --Spanish:"
93 for sent in islice(conll2002.pos_tagged(files = ['esp.train']), 0, 5):
94 print sent
95 print
96
97 print "Named Entity chunked text -- Dutch:"
98 for tree in islice(conll2002.ne_chunked(files = ['ned.train']), 0, 5):
99 print tree.pp()
100 print
101
102 print "Named Entity chunked text --Spanish:"
103 for tree in islice(conll2002.ne_chunked(files = ['esp.train']), 0, 5):
104 print tree.pp()
105 print
106
# Run the demonstration when executed as a script.
if __name__ == '__main__':
    demo()
110
| Home | Trees | Indices | Help |
|------|-------|---------|------|

*Generated by Epydoc 3.0beta1 on Wed May 16 22:47:40 2007* — <http://epydoc.sourceforge.net>