pyconll provides a basic, low-level wrapper around the CoNLL annotation scheme. pyconll's sensible API allows for easy automation and little ramp-up time, and it works as a great building block for creating larger computational linguistics (CL) systems.
# Make sure 'Lord of the Rings' has 'Lord' and 'Rings' annotated as regular NOUN.
import pyconll
import pyconll.util
# Path of the CoNLL file to load and the ngram (as a list of words) to search for.
UD_ENGLISH_TRAIN = './ud/train.conll'
NGRAM = 'Lord of the Rings'.split()

train = pyconll.load_from_file(UD_ENGLISH_TRAIN)

# util#find_ngrams provides an iterator of tuples where the first element is the
# sentence where the ngram was found, the second element is the numeric index of
# the ngram occurrence in the sentence, and the third element is a list of the
# tokens that matched the ngram.
for sentence, i, tokens in pyconll.util.find_ngrams(train, NGRAM):
    # Only the first ('Lord') and last ('Rings') tokens of the match are retagged.
    tokens[0].upos = 'NOUN'
    tokens[-1].upos = 'NOUN'

# After making sure these occurrences are properly handled, go through the
# corpus and determine how many unique proper nouns are annotated.
# 'PROPN' is the proper-noun tag, so collect the distinct lemmas carrying it.
proper_nouns = set()
for sentence in train:
    for token in sentence:
        if token.upos == 'PROPN':
            proper_nouns.add(token.lemma)

# Print how many unique proper nouns there are in the corpus, then write out
# the transformed version.
print(len(proper_nouns))

# The argument to write should be a Writeable (i.e. have a write method). In
# this case we want to make sure we write to a utf-8 file.
with open('./ud/transformed.conll', 'w', encoding='utf-8') as f:
    train.write(f)
pip install pyconll
Want to find out more?