pyconll

A minimal, pure-Python library with no dependencies for parsing CoNLL files.

pyconll provides a basic, low-level wrapper around the CoNLL annotation scheme. pyconll's sensible API allows for easy automation and little ramp-up time, and it works as a great building block for creating larger computational linguistics (CL) systems.

Code Sample

# Make sure 'Lord of the Rings' has 'Lord' and 'Rings' annotated as regular NOUN.
import pyconll
import pyconll.util

UD_ENGLISH_TRAIN = './ud/train.conll'
NGRAM = 'Lord of the Rings'.split()

train = pyconll.load_from_file(UD_ENGLISH_TRAIN)

# util#find_ngrams provides an iterator of tuples where the first element is the
# sentence where the ngram was found, the second element is the numeric index of
# the ngram occurrence in the sentence, and the third element is a list of the
# tokens that matched the ngram.
for sentence, i, tokens in pyconll.util.find_ngrams(train, NGRAM):
    # Only the first ('Lord') and last ('Rings') tokens of the match are retagged.
    tokens[0].upos = 'NOUN'
    tokens[-1].upos = 'NOUN'

# After making sure these occurrences are properly handled, go through the
# corpus and determine how many unique proper nouns are annotated. Note that
# 'PROPN' is the proper noun tag; the set holds lemmas, so uniqueness is by lemma.
proper_nouns = set()
for sentence in train:
    for token in sentence:
        if token.upos == 'PROPN':
            proper_nouns.add(token.lemma)

# Print out how many unique proper nouns there are in the corpus, then write out
# the transformed version.
print(len(proper_nouns))

# The argument to write should be a Writeable (i.e. have a write method). In this case we want to
# make sure we write to a utf-8 file.
with open('./ud/transformed.conll', 'w', encoding='utf-8') as f:
    train.write(f)

Install

pip install pyconll

Want to find out more?