pyconll provides a basic, low-level wrapper around the CoNLL annotation scheme. pyconll's sensible API allows for easy automation with little ramp-up time, and it works as a great building block for creating larger computational linguistics (CL) systems.
# Make sure 'Lord of the Rings' has 'Lord' and 'Rings' annotated as regular NOUN.
import pyconll
import pyconll.util

UD_ENGLISH_TRAIN = './ud/train.conll'
NGRAM = 'Lord of the Rings'.split()

train = pyconll.load_from_file(UD_ENGLISH_TRAIN)

# util#find_ngrams provides an iterator of tuples where the first element is the
# sentence where the ngram was found, the second element is the numeric index of
# the ngram occurrence in the sentence, and the third element is a list of the
# tokens that matched the ngram.
for sentence, i, tokens in pyconll.util.find_ngrams(train, NGRAM):
    # tokens is a list, so the first ('Lord') and last ('Rings') tokens have to
    # be indexed individually; the upos attribute lives on each token.
    tokens[0].upos = 'NOUN'
    tokens[-1].upos = 'NOUN'

# After making sure these occurrences are properly handled, go through the
# corpus and determine how many unique proper nouns are annotated.
proper_nouns = set()
for sentence in train:
    for token in sentence:
        if token.upos == 'PROPN':
            proper_nouns.add(token.lemma)

# Print out how many unique proper nouns (by lemma) there are in the corpus.
print(len(proper_nouns))

# Write out the transformed version of the corpus. The argument to write should
# be a Writeable (i.e. have a write method). In this case we want to make sure
# we write to a utf-8 file.
with open('./ud/transformed.conll', 'w', encoding='utf-8') as f:
    train.write(f)
pip install pyconll