pyconll provides a basic, low-level wrapper around the CoNLL annotation scheme. pyconll's sensible API allows for easy automation and little ramp-up time, and it works as a great building block for creating larger computational linguistics (CL) systems.
# Example: make sure every occurrence of 'Lord of the Rings' has 'Lord' and
# 'Rings' annotated as regular NOUN, then count the distinct proper-noun
# lemmas that remain in the corpus.
import pyconll
import pyconll.util

UD_ENGLISH_TRAIN = './ud/train.conll'
NGRAM = 'Lord of the Rings'.split()

train = pyconll.load_from_file(UD_ENGLISH_TRAIN)

# util#find_ngrams provides an iterator of tuples where the first element is
# the sentence where the ngram was found, the second element is the numeric
# index of the ngram occurrence in the sentence, and the third element is a
# list of the matching tokens.
for _sentence, _index, tokens in pyconll.util.find_ngrams(train, NGRAM):
    # Retag only the first ('Lord') and last ('Rings') tokens of the match;
    # 'of' and 'the' keep their existing tags. The lowercase `upos`
    # attribute is used so the write targets the same field read below.
    tokens[0].upos = 'NOUN'
    tokens[-1].upos = 'NOUN'

# After making sure these occurrences are properly handled, go through the
# corpus and determine how many unique proper nouns are annotated. A set is
# used so each lemma is counted once, however often it occurs.
proper_noun_lemmas = {
    token.lemma
    for sentence in train
    for token in sentence
    if token.upos == 'PROPN'
}

print(len(proper_noun_lemmas))
pip install pyconll