pyconll

Code Sample


                        # Example script: re-tag every occurrence of 'Lord of the Rings' so that
                        # 'Lord' and 'Rings' are annotated as regular NOUN, count the unique
                        # proper-noun lemmas left in the corpus, and write the modified corpus out.
                        import pyconll
                        import pyconll.util

                        UD_ENGLISH_TRAIN = './ud/train.conll'
                        NGRAM = 'Lord of the Rings'.split()

                        train = pyconll.load_from_file(UD_ENGLISH_TRAIN)

                        # util#find_ngrams provides an iterator of tuples where the first element is
                        # the sentence where the ngram was found, the second element is the numeric
                        # index of the ngram occurrence in the sentence, and the third element is a
                        # list of the tokens that matched the ngram. Only the tokens are needed here,
                        # so the sentence and index are deliberately ignored.
                        for _sentence, _index, tokens in pyconll.util.find_ngrams(train, NGRAM):
                            # First and last tokens of the match are 'Lord' and 'Rings'.
                            tokens[0].upos = 'NOUN'
                            tokens[-1].upos = 'NOUN'

                        # After making sure these occurrences are properly handled, go through the
                        # corpus and collect the lemma of every token still tagged as a proper noun
                        # (UPOS 'PROPN'). A set is used so each lemma is counted only once.
                        proper_nouns = {
                            token.lemma
                            for sentence in train
                            for token in sentence
                            if token.upos == 'PROPN'
                        }

                        # Print out how many unique proper nouns there are in the corpus.
                        print(len(proper_nouns))

                        # Write out the transformed corpus. The argument to write should be a
                        # Writeable (i.e. have a write method). In this case we want to make sure we
                        # write to a utf-8 file.
                        with open('./ud/transformed.conll', 'w', encoding='utf-8') as f:
                            train.write(f)
pyconll

A minimal, all-Python, no-dependency library to parse CoNLL files.

Code Sample

Install