Skip to main content

Posts

Showing posts with the label Python NLTK Tokenize

Python NLTK Tokenize

def load_file(filename='text.txt'): """ Reads all text in filename, returns the following triplet: - list of all words - sentences (ordered list of words, per sentence) - POS-tags (ordered list of tags, per sentence) - chunks """ def process_raw_text(text): from nltk.tokenize import sent_tokenize, word_tokenize from nltk.tag import pos_tag, map_tag from itertools import chain from pattern.en import parsetree flatten = lambda x : list(chain(*x)) simplify_tag = lambda t : map_tag('en-ptb', 'universal', t) text = text.decode("utf8") chunks = [ [ c.type for c in t.chunks ] for t in parsetree(text) ] sentences = sent_tokenize(text) sentences = [ word_tokenize(s) for s in sentences ] sentences_tags = [ [ (w, simplify_tag(t)) for w, t in pos_tag(s) ] for s in sentences ] sentences = [ [ w for w, _ in s] for s in sentences_tags ] tags = [ [ t for _, t in s] for s in sentences_tags ] …