def load_file(filename='text.txt'):
    """
    Read `filename` (UTF-8) and return tokenised, tagged and chunked text.

    Before any NLP processing the raw text is normalised:
    - tabs and line breaks are replaced by spaces;
    - a period directly followed by a closing quote is moved after the
      quote (``."`` becomes ``".`` and ``.'`` becomes ``'.``);
    - the characters ' " , . ? ! ; : are padded with spaces so that they
      end up as separate tokens;
    - runs of whitespace are collapsed to a single space.

    Returns a 4-tuple ``(words, sentences, tags, chunks)``:
    - words: flat list of all tokens, in order
    - sentences: list of token lists, one per sentence
    - tags: list of tag lists, parallel to `sentences`; tags are mapped
      from the 'en-ptb' tagset to the 'universal' tagset
    - chunks: for each item yielded by ``parsetree(text)``, the list of
      the ``type`` attribute of its chunks

    Raises IOError/OSError if the file cannot be opened and
    UnicodeDecodeError if its content is not valid UTF-8.
    """
    # io.open behaves the same on Python 2 and 3 and decodes while reading.
    import io

    def process_raw_text(text):
        """Tokenise, POS-tag and chunk the already normalised `text`."""
        from nltk.tokenize import sent_tokenize, word_tokenize
        from nltk.tag import pos_tag, map_tag
        from itertools import chain
        from pattern.en import parsetree

        # Only byte strings need decoding; text strings have no usable
        # .decode() on Python 3 and are passed through untouched.
        if isinstance(text, bytes):
            text = text.decode("utf8")

        chunks = [[chunk.type for chunk in sentence.chunks]
                  for sentence in parsetree(text)]

        tokenised = [word_tokenize(s) for s in sent_tokenize(text)]
        tagged = [
            [(word, map_tag('en-ptb', 'universal', tag))
             for word, tag in pos_tag(tokens)]
            for tokens in tokenised
        ]
        sentences = [[word for word, _ in sent] for sent in tagged]
        tags = [[tag for _, tag in sent] for sent in tagged]
        words = list(chain.from_iterable(sentences))
        return words, sentences, tags, chunks

    # The context manager closes the file even if reading fails.
    with io.open(filename, 'r', encoding='utf8') as f:
        c = f.read()

    # Remove breaks and tabs
    for char in ["\t", "\n"]:
        c = c.replace(char, " ")

    # Move a sentence-final period out of a closing quote. This must run
    # before the punctuation is padded below, otherwise the two-character
    # patterns no longer match.
    c = c.replace('."', '".')
    c = c.replace(".'", "'.")

    # Split special characters from words
    for char in ["'", '"', ",", ".", "?", "!", ";", ":"]:
        c = c.replace(char, " " + char + " ")

    # split() with no argument drops every run of whitespace, so joining
    # on a single space removes all multi-spaces and leading/trailing ones.
    return process_raw_text(' '.join(c.split()))