import nltk
import re
import string
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.corpus import brown
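# The Brown corpus and the Punkt tokenizer data must be present locally;
# if they are not, run e.g. nltk.download("brown") and nltk.download("punkt").
# Frequencies of every (lowercased) word in the Brown corpus, used later to identify common English words.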
brown_frequency_list = FreqDist(i.lower() for i in brown.words())
# Read the full text of Chapter 1.
with open("pride_and_prejudice_chapter_one.txt", "r") as file:
    pride_text=file.read()
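# Tokenize the chapter and report how many tokens it contains.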
from nltk.tokenize import word_tokenize
words=word_tokenize(pride_text)
print(len(words))
# Collect each character's annotated speech; annotations look like "#character{mr_bennet}".
mb=[]
msb=[]
pattern=r"#character\{(?P<char_name>[A-Za-z_]*)\}"
with open("annotated_pride.txt", "r") as afile:
    for line in afile:
        match=re.search(pattern, line)
        if match:
            name=match.group("char_name")
            remainder=line[match.end():]
            if name=="mr_bennet":
                mb.append(remainder)
            else:
                msb.append(remainder)
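# Join each character's lines into one text; skip lines containing "made no answer",
# which report Mr. Bennet's silence rather than his speech.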
mb_text="\n".join([l for l in mb if ("made no answer" not in l)])
msb_text="\n".join(msb)
# Drop tokens that appear inside string.punctuation (plus curly quotes).
def remove_punctuation(myList):
    return [w for w in myList if w not in (string.punctuation+"“”")]
# The 2,000 most frequent Brown-corpus words count as "common" vocabulary.
common_words=[word for word, _ in brown_frequency_list.most_common(2000)]
# Lowercase each token, strip periods, and keep only words not in the common list (deduplicated via a set).
def filter_common_words(myList):
    return list(set([word.replace(".", "").lower() for word in myList if word.replace(".", "").lower() not in common_words]))
# Tokenize, strip punctuation, filter out common words, and return the rest sorted.
def rare_words(myText):
    to_ret=filter_common_words(remove_punctuation(word_tokenize(myText)))
    to_ret.sort()
    return to_ret
mb_words=rare_words(mb_text)
msb_words=rare_words(msb_text)
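# Words unique to each speaker, and the words both of them use.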
dmb_words=[word for word in mb_words if word not in msb_words]
dmsb_words=[word for word in msb_words if word not in mb_words]
shared_words=[word for word in mb_words if word in msb_words]
print("Mr. Bennet's vocabulary")
print(", ".join(dmb_words))
print("Mrs. Bennet's vocabulary")
print(", ".join(dmsb_words))
print("Shared Vocabulary")
print(", ".join(shared_words))