import nltk
import re
import string
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.corpus import brown
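# The Brown corpus and the Punkt tokenizer data must be present locally;
# if they are not, run e.g. nltk.download("brown") and nltk.download("punkt").
# Frequencies of every (lowercased) word in the Brown corpus, used later to identify common English words.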
brown_frequency_list = FreqDist(i.lower() for i in brown.words())
# Read the full text of Chapter 1.
with open("pride_and_prejudice_chapter_one.txt", "r") as file:
    pride_text=file.read()
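# Tokenize the chapter and report how many tokens it contains.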
from nltk.tokenize import word_tokenize
words=word_tokenize(pride_text)
print(len(words))
# Collect each character's annotated speech; annotations look like "#character{mr_bennet}".
mb=[]
msb=[]
pattern=r"#character\{(?P<char_name>[A-Za-z_]*)\}"
with open("annotated_pride.txt", "r") as afile:
    for line in afile:
        match=re.search(pattern, line)
        if match:
            name=match.group("char_name")
            remainder=line[match.end():]
            if name=="mr_bennet":
                mb.append(remainder)
            else:
                msb.append(remainder)
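# Join each character's lines into one text; skip lines containing "made no answer",
# which report Mr. Bennet's silence rather than his speech.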
mb_text="\n".join([l for l in mb if ("made no answer" not in l)])
msb_text="\n".join(msb)
# Drop tokens that appear inside string.punctuation (plus curly quotes).
def remove_punctuation(myList):
    return [w for w in myList if w not in (string.punctuation+"“”")]
# The 2,000 most frequent Brown-corpus words count as "common" vocabulary.
common_words=[word for word, _ in brown_frequency_list.most_common(2000)]
# Lowercase each token, strip periods, and keep only words not in the common list (deduplicated via a set).
def filter_common_words(myList):
    return list(set([word.replace(".", "").lower() for word in myList if word.replace(".", "").lower() not in common_words]))
# Tokenize, strip punctuation, filter out common words, and return the rest sorted.
def rare_words(myText):
    to_ret=filter_common_words(remove_punctuation(word_tokenize(myText)))
    to_ret.sort()
    return to_ret
mb_words=rare_words(mb_text)
msb_words=rare_words(msb_text)
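# Words unique to each speaker, and the words both of them use.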
dmb_words=[word for word in mb_words if word not in msb_words]
dmsb_words=[word for word in msb_words if word not in mb_words]
shared_words=[word for word in mb_words if word in msb_words]
print("Mr. Bennet's vocabulary")
print(", ".join(dmb_words))
print("Mrs. Bennet's vocabulary")
print(", ".join(dmsb_words))
print("Shared Vocabulary")
print(", ".join(shared_words))