In *Gathering Text from Project Gutenberg*, we retrieved text from Project Gutenberg and built a couple of helper functions along the way. The functions make us more efficient, but what we really need for this work is a class: it will allow us to store and analyze many different texts efficiently. Let’s build that.

Let’s make a class called Book that will store the methods and variables we need. Each book will have a title, an author, a URL (from Project Gutenberg), a position where the text actually starts, and a position where the text actually ends. We’ll also want to hold the raw text, the tokens, and the parts of speech within the class. We’ll build methods within the class to determine each of these.

from urllib import request
import os
import re
import nltk

# English stop words from NLTK, used by Book.useful_words to filter out common
# words. Requires the "stopwords" corpus (nltk.download("stopwords")).
stopwords = set(nltk.corpus.stopwords.words("english"))


class Book:
    """A single Project Gutenberg text.

    Downloads (or loads a locally cached copy of) the raw text, then lazily
    derives the trimmed text, tokens, filtered word list, and part-of-speech
    groupings. Each derived value is computed on first access and cached on
    the instance (the trailing-underscore attributes).
    """

    # Directory where downloaded texts are cached between runs.
    CACHE_DIR = "corpora/canon_texts"

    def __init__(self, title, author, url):
        """
        Creates an instance of a book with a title, author, and URL of the text.
        The URL must come from Project Gutenberg and should be of the following
        format: http://www.gutenberg.org/cache/epub/84/pg84.txt

        ``start`` is the index where the book actually begins in the text file
        (skipping headers, publisher info, etc.); ``end`` is the index where
        the book text ends (skipping the Gutenberg license footer).
        """
        self.title = title
        self.author = author
        self.url = url
        # Lazily populated caches backing the properties below.
        self.full_raw_ = None
        self.start_ = None
        self.end_ = None
        self.raw_ = None
        self.tokens_ = None
        self.useful_words_ = None
        self.pos_ = None

    def __str__(self):
        return self.title

    @property
    def full_raw(self):
        """
        The complete downloaded text, including Gutenberg header/footer.

        Checks for the text locally first; if it isn't found, fetches it from
        Project Gutenberg and saves a copy for next time.
        """
        if self.full_raw_ is None:
            # First check whether the file is stored locally.
            fname = os.path.join(self.CACHE_DIR, self.title)
            if os.path.isfile(fname):
                print("{title} file already exists".format(title=self.title))
                print("Extracting text from file")
                # utf-8 explicitly: the cached file was written from a
                # utf-8-sig decode, so the platform default may be wrong.
                with open(fname, "r", encoding="utf-8") as f:
                    full_raw = f.read()
            else:
                print(
                    "{title} does not already exist. Grabbing from Project Gutenberg".format(
                        title=self.title
                    )
                )
                response = request.urlopen(self.url)
                # utf-8-sig strips the BOM that some Gutenberg files carry.
                full_raw = response.read().decode("utf-8-sig")
                print("Now let's save it")
                # Ensure the cache directory exists before writing.
                os.makedirs(self.CACHE_DIR, exist_ok=True)
                with open(fname, "w", encoding="utf-8") as outfile:
                    outfile.write(full_raw)
            self.full_raw_ = full_raw
        return self.full_raw_

    @property
    def start(self):
        """
        Index in ``full_raw`` where the book text actually starts.

        Most Gutenberg texts share a similar intro that this skips: the
        "*** START OF ... ***" marker, then (if present) the first repeat of
        the title and of the author's name. Falls back to 0 when no marker
        is found.
        """
        if self.start_ is None:
            start_regex = r"\*\*\*\s?START OF TH(IS|E) PROJECT GUTENBERG EBOOK.*\*\*\*"
            header = re.search(start_regex, self.full_raw)
            if header is None:
                # No recognizable header; keep the whole text.
                self.start_ = 0
                return self.start_
            self.start_ = header.end()

            # re.escape: titles/authors may contain regex metacharacters.
            remainder = self.full_raw[self.start_ :].lower()
            title_match = re.search(re.escape(self.title.lower()), remainder)
            if title_match:
                self.start_ += title_match.end()
                # If the title is present, check for the author's name as well.
                remainder = self.full_raw[self.start_ :].lower()
                author_match = re.search(re.escape(self.author.lower()), remainder)
                if author_match:
                    self.start_ += author_match.end()
        return self.start_

    @property
    def end(self):
        """
        Index in ``full_raw`` where the book text ends.

        Most Project Gutenberg texts have additional license material after
        the "END OF ... EBOOK" marker that this skips. Falls back to the full
        length when the marker is missing.
        """
        if self.end_ is None:
            end_regex = "end of th(is|e) project gutenberg ebook"
            end_position = re.search(end_regex, self.full_raw.lower())
            self.end_ = end_position.start() if end_position else len(self.full_raw)
        return self.end_

    @property
    def raw(self):
        """
        The raw book text, trimmed to the span between ``start`` and ``end``.
        """
        if self.raw_ is None:
            self.raw_ = self.full_raw[self.start : self.end]
        return self.raw_

    @property
    def tokens(self):
        """
        The text tokenized with ``nltk.word_tokenize``.
        """
        if self.tokens_ is None:
            self.tokens_ = nltk.word_tokenize(self.raw)
        return self.tokens_

    @property
    def useful_words(self):
        """
        Lowercased words from the text with punctuation, digits, and English
        stop words removed.
        """
        if self.useful_words_ is None:
            # Keep letters only, then split on whitespace.
            letters_only = re.sub("[^a-zA-Z]", " ", self.raw)
            words = letters_only.lower().split()
            self.useful_words_ = [word for word in words if word not in stopwords]
        return self.useful_words_

    @property
    def pos(self):
        """
        Groups the tagged tokens by part of speech.

        Returns a list of ten lists of (word, tag) pairs, in this order:
        common nouns, verbs, adjectives, pronouns, adverbs, proper nouns,
        conjunctions, prepositions, interjections, modals. Tokens whose tag
        falls outside these groups (e.g. CD, DT, punctuation) are dropped.
        """
        if self.pos_ is None:
            # Use NLTK's built-in tagger.
            tagged = nltk.pos_tag(self.tokens)

            # Note that IN can be either a preposition or a conjunction; for
            # now we list it with the prepositions. MD (modal) is also a kind
            # of verb but gets its own bucket here.
            categories = [
                {"NN", "NNS"},                                # common nouns
                {"VB", "VBD", "VBG", "VBN", "VBP", "VBZ"},    # verbs
                {"JJ", "JJR", "JJS"},                         # adjectives
                {"PRP", "PRP$", "WP", "WP$"},                 # pronouns
                {"RB", "RBR", "RBS", "WRB"},                  # adverbs
                {"NNP", "NNPS"},                              # proper nouns
                {"CC"},                                       # conjunctions
                {"IN", "TO"},                                 # prepositions
                {"UH"},                                       # interjections
                {"MD"},                                       # modals
            ]
            buckets = [[] for _ in categories]
            tag_to_bucket = {
                tag: bucket
                for tags, bucket in zip(categories, buckets)
                for tag in tags
            }
            for token in tagged:
                bucket = tag_to_bucket.get(token[1])
                if bucket is not None:
                    bucket.append(token)

            # TODO: append modals to the verbs list, and build a combined
            # nouns list from proper + common nouns.
            self.pos_ = buckets
        return self.pos_