We begin our analysis of "Walden" by computing a list of all words appearing in the text, together with the number of occurrences of each word, sorted from the most frequent word to the least frequent:

In [3]:
walden = sorted_list('walden.txt')
print(walden)
[('the', 6249), ('of', 3157), ('a', 2616), ('to', 2557), ('and', 2370), ('in', 1677), ('i', 1191), ('it', 1158), ('is', 1118), ('that', 1022), ('not', 825), ('was', 804), ('as', 772), ('with', 681), ('my', 673), ('for', 658), ('on', 633), ('be', 630), ('his', 623), ('have', 588), ('which', 575), ('by', 573), ('are', 545), ('he', 538), ('at', 521), ('had', 499), ('or', 490), ('they', 443), ('from', 435), ('this', 433), ('their', 431), ('one', 402), ('all', 398), ('were', 356), ('so', 343), ('its', 331), ('an', 323), ('more', 317), ('we', 315), ('me', 307), ('them', 305), ('some', 296), ('if', 291), ('would', 284), ('than', 279), ('our', 266), ('there', 265), ('man', 265), ('him', 247), ('you', 245), ('will', 233), ('only', 230), ('no', 221), ('when', 220), ('out', 219), ('like', 215), ('up', 212), ('who', 212), ('but', 212), ('has', 206), ('pond', 200), ('house', 197), ('any', 194), ('life', 193), ('day', 187), ('may', 184), ('what', 179), ('over', 178), ('most', 176), ('into', 176), ('men', 175), ('water', 173), ('do', 168), ('time', 163), ('could', 162), ('never', 158), ('been', 158), ('such', 155), ('other', 151), ('woods', 150), ('should', 149), ('still', 148), ('can', 147), ('many', 145), ('long', 141), ('about', 140), ('much', 137), ('well', 126), ('see', 126), ('without', 126), ('through', 124), ('these', 124), ('made', 122), ('ice', 120), ('new', 119), ('very', 117), ('did', 115), ('first', 114), ('two', 110), ('down', 108)]

... (hundreds of additional lines of output omitted here; the end of the list follows) ...

[('britain', 1), ('crumbled', 1), ('tremont', 1), ('margin', 1), ('ending', 1), ('lapsing', 1), ('sweeter', 1), ('extremely', 1), ('suckers', 1), ('silicious', 1), ('withe', 1), ('dilute', 1), ('hyades', 1), ('manifested', 1), ('dined', 1), ('thanked', 1), ('overcame', 1), ('vexed', 1), ('hummed', 1), ('carbuncles', 1), ('inveteracies', 1), ('uncultivated', 1), ('sonorously', 1), ('viii', 1), ('owned', 1), ('whittled', 1), ('critical', 1), ('stroll', 1), ('treacherously', 1), ('shelves', 1), ('sewing', 1), ('ricochet', 1), ('avail', 1), ('expert', 1), ('neutral', 1), ('undeveloped', 1), ('nocumentum', 1), ('operative', 1), ('convicted', 1), ('amphitheatre', 1), ('durum', 1), ('destination', 1), ('distracted', 1), ('particulars', 1), ('bone', 1), ('apios', 1), ('wounded', 1), ('conduces', 1), ('guess', 1), ('whetted', 1), ('purgative', 1), ('designed', 1), ('astrologically', 1), ('shivered', 1), ('smelling', 1), ('burying', 1), ('futurity', 1), ("irishman's", 1), ('headedness', 1), ('devoting', 1), ('cruise', 1), ('dissipates', 1), ('normal', 1), ("hydra's", 1), ('angels', 1), ('elizabeth', 1), ('satisfactory', 1), ('reels', 1), ('starch', 1), ('manured', 1), ('loiter', 1), ('fragment', 1), ('tunnel', 1), ('newfoundland', 1), ('descended', 1), ('invites', 1), ('predict', 1), ('rending', 1), ('grasses', 1), ('conceiving', 1), ('penetrates', 1), ('barter', 1), ('heave', 1), ('consume', 1), ('che', 1), ('evergreen', 1), ('motto', 1), ('raccoon', 1), ('smiling', 1), ('unable', 1), ('undoubtedly', 1), ('couch', 1), ('rind', 1), ('ticking', 1), ('available', 1), ('eyelids', 1), ('stakes', 1), ('lea', 1), ('disregard', 1), ('woodpile', 1)]