############### HOMEWORK 3, STAT 541, DUE FRI, OCT 02, 2009, 12 NOON ############### # YOUR NAME: ... # Instructions: # . Fill in your name above. Do not remove the header from this file. # . Insert your solutions below after each '# Answer:'. # . Return this file with your solutions by email (stat541.at.wharton@gmail.com) # in an attachment with filename in the following format: # YourLastname-YourFirstname-hw02.R # About this homework: # This is an exercise in manipulating character data. # The dataset is the dictionary of the English language borrowed # from Microsoft's spelling correction database. # Download the dataset 'dict.dat' from the class webpage # and read it into a vector 'dict': dict <- scan("dict.dat", what="", quo="") # Or read directly into R from the web: dict <- scan("http://www-stat.wharton.upenn.edu/~buja/STAT-541/dict.dat", what="", quo="") # (The named argument 'quo=...' makes sure that quotes in the dictionary # do not signal the start of a string # Hint: You will find the following functions useful. grep nchar paste sample sort strsplit substring table unlist #PROBLEM 0: sanity check # How many 'words' are in the dictionary? #ANSWER: length(dict) [1] 155414 ## Over 155,000 words. #PROBLEM 1: data cleaning # A dictionary is supposed to contain a word only once. # We should never make the obvious assumptions but actually # check them. Tasks: # a) Check whether there are words that appear more than once. # b) If so, remove them from 'dict' and check the length again. #ANSWER: ## We tabulate 'dict' and check whether there are counts >1: dict.tab <- table(dict) dict.tab[dict.tab > 1] odoriferous odoriferously odoriferousness odorimetries odorimetry 2 2 2 2 2 odoriphore 2 ## So there are duplicates! Curiously they are all of the "odor" kind. ## We can collect the duplicates ('dups') by peeling them off the names ## of the table, which is an associative array (vector with named elements): dups <- names(dict.tab[dict.tab > 1]) dups [1] "odoriferous" "odoriferously" "odoriferousness" "odorimetries" [5] "odorimetry" "odoriphore" ## Here is where the dups are: which(dict %in% dups) [1] 92907 92908 92909 92910 92911 92912 92913 92914 92915 92916 92917 92918 ## They are in a contiguous block: 92907:92918. ## Here they are: dict[dict %in% dups] [1] "odoriferous" "odoriferous" "odoriferously" "odoriferously" [5] "odoriferousness" "odoriferousness" "odorimetries" "odorimetries" [9] "odorimetry" "odorimetry" "odoriphore" "odoriphore" ## Turns out each word is duplicated in the next position. ## This is confirmed by sum(dict[-1]==dict[-length(dict)]) ## ## Another possibility of finding partial answers about duplicated values is: which(duplicated(dict)) [1] 92908 92910 92912 92914 92916 92918 ## Be aware that this does not report the first occurrence! ## ## Removing the dups is almost trivial and would not have required ## the above analysis: dict <- unique(dict) length(dict) [1] 155408 ## This is consistent: removed 6 dups ## If you generated 'dict.tab', then this works also: dict <- names(dict.tab) ## The following repeats the checking for dups: dict.tab <- table(dict) dict.tab[dict.tab > 1] ## which produces an empty vector; in other words, there are no dups left. #PROBLEM 2: # What are the 'words' that contain a single quote? #ANSWER: dict[grep("'",dict)] [1] "'s" "I'll" "I've" "ain't" "aren't" [6] "army's" "ass's" "b'rith" "bird'seye" "c'est" [11] "can't" "couldn't" "d'affaires" "d'arsonval" "d'etat" [16] "d'etre" "d'hote" "d'hotel" "d'oeuvre" "d'oeuvres" [21] "didn't" "doesn't" "don't" "don'ts" "dt's" [26] "entr'acte" "entr'actes" "hadn't" "hasn't" "haven't" [31] "he'd" "he'll" "he's" "isn't" "it'd" [36] "it'll" "it's" "l'etat" "li'l" "lloyd's" [41] "ma'am" "mayn't" "n'gana" "needn't" "nor'wester" [46] "nor'westers" "o'clock" "o'er" "o'neill" "od'd" [51] "od'ing" "ok'd" "ok'ing" "ok's" "shan't" [56] "she'd" "she'll" "she's" "shouldn't" "that'll" [61] "that's" "they'd" "they'll" "they're" "they've" [66] "tho'" "thro'" "wasn't" "we'd" "we'll" [71] "we're" "we've" "weren't" "who'd" "who'll" [76] "who's" "won't" "wouldn't" "you'd" "you'll" [81] "you're" "you've" ## Over 80 expressions, mostly contractions, seem to be needed in a dictionary ## to be useful for spell checking. ## (Comment: These expressions would have messed up 'scan()' if we ## had not used 'quo=""' above.) #PROBLEM 3: What is the longest word length in the dictionary? #ANSWER: # 'nchar()' gives the length of each word, hence this gives the longest word: max(nchar(dict)) [1] 31 # The longest word in this dictionary has 31 characters. #PROBLEM 4: Which are the words of maximal length and how many are there? # Do you recognize them? Why would Microsoft include such words? #ANSWER: dict[nchar(dict)==max(nchar(dict))] dict[nchar(dict)==31] # same but... [1] "dichlorodiphenyltrichloroethane" # There is only one... # The more common abbreviation for this chemical is 'DDT'. #PROBLEM 5: What does this here do? sum(nchar(dict) == 25) #ANSWER: # This code counts the number of words with length 25. [1] 10 #PROBLEM 6: Generate a frequency table that tells how many words # there are of each length. How many words are there of length 29? #ANSWER: table(nchar(dict)) 1 2 3 4 5 6 7 8 9 10 11 12 13 2 250 768 3683 7477 12651 18489 21803 21921 19803 16045 11896 8306 14 15 16 17 18 19 20 21 22 23 24 25 26 5252 3166 1725 988 507 297 165 97 51 28 19 10 4 27 28 31 3 1 1 # For example, there are three words of length 27, but none of length 29 and 30. #PROBLEM 7: Write a for-loop that prints for lengths 1 to 20 one randomly # picked word that has these lengths. Don't use 'print()'; use 'cat()' instead. #ANSWER: for(i in 1:20) cat(i, sample(dict[nchar(dict)==i], size=1), "\n") 1 I 2 rt 3 rah 4 koto 5 libra 6 tehran 7 fibular 8 gladding 9 secondary 10 phenologic 11 suspectable 12 inequalities 13 spinelessness 14 refractiveness n15 foresightedness 16 glossopharyngeal 17 retroperitoneally 18 supersophisticated 19 electroretinography 20 venoperitoneostomies ## Each call will generate a different output because 'sample()' picks randomly. #PROBLEM 8: # We are now going to find all single-word anagrams of the English # language. An anagram is a pair of words or phrases that use the # same set of characters (with repetitions) or, in other words, they # are character permutations of each other. For example, "emanates" # and "manatees" form an anagram because they have the same set of # letters:"aaeemnst". # In this problem we take the first step towards the goal of finding # all single-word anagrams by forming a new vector 'dict.srt' of the # same length as 'dict' that contains for each word in 'dict' the # sorted list of letters. That is, the entry 'manatees' in 'dict' # would have the corresponding entry 'aaeemnst' in 'dict.srt'. The # point of the vector is that if two words in 'dict' have the same # entries in 'dict.srt', they are anagrams of each other. # It might be good advice to choose any entry in 'dict' and develop # the necessary code for sorting the letters in it. Once it works, # simply loop over all entries of 'dict' and store the processed # result in the corresponding entry of 'dict.sr5'. Also, print a # message every 10000 words so you know the loop hasn't stalled. # Note: R doesn't have a character sorting function, but it has the # 'sort()' function that can sort not only number vectors numerically # but string vectors lexicographically as well. #ANSWER: ## We choose 'dict[1000]' which contains "abasia" to develop the code. ## Split into individual characters: strsplit(dict[1000],"") ## This is not a vector but a list consisting of one vector. Hence 'unlist': unlist(strsplit(dict[1000],"")) ## Now we can sort this vector of single characters: sort(unlist(strsplit(dict[1000],""))) ## Then reconcatenate into a single string: paste(sort(unlist(strsplit(dict[1000],""))), collapse="") ## Next we loop over 'dict' and apply this code to each entry, ## but first we allocate the vector 'dict.srt': dict.srt <- rep("",length(dict)) for(i in 1:length(dict)) { dict.srt[i] <- paste(sort(unlist(strsplit(dict[i],""))), collapse="") if(i%%10000==0) print(i) } ## Here is another solution, w/o print statement, ## which is a touch faster (15 vs. 23 sec on my laptop) ## and eliminates the need for 'unlist()': date(); dict.srt <- sapply(strsplit(dict, ""), function(x) paste(sort(x), collapse="")); date() ## The 'date()' wrapper allows you to measure elapsed time. ## You can wrap 'date()' around the above loop to confirm the speed difference. ## The sapply() function is like lapply() except it does unlist() when possible ## and it even returns matrices if the all items in the list are of the ## same basic type and same length. #PROBLEM 9: # Find the ten largest anagram sets, that is, the character sets from # which the most anagrams can be formed. # To go about the problem, create a table that tells for each # character set how many anagrams can be formed from it. # For example, the character set "cdeir" has 4 words from which to # form anagrams (they happend to be "cider" "cried" "dicer" "riced"); # the table should have a count of 4 associated with "cdeir". # Sort this table in decreasing order and call the result 'dict.srt.tab'. # Peeling off the first 10 elements gives you the 10 character sets from # which the most anagrams can be formed. Find a way to loop over these # 10 charactersets and list the anagram words for each. #ANSWER: ## Use the 'table()' function to tabulate the character sets in 'dict.srt'. ## Not only that, sort the counts in decreasing order: dict.srt.tab <- sort(table(dict.srt), decreasing=T) # (takes a few seconds) ## The 12 largest counts are: dict.srt.tab[1:10] aceprs aelst aeprs acerst acert aelrst aerst abest acenrt adeprs 9 9 9 8 8 8 8 7 7 7 ## So we have three anagram sets of size 9, four anagram sets of size 8,... ## Next we list the actual anagram sets: charsets <- names(dict.srt.tab)[1:10] for(crs in charsets) cat(crs,": ",dict[dict.srt==crs],"\n") aceprs : capers carpes casper escarp pacers parsec recaps scrape spacer aelst : least leats setal slate stale steal tales teals tesla aeprs : pares parse pears prase presa rapes reaps spare spear acerst : carets cartes caster caters crates reacts recast traces acert : caret carte cater crate creta react recta trace aelrst : alerts alters estral laster salter slater staler stelar aerst : aster rates resat satre stare tares tears teras abest : abets baste bates beast beats betas tabes acenrt : canter carnet centra cretan nectar recant trance adeprs : drapes padres parsed rasped spader spared spread ## Obviously not all are household words. The dictionary is used for ## spell checking, hence it has to include some rare words as well. ## Btw, in Problems 11-12 we will see that there are more charactersets ## with an anagram set of size 7. #PROBLEM 10: Continuation of PROBLEMs 8 and 9. # What does the following code generate? dict.srt.siz <- dict.srt.tab[dict.srt] #ANSWER: ## This vector is of the same length as 'dict.srt' and hence 'dict'. ## For each entry in 'dict.srt' (and 'dict') it gives the size of the ## anagram word set to which this word belongs. One less is the number ## of anagrams this word has. #PROBLEM 11: # How many words are there that have no anagram, 1 anagram, # 2 anagrams, 3 anagrams,...? ##ANSWER: ## The simplest is to tabulate 'dict.srt.siz', but keeping ## in mind that an anagram set size of 1 means there is no ## anagram: table(dict.srt.siz-1) 0 1 2 3 4 5 6 7 8 134173 14892 4044 1372 520 264 84 32 27 ## Thus 134,173 out of 155408 words (=length(dict)) have no anagrams, ## 14,892 words have exactly one anagram, 4,044 have exactly 2 anagrams,... #PROBLEM 12: # What does the output of the following code tell us? table(dict.srt.tab) # How does this output relate to the output of Problem 11? #ANSWER: ## The code generates the following output: 1 2 3 4 5 6 7 8 9 134173 7446 1348 343 104 44 12 4 3 ## It tells us that there are exactly 7,446 anagram sets of size 2, ## 1,348 anagram sets of size 3, 343 anagram sets of size 4,... ## The relation to Problem 11 is that, for example, 7,446 anagram sets ## of size 2 implies that there are 2*7446=14892 words with exactly ## 1 anagram, and 3*1348=4044 words with exactly 2 anagrams,... #PROBLEM 13: This is unconnected to the anagram exercise above. # It is a simple exercise in building a dataframe that collects # various characteristics of each word in 'dict'. In detail, # form a dataframe 'dict.dfr' with as many rows as there are elements # in 'dict' and columns that contain the following: # col 1: the lengths of the words (numeric) # col 2: whether the word starts with a capital letter (logical) # col 3: whether the word starts with a lower case vowel # ("a","e","i","y","o","u") # col 4: whether the word ends with a letter "s" # Hint: Check out the binary operation '%in%'. # Give the columns the names 'nchar', 'cap', 'l.vow', 'end.s'. # Turn 'dict.dfr' into an associative data structure by assigning 'dict' # as rownames so that the following allows you to retrieve the above # variables for each word of the dictionary: dict.dfr[c("and","Andy","sets","dichlorodiphenyltrichloroethane"),] #ANSWER: dict.dfr <- data.frame(nchar = nchar(dict), cap = (substring(dict,1,1) %in% LETTERS), l.vow = (substring(dict,1,1) %in% c("a","e","i","y","o","u")), end.s = (substring(dict,nchar(dict),nchar(dict))=="s") ) rownames(dict.dfr) <- dict ## The dataframe works as an associative array; dict.dfr[c("and","Andy","sets","dichlorodiphenyltrichloroethane"),] nchar cap l.vow end.s and 3 FALSE TRUE FALSE Andy 4 TRUE FALSE FALSE sets 4 FALSE FALSE TRUE dichlorodiphenyltrichloroethane 31 FALSE FALSE FALSE ################ ## ## Final note: special characters ## ## For those who noticed special characters in the words of the dictionary, ## yes, there are some, and you can track them down as follows: letters.dict <- sort(unique(unlist(strsplit(dict,"")))) letters.dict [1] "'" "a" "A" "b" "B" "c" "C" "ç" "d" "D" "e" "E" "é" "f" "F" "g" "G" "h" "H" [20] "i" "I" "j" "J" "k" "K" "l" "L" "m" "M" "n" "N" "ñ" "o" "O" "ô" "p" "P" "q" [39] "r" "R" "s" "S" "t" "T" "u" "U" "v" "V" "w" "W" "x" "X" "y" "Y" "z" "Z" letters.spec <- setdiff(letters.dict, c(letters,LETTERS)) letters.spec [1] "'" "ç" "é" "ñ" "ô" ## Here are the words in 'dict' with non-single-quote special characters: dict[grep(paste("[",paste(letters.spec[-1],collapse=""),"]",sep=""), dict)] [1] "café" "entrecôte" "entrecôtes" "entrée" "entrées" [6] "garçon" "mañana" "résumé" "señor" "señora" [11] "señoras" "señores" "señorita" "señoritas" "señors" ## Thus we have very few imports from French and Spanish that are ## written in their native spellings.