This example builds on the text mining chapter from Modern Data Science with R: http://mdsr-book.github.io/.
library(mdsr)
library(tidyr)
library(tm)
library(wordcloud)
data(Macbeth_raw)
# Split the raw text on "\r\n" line endings so that each element of the
# resulting character vector is one line of the file.
# strsplit returns a list: we only want the first element
macbeth <- strsplit(Macbeth_raw, "\r\n")[[1]]
# Number of lines obtained from the split.
length(macbeth)
## [1] 3193
# Peek at the first few lines of the text.
head(macbeth)
## [1] "This Etext file is presented by Project Gutenberg, in"
## [2] "cooperation with World Library, Inc., from their Library of the"
## [3] "Future and Shakespeare CDROMS. Project Gutenberg often releases"
## [4] "Etexts that are NOT placed in the Public Domain!!"
## [5] ""
## [6] "*This Etext has certain copyright implications you should read!*"
# Look at a slice further in to see how speaker names and dialogue are laid out.
macbeth[300:310]
## [1] "meeting a bleeding Sergeant."
## [2] ""
## [3] " DUNCAN. What bloody man is that? He can report,"
## [4] " As seemeth by his plight, of the revolt"
## [5] " The newest state."
## [6] " MALCOLM. This is the sergeant"
## [7] " Who like a good and hardy soldier fought"
## [8] " 'Gainst my captivity. Hail, brave friend!"
## [9] " Say to the King the knowledge of the broil"
## [10] " As thou didst leave it."
## [11] " SERGEANT. Doubtful it stood,"
grep()
The grep() function works using a needle-in-a-haystack paradigm, wherein the first argument is the regular expression (or pattern) you want to find (i.e., the needle) and the second argument is the character vector in which you want to find patterns (i.e., the haystack). Note that unless the value argument is set to TRUE, grep() returns the indices of the haystack in which the needles were found.
# value = TRUE makes grep() return the matching lines themselves rather than
# their indices.
# NOTE(review): as written, with a single leading space, this pattern would
# also match "LADY MACBETH" lines, yet none appear in the recorded head()
# output below -- the original pattern may have contained more leading
# whitespace. Confirm against the raw text.
macbeth_lines <- grep(" MACBETH", macbeth, value = TRUE)
length(macbeth_lines)
## [1] 147
head(macbeth_lines)
## [1] " MACBETH, Thane of Glamis and Cawdor, a general in the King's"
## [2] " MACBETH. So foul and fair a day I have not seen."
## [3] " MACBETH. Speak, if you can. What are you?"
## [4] " MACBETH. Stay, you imperfect speakers, tell me more."
## [5] " MACBETH. Into the air, and what seem'd corporal melted"
## [6] " MACBETH. Your children shall be kings."
# Without value = TRUE, grep() returns indices, so the length of the result
# is the number of matching lines.
length(grep(" MACDUFF", macbeth))
## [1] 60
The grepl() function uses the same syntax but returns a logical vector as long as the haystack. Thus, while the length of the vector returned by grep() is the number of matches, the length of the vector returned by grepl() is always the same as the length of the haystack vector.
# grep() returns one index per matching line, so its length is the match count.
length(grep(" MACBETH", macbeth))
## [1] 147
# grepl() returns one TRUE/FALSE per input line, so its length equals
# length(macbeth).
length(grepl(" MACBETH", macbeth))
## [1] 3193
However, both will subset the original vector in the same way, and thus in this respect they are functionally equivalent.
# Subsetting by the integer indices from grep() and by the logical mask from
# grepl() selects the same lines.
identical(macbeth[grep(" MACBETH", macbeth)],
macbeth[grepl(" MACBETH", macbeth)])
## [1] TRUE
To extract the piece of each matching line that actually matched, use the str_extract() function from the stringr package.
# stringr supplies str_extract(), which returns the matched substring rather
# than the whole line.
library(stringr)
pattern <- " MACBETH"
# Keep only the lines that match, then pull out just the matched text.
grep(pattern, macbeth, value = TRUE) %>%
str_extract(pattern) %>%
head()
## [1] " MACBETH" " MACBETH" " MACBETH" " MACBETH" " MACBETH" " MACBETH"
# `.` is a metacharacter that matches any single character, so "MAC." also
# matches the "MACH" in "MACHINE".
head(grep("MAC.", macbeth, value = TRUE))
## [1] "MACHINE READABLE COPIES MAY BE DISTRIBUTED SO LONG AS SUCH COPIES"
## [2] "MACHINE READABLE COPIES OF THIS ETEXT, SO LONG AS SUCH COPIES"
## [3] "WITH PERMISSION. ELECTRONIC AND MACHINE READABLE COPIES MAY BE"
## [4] "THE TRAGEDY OF MACBETH"
## [5] " MACBETH, Thane of Glamis and Cawdor, a general in the King's"
## [6] " LADY MACBETH, his wife"
# Escaping the period makes it match a literal "."; the backslash is doubled
# because it must itself be escaped inside an R string.
head(grep("MACBETH\\.", macbeth, value = TRUE))
## [1] " MACBETH. So foul and fair a day I have not seen."
## [2] " MACBETH. Speak, if you can. What are you?"
## [3] " MACBETH. Stay, you imperfect speakers, tell me more."
## [4] " MACBETH. Into the air, and what seem'd corporal melted"
## [5] " MACBETH. Your children shall be kings."
## [6] " MACBETH. And Thane of Cawdor too. Went it not so?"
# A character class in brackets matches any one character in the set; here
# the range B-Z, which still includes the "H" in "MACHINE".
head(grep("MAC[B-Z]", macbeth, value = TRUE))
## [1] "MACHINE READABLE COPIES MAY BE DISTRIBUTED SO LONG AS SUCH COPIES"
## [2] "MACHINE READABLE COPIES OF THIS ETEXT, SO LONG AS SUCH COPIES"
## [3] "WITH PERMISSION. ELECTRONIC AND MACHINE READABLE COPIES MAY BE"
## [4] "THE TRAGEDY OF MACBETH"
## [5] " MACBETH, Thane of Glamis and Cawdor, a general in the King's"
## [6] " LADY MACBETH, his wife"
# Parentheses with `|` express alternation: "MAC" followed by either B or D.
head(grep("MAC(B|D)", macbeth, value = TRUE))
## [1] "THE TRAGEDY OF MACBETH"
## [2] " MACBETH, Thane of Glamis and Cawdor, a general in the King's"
## [3] " LADY MACBETH, his wife"
## [4] " MACDUFF, Thane of Fife, a nobleman of Scotland"
## [5] " LADY MACDUFF, his wife"
## [6] " MACBETH. So foul and fair a day I have not seen."
# `^` anchors the match to the start of the line, so the leading whitespace
# and "MAC" must appear at the very beginning.
head(grep("^ MAC[B-Z]", macbeth, value = TRUE))
## [1] " MACBETH, Thane of Glamis and Cawdor, a general in the King's"
## [2] " MACDUFF, Thane of Fife, a nobleman of Scotland"
## [3] " MACBETH. So foul and fair a day I have not seen."
## [4] " MACBETH. Speak, if you can. What are you?"
## [5] " MACBETH. Stay, you imperfect speakers, tell me more."
## [6] " MACBETH. Into the air, and what seem'd corporal melted"
# `?` makes the preceding space optional (zero or one occurrence).
# NOTE(review): only the unindented MACHINE lines match in the output below,
# which suggests the speaker lines carry more leading whitespace than this
# pattern allows -- confirm against the raw text.
head(grep("^ ?MAC[B-Z]", macbeth, value = TRUE))
## [1] "MACHINE READABLE COPIES MAY BE DISTRIBUTED SO LONG AS SUCH COPIES"
## [2] "MACHINE READABLE COPIES OF THIS ETEXT, SO LONG AS SUCH COPIES"
# `*` allows zero or more of the preceding space, so both unindented and
# indented lines match.
head(grep("^ *MAC[B-Z]", macbeth, value = TRUE))
## [1] "MACHINE READABLE COPIES MAY BE DISTRIBUTED SO LONG AS SUCH COPIES"
## [2] "MACHINE READABLE COPIES OF THIS ETEXT, SO LONG AS SUCH COPIES"
## [3] " MACBETH, Thane of Glamis and Cawdor, a general in the King's"
## [4] " MACDUFF, Thane of Fife, a nobleman of Scotland"
## [5] " MACBETH. So foul and fair a day I have not seen."
## [6] " MACBETH. Speak, if you can. What are you?"
# `+` requires one or more of the preceding space, which excludes the
# unindented MACHINE lines.
head(grep("^ +MAC[B-Z]", macbeth, value = TRUE))
## [1] " MACBETH, Thane of Glamis and Cawdor, a general in the King's"
## [2] " MACDUFF, Thane of Fife, a nobleman of Scotland"
## [3] " MACBETH. So foul and fair a day I have not seen."
## [4] " MACBETH. Speak, if you can. What are you?"
## [5] " MACBETH. Stay, you imperfect speakers, tell me more."
## [6] " MACBETH. Into the air, and what seem'd corporal melted"
We might learn something about the play by knowing when each character speaks as a function of the line number in the play. We can retrieve this information using grepl().
# Flag, for every line of the text, whether it opens a speech by each
# character. A speech starts with leading whitespace, the speaker's name in
# capitals, and a literal period (hence the escaped "\\.").
#
# The patterns are anchored with "^ +" (start of line, one or more spaces) so
# that a name is only matched as the speaker label. Without the anchor,
# " MACBETH\\." is also a substring of " LADY MACBETH." and Lady Macbeth's
# speeches would be counted as Macbeth's.
Macbeth <- grepl("^ +MACBETH\\.", macbeth)
LadyMacbeth <- grepl("^ +LADY MACBETH\\.", macbeth)
Banquo <- grepl("^ +BANQUO\\.", macbeth)
Duncan <- grepl("^ +DUNCAN\\.", macbeth)

# Build one row per (line, character) pair:
#   line      - line number within `macbeth`
#   character - name of the indicator column the value came from
#   speak     - 1 if that character starts speaking on that line, else 0
# seq_along() is used instead of 1:length() so an empty input yields an empty
# sequence rather than c(1, 0).
# gather() is kept (rather than pivot_longer()) so the row order matches the
# glimpse() output recorded after this block.
# The final filter keeps only lines 219 through 3171 of the text.
speaker_freq <- data.frame(Macbeth, LadyMacbeth, Banquo, Duncan) %>%
  mutate(line = seq_along(macbeth)) %>%
  gather(key = "character", value = "speak", -line) %>%
  mutate(speak = as.numeric(speak)) %>%
  filter(line > 218 & line < 3172)
glimpse(speaker_freq)
## Observations: 11,812
## Variables: 3
## $ line <int> 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 22...
## $ character <chr> "Macbeth", "Macbeth", "Macbeth", "Macbeth", "Macbeth...
## $ speak <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
Before we create the plot, we will gather some helpful contextual information about when each Act begins.
# Find the lines on which each act begins and the label to print for it.
# Inside a character class `|` is a literal pipe character, not alternation,
# so the class is written [IV] to match only the Roman-numeral letters.
# The pattern is defined once so that the search and the extraction cannot
# drift apart.
act_pattern <- "^ACT [IV]+"
acts_idx <- grep(act_pattern, macbeth)
acts_labels <- str_extract(macbeth[acts_idx], act_pattern)
# One row per act: `line` is the index into `macbeth`, `labels` the matched
# heading text (e.g. "ACT I").
acts <- data.frame(line = acts_idx, labels = acts_labels)
# Plot a loess-smoothed curve of the 0/1 `speak` indicator against line
# number, one curve per character, with a dotted vertical rule and a text
# label at the start of each act.
#   - se = FALSE: the documented logical form for turning off the confidence
#     band (previously the numeric 0).
#   - linetype = 3: the ggplot2 aesthetic name (previously the `lty` alias).
#   - geom_text() takes its x position from the `line` column of `acts` via
#     the inherited aes(x = line); y is fixed at 0.085.
#   - ylim(c(0, NA)) fixes the lower y limit at 0 and leaves the upper limit
#     to be computed from the data.
ggplot(data = speaker_freq, aes(x = line, y = speak)) +
  geom_smooth(
    aes(color = character),
    method = "loess", se = FALSE, span = 0.4
  ) +
  geom_vline(xintercept = acts_idx, color = "darkgray", linetype = 3) +
  geom_text(
    data = acts, aes(y = 0.085, label = labels),
    hjust = "left", color = "darkgray"
  ) +
  ylim(c(0, NA)) +
  xlab("Line Number") +
  ylab("Proportion of Speeches")
## Warning: Removed 36 rows containing missing values (geom_smooth).
# Build a tm corpus in which every line of the text is its own document.
Corpus <- VCorpus(VectorSource(macbeth))
sampleline <- 300
# Show one document before any cleaning, for comparison with the result below.
strwrap(as.character(Corpus[[sampleline]]))
## [1] "meeting a bleeding Sergeant."
# Clean the corpus one transformation at a time. The order is deliberate and
# unchanged: whitespace, digits, punctuation, case, then English stop words.
Corpus <- tm_map(Corpus, stripWhitespace)
Corpus <- tm_map(Corpus, removeNumbers)
Corpus <- tm_map(Corpus, removePunctuation)
Corpus <- tm_map(Corpus, content_transformer(tolower))
Corpus <- tm_map(Corpus, removeWords, stopwords("english"))
# The same document after cleaning.
Corpus[[sampleline]] %>%
  as.character() %>%
  strwrap()
## [1] "meeting bleeding sergeant"
# Draw a word cloud limited to 30 words, with colours assigned at random
# from a 30-colour topo palette.
wordcloud(
  Corpus,
  max.words = 30,
  scale = c(8, 1),
  colors = topo.colors(n = 30),
  random.color = TRUE
)
# Build a document-term matrix from the cleaned corpus, weighting each entry
# by TF-IDF rather than by raw term count.
DTM <- DocumentTermMatrix(Corpus, control = list(weighting = weightTfIdf))
# DTM
# List the terms that reach the lower bound of 50.
# NOTE(review): because DTM is TF-IDF weighted, `lowfreq` presumably applies
# to summed weights rather than raw occurrence counts -- confirm against the
# tm documentation.
findFreqTerms(DTM, lowfreq = 50)
## [1] "banquo" "come" "doctor" "done" "duncan"
## [6] "enter" "exeunt" "fear" "first" "good"
## [11] "hath" "ill" "know" "lady" "lennox"
## [16] "let" "like" "lord" "macbeth" "macduff"
## [21] "make" "malcolm" "man" "may" "murtherer"
## [26] "must" "now" "one" "ross" "say"
## [31] "scene" "see" "shall" "sir" "son"
## [36] "speak" "thee" "thou" "thy" "time"
## [41] "tis" "upon" "well" "will" "witch"
## [46] "yet"
# Total weight of each term across all documents, largest first.
# colSums() is the vectorised equivalent of apply(MARGIN = 2, sum) on a
# numeric matrix and keeps the term names on the result.
DTM %>%
  as.matrix() %>%
  colSums() %>%
  sort(decreasing = TRUE) %>%
  head(9)
## macbeth macduff scene enter lady exeunt banquo
## 270.44988 153.76644 152.63294 121.37315 116.31969 114.55579 102.37784
## shall thou
## 93.89130 93.55111
Other useful resources include the CRAN Task View on Natural Language Processing (https://cran.r-project.org/web/views/NaturalLanguageProcessing.html), the tm package (https://cran.r-project.org/web/packages/tm/index.html), and the tidytext package (https://cran.r-project.org/web/packages/tidytext/index.html).