In this assignment you will use the tidyverse and tidytext packages.

library(tidyverse)
library(tidytext)
Write a function main_text_fun that takes as input the filename (incl. path)
and returns the main_text object from previous assignments containing the
lower-case book text.

# function extracting main text
# Extract the main text of a Project Gutenberg book.
#
# Args:
#   file: path (incl. filename) to the book's plain-text file.
# Returns: a character scalar holding the book's main text — the section
#   between the Gutenberg '*** START ... ***' and '*** END ... ***' markers.
main_text_fun <- function(file) {
  # load the whole file as a single string
  text <- read_file(file)
  # regex matching the '***'-delimited Gutenberg marker lines
  regex <- '\\*{3}[:print:]*\\*{3}'
  # cut text into sections at the markers (was duplicated as a literal
  # before; the defined regex object is now actually used)
  text_split <- str_split(text, regex)
  # str_split returns a list with one element per input string
  sections <- text_split[[1]]
  # select main text (second section, between the two markers)
  main_text <- sections[2]
  # out
  main_text
}
Use list.files() with full.names = TRUE to list the book files.

# file
files = list.files('books', full.names = T)
Apply main_text_fun() within sapply() to extract the texts for all books and
store them in an object called texts.

# process texts
texts <- sapply(files, main_text_fun)
Create a tibble using tibble() that has two columns: a column called book
containing a self-defined character vector of book names and a column called
text containing the texts vector of books. Call the tibble text_tbl. When you
subsequently print it, it should look like the output shown below.

# as tibble
# as tibble: one row per book, pairing a self-defined name with its text
text_tbl <- tibble(
  book = c('Alice in Wonderland', 'Dorian Gray',
           'Huckleberry Finn', 'Peter Pan', 'Treasure Island'),
  text = texts
)
# print
text_tbl
## # A tibble: 5 x 2
## book text
## <chr> <chr>
## 1 Alice in Wonder… "\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\nALICE’S ADVENTU…
## 2 Dorian Gray "\r\n\r\n\r\n\r\n\r\nProduced by Judith Boss. HTML version …
## 3 Huckleberry Finn "\r\n\r\nProduced by David Widger\r\n\r\n\r\n\r\n\r\n\r\nADV…
## 4 Peter Pan "\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\nPETER PAN\r\n\r…
## 5 Treasure Island "\r\n\r\n\r\n\r\n\r\nProduced by Judy Boss, John Hamm and Da…
Use unnest_tokens() on text_tbl in order to tokenize the elements in the text
column. See ?unnest_tokens. Store the result in token_tbl.

# tokenize
# tokenize: one row per word in a new 'word' column; unnest_tokens also
# lower-cases tokens and drops the original 'text' column
token_tbl <- text_tbl %>%
  unnest_tokens(word, text)
Use the tidyverse's group_by() and mutate() idiom to add columns pos = 1:n()
and rel_pos = (pos-1)/max(pos-1) coding the absolute and relative position of
a word within the respective book. To do this, you must group according to the
variable indicating the book.

# add pos variable
# add absolute (pos) and relative (rel_pos) word positions per book
token_tbl <- token_tbl %>%
  group_by(book) %>%
  mutate(pos = row_number(),                      # 1..n within each book
         rel_pos = (pos - 1) / max(pos - 1)) %>%  # 0 at first word, 1 at last
  ungroup()
Extract the afinn sentiment dictionary using the get_sentiments() function and
store it in an object called afinn.

Use inner_join() to combine your token_tbl with afinn. You are now ready for
analysis.

# add sentiments
# add sentiments: inner join keeps only words present in the afinn
# dictionary; the join key is stated explicitly instead of relying on
# dplyr's implicit common-column guess (which also prints a message)
token_tbl <- token_tbl %>%
  inner_join(get_sentiments("afinn"), by = "word")
Use the group_by() and mutate() idiom along with the smooth() function shown
below in order to calculate smoothed sentiment values that are a bit easier to
interpret.

# smoothing function
# Gaussian-kernel smoother for sentiment values.
#
# For each position x, computes a weighted mean of `value`, with weights
# given by a normal density centred at x whose sd is a tenth of the
# largest position (so longer books get proportionally wider windows).
#
# Args:
#   pos: numeric vector of word positions within one book.
#   value: numeric vector of sentiment scores, same length as pos.
# Returns: numeric vector of smoothed scores, same length as pos.
#   (The original ended on `sm = sapply(...)`, returning the value only
#   invisibly via the assignment; the result is now returned explicitly.)
smooth <- function(pos, value) {
  # hoist the loop-invariant bandwidth out of the inner function
  bandwidth <- max(pos) / 10
  vapply(pos, function(x) {
    weights <- dnorm(pos, x, bandwidth)
    sum(value * (weights / sum(weights)))
  }, numeric(1))
}
# smooth scores
# smooth scores within each book; ungroup afterwards so downstream
# operations act on a plain (ungrouped) tibble
token_tbl <- token_tbl %>%
  group_by(book) %>%
  mutate(smooth_value = smooth(pos, value)) %>%
  ungroup()
Plot the sentiment arcs using ggplot(). You can vary the degree of smoothing
by changing the denominator (10) in the dnorm() function within smooth().

# plot sentiment arcs
# plot sentiment arcs: one smoothed line per book over relative position
ggplot(token_tbl,
       aes(rel_pos, smooth_value, color = book)) +
  # 'linewidth' is the current ggplot2 line-width aesthetic
  # ('lwd' is a base-graphics alias, 'size' is deprecated for lines)
  geom_line(linewidth = 2) +
  labs(x = "Position", y = 'Sentiment') +
  theme_minimal()