# Recommendation:
# Save this file in the same folder as download_data.R, then use the menu item
#   Session > Set Working Directory > To Source File Location
# so that the working directory is the same as before. This ensures that the
# database file (if it exists) or the text files are loaded correctly.

library(quanteda)
library(quanteda.textstats)
library(quanteda.textplots)

# Load or build the corpus ----

db_file <- "data.RData"

if (file.exists(db_file)) {
  # Reuse the objects saved by an earlier run (`corp` and `raw`, see `save()`
  # below) instead of re-reading the text files.
  load(db_file)
} else {
  library(readtext)

  # Each file name is split on "_" into the docvars gid, prename and name.
  raw <- readtext(
    "./data/*.txt",
    docvarsfrom = "filename",
    docvarnames = c("gid", "prename", "name"),
    dvsep = "_"
  )
  corp <- corpus(raw)

  # Attach every metadata column except the first one as a document variable,
  # aligning metadata rows to documents through `metadata$did`.
  # NOTE(review): the first column is presumably the `did` key itself; confirm.
  metadata <- read.csv("./data/metadata.tsv", sep = "\t")
  indices <- match(docid(corp), metadata$did)
  if (anyNA(indices)) {
    warning(
      sum(is.na(indices)),
      " document(s) have no matching row in metadata.tsv; ",
      "their document variables will be NA.",
      call. = FALSE
    )
  }
  for (col in colnames(metadata)[-1]) {
    docvars(corp, field = col) <- metadata[indices, col]
  }

  save(corp, raw, file = db_file)
}

# Derived grouping variable ----

# Binary split used for the grouped frequency and keyness analyses below.
corp$physics <- ifelse(corp$profession == "physics", "physics", "non-physics")

# Document-feature matrix ----

dfmat <- tokens(corp, remove_punct = TRUE, remove_symbols = TRUE) |>
  tokens_remove(pattern = stopwords("en")) |>
  dfm()

# Term frequencies ----

tfreq <- textstat_frequency(dfmat)
head(tfreq)
tail(tfreq)

tfreqphy <- textstat_frequency(dfmat, groups = dfmat$physics)
head(tfreqphy)
# The trailing comma is required: without it the logical vector would be
# used to select columns of the data frame rather than rows.
head(tfreqphy[tfreqphy$group == "physics", ])
head(tfreqphy[tfreqphy$group == "non-physics", ])

# Keyness: physics vs. non-physics ----

tkeyphy <- textstat_keyness(dfmat, target = dfmat$physics == "physics")
textplot_keyness(tkeyphy)