# Recommendation:
# Save this file to the same folder as download_data.R, then use the menu item
#   Session > Set Working Directory > To Source File Location
# to set the working directory to the same directory as before. This ensures
# that the database file (if it exists) or the text files are loaded correctly.

library(quanteda)
library(quanteda.textstats)

# Load or build the corpus ----

# Cache file, resolved relative to the working directory.
db_file <- "data.RData"

if (file.exists(db_file)) {
  # Cache found: expected to restore `corp` (and `raw`), as written by the
  # save() call in the else-branch below.
  load(db_file)
} else {
  # readtext is only needed when the corpus is built from the raw text files.
  library(readtext)

  # File names are split on "_" into the docvars gid, prename and name.
  raw <- readtext(
    "./data/*.txt",
    docvarsfrom = "filename",
    docvarnames = c("gid", "prename", "name"),
    dvsep = "_"
  )
  corp <- corpus(raw)

  # Align the metadata rows with the corpus documents via the `did` column.
  metadata <- read.csv("./data/metadata.tsv", sep = "\t")
  indices <- match(docid(corp), metadata$did)
  if (anyNA(indices)) {
    # match() yields NA for documents missing from the metadata table; the
    # corresponding docvars end up NA, so make that visible instead of silent.
    warning(
      sum(is.na(indices)),
      " document(s) have no matching row in metadata.tsv;",
      " their metadata docvars will be NA.",
      call. = FALSE
    )
  }

  # Attach every metadata column except the first as a document variable.
  # NOTE(review): this assumes the first column is the `did` key - confirm.
  for (col in colnames(metadata)[-1]) {
    docvars(corp, field = col) <- metadata[indices, col]
  }

  # Cache the result so the next run takes the load() branch above.
  save(corp, raw, file = db_file)
}

# Document-feature matrix ----

# Tokenise without punctuation and symbols, drop English stopwords, then build
# the document-feature matrix.
dfmat <- tokens(corp, remove_punct = TRUE, remove_symbols = TRUE) |>
  tokens_remove(pattern = stopwords("en")) |>
  dfm()

# Top features over the whole corpus and for the first document only.
topfeatures(dfmat)
topfeatures(dfmat[1, ])

# Proportional weighting ----

wprop <- dfm_weight(dfmat, scheme = "prop")
head(wprop[, 1:2])
topfeatures(wprop[1, ])
topfeatures(wprop[3, ])

# tf-idf weighting ----

wtfidf <- dfm_tfidf(dfmat)
head(wtfidf[, 1:3])
topfeatures(wtfidf[1, ])
topfeatures(wtfidf[3, ])