# Recommendation:
# Save this file to the same folder as download_data.R.
# Use the following menu item:
#   Session > Set Working Directory > To Source File Location
# to set the working directory to the same directory as before.
# This ensures the database file (if it exists) or the text files
# will be loaded correctly.

library(quanteda)
library(quanteda.textstats)

# Load the corpus ----

# Cache file holding the objects `corp` and `raw` written by save() below.
db_file <- "data.RData"

if (file.exists(db_file)) {
  # Restores `corp` and `raw` from the cache instead of re-reading the texts.
  load(db_file)
} else {
  library(readtext)

  # Each file name is split on "_" into the docvars gid, prename and name.
  raw <- readtext(
    "./data/*.txt",
    docvarsfrom = "filename",
    docvarnames = c("gid", "prename", "name"),
    dvsep = "_"
  )
  corp <- corpus(raw)

  metadata <- read.csv("./data/metadata.tsv", sep = "\t")

  # Row of `metadata` belonging to each document, matched on the `did`
  # column; NA where a document id has no matching row.
  indices <- match(docid(corp), metadata$did)

  # Attach every metadata column except the first as a document variable.
  # A plain loop is used because this is done purely for its side effect.
  for (col in colnames(metadata)[-1]) {
    docvars(corp, field = col) <- metadata[indices, col]
  }

  save(file = db_file, corp, raw)
}

# Tokenise ----

# padding = TRUE leaves an empty placeholder where a token was removed, so
# words that were not adjacent in the text do not become adjacent here.
toks <- tokens(
  corp,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  padding = TRUE
) |>
  tokens_remove(pattern = stopwords("en"), padding = TRUE) |>
  tokens_remove(min_nchar = 3, padding = TRUE) |>
  tokens_tolower()

# Collocations ----

# Default collocation size, keeping only those seen at least 10 times.
coll <- textstat_collocations(toks, min_count = 10)
head(coll)
tail(coll)

# Same again for collocations of size 2 and 3.
coll <- textstat_collocations(toks, size = 2:3, min_count = 10)
head(coll)
tail(coll)

# Join collocations with z >= 7 into single tokens. The trailing comma makes
# this a row filter; without it the logical vector would index columns.
toks <- tokens_compound(toks, pattern = coll[coll$z >= 7.0, ])

# tf-idf ----

tfidf <- dfm(toks) |>
  dfm_tfidf()

# Highest-weighted features of the third document.
topfeatures(tfidf[3, ])

# Interactive help lookup.
?tokens_compound