library(readr)

#' Read bags of words from a file
#'
#' Reads `path` with `readr::read_csv()` as a single character column named
#' `words` and splits every value on single spaces.
#'
#' @param path Path of the file to read.
#' @return A list of character vectors, one per value read from the file.
read.words <- function(path) {
  data <- read_csv(
    file = path,
    col_names = "words",
    col_types = "c",
    progress = FALSE
  )
  strsplit(data$words, " ", fixed = TRUE)
}

#' For each topic, find documents containing all the words from that topic
#'
#' Builds an inverted index (word -> indices of the documents containing it)
#' and intersects the index entries of every word of a topic.
#'
#' Note: this function masks `base::match()` in the environment where it is
#' defined; use `base::match()` explicitly if the base function is needed.
#'
#' @param documents A list of character vectors (one bag of words per
#'   document).
#' @param topics A list of character vectors (one bag of words per topic).
#' @return A list parallel to `topics`; element `i` is an integer vector with
#'   the indices (into `documents`) of the documents that contain every word
#'   of topic `i`, each index at most once. Topics without any matching
#'   document, or without any word, yield `integer(0)`.
match <- function(documents, topics) {
  # Inverted index: word -> integer vector of document indices.
  word.docs <- new.env()
  for (i in seq_along(documents)) {
    # unique(): a word repeated inside one document must register that
    # document only once, otherwise single-word topics report duplicates.
    words <- unique(documents[[i]])
    # Empty tokens (e.g. from consecutive spaces) are not words and cannot be
    # used as environment keys.
    words <- words[nzchar(words)]
    for (w in words) {
      j <- length(word.docs[[w]]) + 1
      word.docs[[w]][j] <- i
    }
  }

  # lapply() keeps one result per topic. Assigning a NULL result with
  # `rslt[[i]] <- NULL` would delete the list element instead.
  lapply(topics, function(topic) {
    words <- topic[nzchar(topic)]
    ds <- lapply(words, function(w) word.docs[[w]])
    hits <- Reduce(intersect, ds)
    if (is.null(hits)) integer(0) else hits
  })
}

# Load and match
documents <- read.words("documents.txt")
topics <- read.words("topics.txt")
system.time(rslt <- match(documents, topics))

# Print a few summary statistics
match.counts <- lengths(rslt)
print(sum(match.counts))
# The extra 0L keeps max() well-defined when there are no topics at all.
print(max(match.counts, 0L))