library(readr)


#' Read bags of words from a file
#'
#' The file is parsed with `read_csv()` as a single character column named
#' `words`; every value of that column is then split on literal single spaces.
#'
#' @param path Path of the file to read.
#' @return A list with one character vector of words per value read.
read.words <- function(path) {
  bags <- read_csv(
    file = path,
    col_names = 'words',
    col_types = 'c',
    progress = FALSE
  )
  # fixed = TRUE: split on the literal space character, not a regex
  strsplit(bags[['words']], split = ' ', fixed = TRUE)
}


#' For each topic, find documents containing all the words from that topic
#'
#' Builds an inverted index (word -> indices of the documents containing it)
#' and intersects the index entries of every word in a topic.
#'
#' NOTE: when defined at top level this function masks `base::match` for code
#' evaluated in the global environment.
#'
#' @param documents A list of character vectors, one bag of words per document.
#' @param topics A list of character vectors, one bag of words per topic.
#' @return An unnamed list with exactly one element per topic: the integer
#'   indices (in increasing order, without duplicates) of the documents that
#'   contain all of the topic's words. Topics without any matching document,
#'   and topics without any word, yield `integer(0)`.
match <- function(documents, topics) {
  # Inverted index stored in an environment for hashed lookup by word.
  word.docs <- new.env()
  # seq_along() instead of 1:length(): iterates zero times on empty input.
  for (i in seq_along(documents)) {
    # unique(): record a document once per word even when the word is repeated,
    # otherwise single-word topics would report the same document twice.
    words <- unique(documents[[i]])
    # Empty strings (e.g. produced by consecutive spaces) cannot be used as
    # environment keys and carry no information, so they are skipped.
    words <- words[nzchar(words)]
    for (w in words) {
      j <- length(word.docs[[w]]) + 1L
      word.docs[[w]][j] <- i
    }
  }

  lapply(seq_along(topics), function(i) {
    topic <- topics[[i]]
    topic <- topic[nzchar(topic)]
    ds <- lapply(topic, function(w) word.docs[[w]])
    hits <- Reduce(intersect, ds)
    # Reduce()/intersect() give NULL for an empty topic or an unseen word.
    # Normalise to integer(0) so every topic keeps its slot in the result
    # (assigning NULL into a list element would delete that element).
    if (is.null(hits)) integer(0) else hits
  })
}


# Load and match
documents <- read.words('documents.txt')
topics <- read.words('topics.txt')
system.time(rslt <- match(documents, topics))

# Print a few summary statistics: the total number of (topic, document)
# matches, then the largest number of documents matched by a single topic.
# lengths() always returns an integer vector, even for an empty list, whereas
# sapply(rslt, length) returns list() in that case and makes sum() fail.
match.counts <- lengths(rslt)
print(sum(match.counts))
# The extra 0L keeps max() defined (and warning-free) when there are no topics.
print(max(match.counts, 0L))