#include #include #include #include #include #include #include using namespace std; using namespace std::chrono; using Tokens = vector; using Corpus = vector; using Indices = vector; /// Split a string into tokens Tokens split(const string& str, const char* delim = " ") { Tokens tokens; size_t end = 0; for(size_t start = 0; end != string::npos; start = end + 1) { end = str.find(delim, start); tokens.push_back(end == string::npos ? str.substr(start) : str.substr(start, end - start)); } return tokens; } /// Read bags of words from a file Corpus read_words(const char* path) { ifstream file(path); string line; Corpus data; while(getline(file, line)) { data.push_back(split(line)); } return data; } /// For each topic, find documents containing all the words from that topic vector match(const Corpus& documents, const Corpus& topics) { unordered_map word_docs; for(size_t i = 0; i < documents.size(); i++) { for(auto& w: documents[i]) { word_docs[w].push_back(i); } } // Strictly speaking, sorting is not required in single-threaded code for(auto& kv: word_docs) { sort(kv.second.begin(), kv.second.end()); } vector rslt(topics.size()); size_t i = 0; for(auto& t: topics) { auto& x = rslt[i++]; size_t j = 0; for(auto& w: t) { auto& y = word_docs[w]; if(j++ == 0) { x = y; } else { Indices buf(min(x.size(), y.size())); auto end = set_intersection(x.begin(), x.end(), y.begin(), y.end(), buf.begin()); x = Indices(buf.begin(), end); } } } return rslt; } int main() { // Load data auto topics = read_words("topics.txt"); auto documents = read_words("documents.txt"); // Match data auto start = system_clock::now(); auto rslt = match(documents, topics); auto end = system_clock::now(); auto elapsed = duration(end - start).count(); printf("Elapsed time: %0.2fs\n", elapsed); // Print a few summary statistics size_t s1 = 0; size_t s2 = 0; for(auto& d: rslt) { s1 += d.size() > 0 ? 1 : 0; s2 = max(s2, d.size()); } printf("%i\n%i\n", s1, s2); return 0; }