|
C++ Clustering Library
Features
Example
Here is a simple example that uses this library:

    #include <iostream>
    #include <clustering/API.h>
    #include <clustering/Parameters.h>

    int main(int argc, char **argv)
    {
      // First of all, create a Parameters object with a similarity
      // measure and a similarity threshold
      Clustering::Parameters params(Clustering::Jaccard, 0.3);

      // Set path used to store temporary files (for distributed sort,
      // pairs computation, ...)
      params.setTemporaryPath("/tmp");

      // Choose what part of index to put in memory or on disk
      params.setMainIndexInMemory();
      params.setRevertIndexInMemory();

      // Number of threads when process is parallel (distributed sort,
      // pairs computation, ...)
      params.setNumberOfThread(3);

      // Now we can construct the API object using Parameters
      Clustering::API api(params);

      // Creation of a new document that contains 2 words
      api.addDocument("doc1");
      api.addDescriptorInCurrentDocument("word1");
      api.addDescriptorInCurrentDocument("word2");

      // Creation of a new document that contains 2 words
      api.addDocument("doc2");
      api.addDescriptorInCurrentDocument("word1");
      api.addDescriptorInCurrentDocument("word2");

      // Creation of a new document that contains 3 words
      api.addDocument("doc3");
      api.addDescriptorInCurrentDocument("word3");
      api.addDescriptorInCurrentDocument("word4");
      api.addDescriptorInCurrentDocument("word5");

      // Creation of a new document that contains 3 words
      api.addDocument("doc4");
      api.addDescriptorInCurrentDocument("word3");
      api.addDescriptorInCurrentDocument("word4");
      api.addDescriptorInCurrentDocument("word5");

      // Creation of a new document that contains 3 words
      api.addDocument("doc5");
      api.addDescriptorInCurrentDocument("word5");
      api.addDescriptorInCurrentDocument("word6");
      api.addDescriptorInCurrentDocument("word7");

      // Creation of a new document that contains 4 words
      api.addDocument("doc6");
      api.addDescriptorInCurrentDocument("word4");
      api.addDescriptorInCurrentDocument("word5");
      api.addDescriptorInCurrentDocument("word6");
      api.addDescriptorInCurrentDocument("word7");

      // launch algorithm
      const Clustering::Results &res = api.startAlgorithm();

      // iterates over clusters
      Clustering::DocClusterIterator it = res.begin(), eit = res.end();
      unsigned clusterCnt = 0;
      while (it != eit)
      {
        std::cout << "Cluster[" << ++clusterCnt << "]" << std::endl;

        // Get Documents of this cluster
        const std::vector<Clustering::DocId> &documents = it->getDocs();
        // Get descriptors of this cluster
        const std::vector<Clustering::DescId> &descriptors = it->getDescs();

        unsigned int i = 0;

        // We must use api to convert DocId and DescId in string
        // Display documents in cluster
        for (i = 0; i < documents.size(); ++i)
          std::cout << " Document[" << i + 1 << "] : "
            << api.getDocument(documents[i]) << std::endl;

        // We can display all descriptors present in this cluster
        for (i = 0; i < descriptors.size(); ++i)
          std::cout << " Descriptor[" << i + 1 << "] Freq[" << descriptors[i].getFrequency()
            << "]: " << api.getDescriptor(descriptors[i]) << std::endl;

        // We can also choose to display only the N best descriptors
        std::vector<Clustering::DescId> bestDescriptors =
          api.getBestDesc(*it, 2, 0.1);
        for (i = 0; i < bestDescriptors.size(); ++i)
          std::cout << " Best Descriptor[" << i + 1 << "] Freq[" << bestDescriptors[i].getFrequency()
            << "]: " << api.getDescriptor(bestDescriptors[i]) << std::endl;

        ++it;
      }
    }

This example produces the following output:

    Cluster[1]
     Document[1] : doc3
     Document[2] : doc4
     Document[3] : doc5
     Document[4] : doc6
     Descriptor[1] Freq[2]: word3
     Descriptor[2] Freq[3]: word4
     Descriptor[3] Freq[4]: word5
     Descriptor[4] Freq[2]: word6
     Descriptor[5] Freq[2]: word7
     Best Descriptor[1] Freq[4]: word5
     Best Descriptor[2] Freq[3]: word4
    Cluster[2]
     Document[1] : doc1
     Document[2] : doc2
     Descriptor[1] Freq[2]: word1
     Descriptor[2] Freq[2]: word2
     Best Descriptor[1] Freq[2]: word1
     Best Descriptor[2] Freq[2]: word2
|