|
C++ Tokenizer Library
Features
Simple Example
Here is a simple example that uses this library:

#include <iostream>
#include "tokenizer/Tokenizer.h"
#include "tools/Exception.h"

int main(int argc, char **argv)
{
  // Buffer to analyze, if the content is in a file it must be read and
  // allocated in memory before tokenization
  std::string buffer("an example to tokenize");
  // Allocate a tokenizer without wiki syntax support (second
  // argument) which generates single words (with a minimal size of 3)
  Tokenizer::Tokenizer tokenizer(1, 3);
  tokenizer.setBufferToAnalyze(buffer.c_str(), buffer.size());
  try
  {
    while (!tokenizer.eof())
    {
      // Get the token
      Tokenizer::Token tok = tokenizer.getNextToken();
      std::cout << std::string(tok.getContent(), tok.getSize()) << std::endl;
    }
  }
  catch (ToolBox::EOFException &e)
  {
    // End of buffer reached during parsing, there are not enough
    // words to create a token
  }
}

This example produces the following output:
example
tokenize

Multi-Word Example
Here is a multi-word example that uses this library:

#include <iostream>
#include "tokenizer/Tokenizer.h"
#include "tools/Exception.h"

int main(int argc, char **argv)
{
  // Buffer to analyze, if the content is in a file it must be read and
  // allocated in memory before tokenization
  std::string buffer("Just an example to tokenize.");
  // Allocate a tokenizer without wiki syntax support (second
  // argument) which generates three-word tokens with a minimal size of 2
  Tokenizer::Tokenizer tokenizer(3, 2);
  tokenizer.setBufferToAnalyze(buffer.c_str(), buffer.size());
  try
  {
    while (!tokenizer.eof())
    {
      // Get the token
      Tokenizer::Token tok = tokenizer.getNextToken();
      std::cout << std::string(tok.getContent(), tok.getSize()) << std::endl;
    }
  }
  catch (ToolBox::EOFException &e)
  {
    // End of buffer reached during parsing, there are not enough
    // words to create a token
  }
}

This example produces the following output:
Just an example
an example to
example to tokenize

Wikipedia Example
Here is a Wikipedia-based example that uses this library:

int main(int argc, char **argv)
{
  // Buffer to analyze, if the content is in a file it must be read and
  // allocated in memory before tokenization
  std::string buffer("Exemple {{wiki synthax to skip}}to [[another wiki]]tokenize");
  // Allocate a tokenizer with wiki syntax support (second
  // argument) which generates single words (with a minimal size of 3)
  Tokenizer::Tokenizer tokenizer(1, 3);
  tokenizer.setBufferToAnalyze(buffer.c_str(), buffer.size(), true);
  try
  {
    while (!tokenizer.eof())
    {
      // Get the token
      Tokenizer::Token tok = tokenizer.getNextToken();
      std::cout << std::string(tok.getContent(), tok.getSize()) << std::endl;
    }
  }
  catch (ToolBox::EOFException &e)
  {
    // End of buffer reached during parsing, there are not enough
    // words to create a token
  }
}

This example produces the following output:
Exemple
tokenize
|