8 #include <boost/algorithm/string/split.hpp>
9 #include <boost/algorithm/string/classification.hpp>
10 #include <boost/lambda/lambda.hpp>
11 #include <boost/tokenizer.hpp>
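// split.hpp and classification.hpp supply boost::algorithm::split and boost::is_any_of,
// lambda.hpp supplies the _1 placeholder used when logging tokens, and tokenizer.hpp the
// boost::tokenizer / boost::char_separator pair used further down.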
30 : _defaultDelimiters(u8" \n\t,.?;:-+=().,”’“")
36 _delimiters.append(_defaultDelimiters);
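// The working set _delimiters starts out as a copy of the built-in defaults;
// updateDefaultDelimiters()/restoreDefaultDelimiters() below extend and reset it per document.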
57 updateDefaultDelimiters(lCData);
64 restoreDefaultDelimiters();
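// Document-specific delimiters are merged in before tokenizing and the defaults are
// restored afterwards, so extra delimiters from one document do not leak into the next run.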
70 std::vector<SSTRING> tokens;
73 for(auto token : tokens)
75 size_t offset = buffer.find(token);
76 if(offset!=std::string::npos)
78 buffer.replace(offset, token.length(), token.length(), ' ');
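// tokens is presumably filled just above by a call along the lines of
//   boost::split(tokens, buffer, boost::is_any_of(_delimiters));
// the first occurrence of each token is then blanked out with an equal number of
// spaces, which keeps the buffer length and the offsets of the remaining text intact.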
85 std::for_each(tokens.begin(), tokens.end(), _log_ << boost::lambda::_1 << boost::lambda::constant('\n'));
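// boost::lambda builds the logging functor inline: _1 stands for the token passed in by
// for_each and constant('\n') keeps the newline inside the deferred expression;
// _log_ is assumed here to be an std::ostream-like sink.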
95 typedef boost::tokenizer<boost::char_separator<char> > tok_t;
96 boost::char_separator<char> sep(_delimiters.c_str());
97 tok_t tok( raw_content, sep );
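// char_separator treats every character of _delimiters as a dropped separator, so the
// tokenizer walks raw_content lazily and yields only the text between delimiters.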
99 for (tok_t::const_iterator it = tok.begin(), it_end = tok.end(); it != it_end; ++it)
110 offset = raw_content.find(*it, offset);
111 size_t len = it->length();
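// The tokenizer does not report positions, so each token's offset is recovered by
// searching raw_content starting at the previous match; presumably offset is advanced
// past the token afterwards so repeated tokens map to distinct positions.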
123 std::cout << std::endl << "Boost tokenizer:" << std::endl << std::endl;
127 std::map<WORD_CONTENT_OFFSET, WORD_LENGTH>::const_iterator it1;
128 std::cout << "token \t offset \t length" << std::endl;
129 std::cout << "----- \t ------ \t ------" << std::endl;
152 std::cout << lCData._mapTokens[it1->first] << '\t'
153     << it1->first << '\t'
154     << it1->second << std::endl;
156 std::cout << "Tokens count: " << lCData._mapOffsets.size() << std::endl << std::endl;
161 void BoostPlugin::updateDefaultDelimiters(LCoreData& lCData) {
165 _delimiters.append(delimiter);
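// Presumably iterates the delimiters discovered in lCData and appends each one to the
// working set before the document is tokenized.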
171 void BoostPlugin::restoreDefaultDelimiters() {
172 if(_delimiters != _defaultDelimiters) {
174 _delimiters.append(_defaultDelimiters);
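// If extra delimiters were added for this document, the working set is presumably cleared
// and then rebuilt from _defaultDelimiters, so the next document starts from the default set.

// A minimal, self-contained sketch of the same tokenize-and-record-offsets pattern,
// assuming plain std::string and std::map stand-ins for SSTRING and the
// WORD_CONTENT_OFFSET/WORD_LENGTH maps used by the plugin:
#include <boost/tokenizer.hpp>
#include <iostream>
#include <map>
#include <string>

int main() {
    const std::string text = "Hello, world! Hello again.";
    boost::char_separator<char> sep(" \n\t,.!");
    boost::tokenizer<boost::char_separator<char> > tok(text, sep);

    std::map<std::size_t, std::size_t> offsets;   // offset -> length
    std::map<std::size_t, std::string> tokens;    // offset -> token text

    std::size_t offset = 0;
    for (const auto& t : tok) {
        offset = text.find(t, offset);            // recover the token's position
        offsets[offset] = t.length();
        tokens[offset] = t;
        offset += t.length();                     // continue past this occurrence
    }

    for (const auto& kv : offsets)
        std::cout << tokens[kv.first] << '\t' << kv.first << '\t' << kv.second << '\n';
}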