hce-node application  1.4.3
HCE Hierarchical Cluster Engine node application
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
BoostPlugin.cpp
Go to the documentation of this file.
1 
5 
6 
7 
8 #include <boost/algorithm/string/split.hpp>
9 #include <boost/algorithm/string/classification.hpp>
10 #include <boost/lambda/lambda.hpp>
11 #include <boost/tokenizer.hpp>
12 #include <iostream>
13 #include <cstddef>
14 #include <string>
15 
19 
20 #include "BoostPlugin.hpp"
21 
22 namespace HCE
23 {
24 
25 namespace component
26 {
27 
28 
30  : _defaultDelimiters(u8" \n\t,.?;:-+=().,”’“")
31 {
 // The initializer above seeds _defaultDelimiters with space, newline, tab,
 // ASCII punctuation and the typographic quotes ” ’ “ (written as a u8 literal).
 // '.' and ',' each appear twice in that literal.
35  //_defaultDelimiters.append(" \n");
 // Append the defaults to the active delimiter set
 // (presumably _delimiters is still empty at this point - confirm).
36  _delimiters.append(_defaultDelimiters);
37 
 // NOTE(review): in the lines visible here the same, unmodified processInfo
 // is pushed into _exported twice. Listing lines 42 and 44 are not visible in
 // this view and presumably configure it before each push - confirm against
 // the full file.
41  ProcessInfo processInfo;
43  _exported.push_back( processInfo );
45  _exported.push_back( processInfo );
46 }
47 
48 
50 
51 
53 {
 // Entry point: applies request-specific delimiters, runs the tokenizing
 // routine selected by lCData._processInfo._pluginType, then puts the
 // default delimiters back.
 // Replace _delimiters with the ones supplied in lCData, if any.
57  updateDefaultDelimiters(lCData);
58  switch( lCData._processInfo._pluginType )
59  {
60  case BOOST_SPLIT: split(lCData); break;
61  case BOOST_TOKENIZER: tokenizer(lCData); break;
 // Any other plugin type is ignored: no tokenizing is done.
62  default: break;
63  }
 // NOTE(review): this restore is not reached if split()/tokenizer() throws,
 // which would leave the request-specific delimiters active - confirm
 // whether that can happen in practice.
64  restoreDefaultDelimiters();
65 }
66 
67 
69 {
 // Splits lCData._content on any character of _delimiters using boost::split
 // and records, for every token located, its position in the content:
 // lCData._mapOffsets[position] = token length,
 // lCData._mapTokens[position] = token text.
70  std::vector<SSTRING> tokens;
 // Working copy of the content; located tokens are blanked out in it below.
71  std::string buffer(lCData._content);
72  boost::split( tokens, lCData._content, boost::is_any_of(_delimiters));
 // Each token is copied per iteration (iteration is by value).
73  for( auto token : tokens)
74  {
 // Always searches from the start of buffer; this works for repeated tokens
 // only because earlier occurrences have already been overwritten with spaces.
 // NOTE(review): boost::split is called without token compression, so adjacent
 // delimiters yield empty tokens. find("") returns 0, so an empty token would
 // store length 0 and an empty string at position 0, replacing a real token
 // recorded there - confirm and consider skipping empty tokens.
75  size_t offset = buffer.find(token);
76  if(offset!=std::string::npos)
77  {
 // Overwrite the found occurrence with the same number of spaces so the next
 // identical token resolves to its next occurrence.
78  buffer.replace(offset, token.length(), token.length(), ' ');
79  lCData._mapOffsets[offset] = token.length();
80  lCData._mapTokens[offset] = token;
81  }
82  }
 // Debug-only: writes every token followed by a newline to _log_.
83 #ifdef _DEBUG_
84  //std::for_each ( tokens.begin(), tokens.end(), std::cout << boost::lambda::_1 << boost::lambda::constant('\n') );
85  std::for_each ( tokens.begin(), tokens.end(), _log_ << boost::lambda::_1 << boost::lambda::constant('\n') );
86  //_log_ << "Boost split" << _log_end_;
87 #endif
88 }
89 
90 
92 {
 // Tokenizes lCData.getContent() with boost::tokenizer and records, for each
 // token, the position found for it in the content:
 // lCData._mapOffsets[position] = token length,
 // lCData._mapTokens[position] = token text.
93  //std::string s(" ");
94  std::string raw_content = lCData.getContent();
95  typedef boost::tokenizer<boost::char_separator<char> > tok_t;
 // Every char of _delimiters acts as a separator that is dropped from the output.
 // NOTE(review): char_separator<char> works per char, so a multi-byte UTF-8
 // delimiter is matched one byte at a time - confirm this is acceptable.
96  boost::char_separator<char> sep(_delimiters.c_str()); //(_tags.c_str(), "| ", boost::keep_empty_tokens);
97  tok_t tok( raw_content, sep );
 // Start position for the next find(); carried over between iterations.
98  size_t offset = 0;
99  for (tok_t::const_iterator it = tok.begin(), it_end = tok.end();
100  it != it_end; ++it)
101  {
106  /*
107  std::ptrdiff_t const offset = it.base() - str.begin() - it->size();
108  size_t len = it->length();
109 */
 // Locate the token text, searching from where the previous token was found.
 // NOTE(review): in the lines visible here offset is never moved past the
 // token just found, so a token equal to (or a prefix of) the previous one
 // would be found at the same position and overwrite that map entry; a
 // std::string::npos result is not checked either. Listing lines 111-114
 // are not visible in this view - confirm against the full file.
110  offset = raw_content.find(*it, offset);
111  size_t len = it->length();
115  lCData._mapOffsets[offset] = len;
116  lCData._mapTokens[offset] = *it;
117  /*
118  std::cout << "<" << *it << "> "
119  << offset << "\t::\t" << len << std::endl;
120  */
121  }
 // Debug-only: prints a token / offset / length table and the token count
 // to std::cout.
122 #ifdef _DEBUG_
123  std::cout << std::endl << "Boost tokenizer:" << std::endl << std::endl;
127  std::map<WORD_CONTENT_OFFSET, WORD_LENGTH>::const_iterator it1;
128  std::cout << "token \t offset \t length" << std::endl;
129  std::cout << "----- \t ------ \t ------" << std::endl;
130 
131  /*
132  for ( it=lCData._mapTokens.begin(); it!=lCData._mapTokens.end(); ++it )
133  {
134  iso639_1 = lCData._mapLanguages[ it->first ];
135  stemmer = getStemmer( iso639_1 );
136  if(lCData._normalizationId == DEFAULT_NORMALIZATION_WITH && stemmer)
137  {
138  stemmerString = Poco::UTF8::toLower(it->second);
139  const sb_symbol * stemmed = sb_stemmer_stem( stemmer, (const sb_symbol*)stemmerString.c_str(), stemmerString.size() );
140  lCData._mapNormalizedForms[ it->first ] = SSTRING( (const char*)stemmed );
141  }
142  else
143  {
144  lCData._mapNormalizedForms[ it->first ] = SSTRING( it->second );
145  }
146  }
147  */
148 
149 
 // One row per recorded position, in _mapOffsets iteration order.
150  for( it1=lCData._mapOffsets.begin(); it1!=lCData._mapOffsets.end(); ++it1)
151  {
152  std::cout << lCData._mapTokens[it1->first] << '\t'
153  << it1->first << '\t'
154  << it1->second << std::endl;
155  }
156  std::cout << "Tokens count: " << lCData._mapOffsets.size() << std::endl << std::endl;
157 #endif
158 }
159 
160 
161 void BoostPlugin::updateDefaultDelimiters(LCoreData& lCData) {
162  if(lCData._delimiters.size()) {
163  _delimiters.clear();
164  for(auto delimiter : lCData._delimiters) {
165  _delimiters.append(delimiter);
166  }
167  }
168 }
169 
170 
171 void BoostPlugin::restoreDefaultDelimiters() {
172  if(_delimiters != _defaultDelimiters) {
173  _delimiters.clear();
174  _delimiters.append(_defaultDelimiters);
175  }
176 }
177 
178 
179 } /* namespace component */
180 
181 } /* namespace HCE */