// hce-node application 1.4.3
// HCE Hierarchical Cluster Engine node application
// MecabPlugin.cpp
#include <iostream>
#include <map>
#include <string>

#include "MecabPlugin.hpp"
10 namespace HCE
11 {
12 
13 namespace component
14 {
15 
16 
17 
// Constructor: builds the POS-name and word-type lookup tables and creates
// the MeCab tagger used by all processing entry points.
// NOTE(review): the signature line was lost in extraction; reconstructed as
// the default constructor -- confirm against MecabPlugin.hpp.
MecabPlugin::MecabPlugin()
{
    // Map MeCab part-of-speech surface strings (Japanese) to internal POS ids.
    _toPOS["名詞"] = 0;      // noun
    _toPOS["形容詞"] = 1;    // adjective
    _toPOS["動詞"] = 2;      // verb
    // NOTE(review): the empty-string keys below were almost certainly
    // multibyte POS literals lost during text extraction; as written they
    // all collide on "" and only the last assignment survives. Restore the
    // original literals from version control.
    _toPOS[""] = 3;
    _toPOS[""] = 4;
    _toPOS["数"] = 5;        // number
    _toPOS["接続詞"] = 6;    // conjunction
    _toPOS["感動詞"] = 7;    // interjection
    _toPOS[""] = 8;
    _toPOS["助詞"] = 9;      // particle
    _toPOS[""] = 10;
    _toPOS["副詞"] = 11;     // adverb
    _toPOS[""] = 12;
    _toPOS[""] = 13;
    _toPOS[""] = 14;
    //toPOS[""] = 15; /// POSS
    //toPOS[""] = 16; /// PN_ADJ
    _toPOS["undefined"] = 15;

    // Map MeCab feature strings to internal word-type categories.
    _toWordType["固有名詞"] = WORD_TYPE::WORD;        // proper noun
    _toWordType["数"] = WORD_TYPE::NUMBER;            // number
    _toWordType["サ変接続"] = WORD_TYPE::PUNCTUATION; // NOTE(review): "サ変接続" is a suru-verb stem class, not punctuation -- verify intent
    _toWordType[""] = WORD_TYPE::DELIMITER;  // NOTE(review): key lost in extraction
    _toWordType[""] = WORD_TYPE::UNDEFINED;  // NOTE(review): key lost in extraction

    // Create a MeCab tagger with default options. createTagger may return
    // NULL on failure; the error handling below was left commented out, and
    // the processing methods dereference _tagger unchecked.
    _tagger = MeCab::createTagger("");
    /*
    const char *e = _tagger ? _tagger->what() : MeCab::getTaggerError();
    std::cerr << "Exception:" << e << std::endl;
    */
    // Register the exported process types.
    // NOTE(review): the statements that filled in processInfo between the
    // push_back calls (original lines 58/60/62) were lost in extraction, so
    // three identical default-constructed entries are pushed here.
    ProcessInfo processInfo;
    _exported.push_back( processInfo );
    _exported.push_back( processInfo );
    _exported.push_back( processInfo );
}
65 
66 
68 {
69  if(_tagger) delete _tagger;
70 }
71 
72 
// Dispatch entry point: routes a request to the handler selected by the
// plugin type carried in the request's ProcessInfo.
// NOTE(review): the signature line was lost in extraction; reconstructed
// from the body's use of lCData -- confirm against MecabPlugin.hpp.
void MecabPlugin::process(LCoreData& lCData)
{
    switch( lCData._processInfo._pluginType )
    {
    case MECAB_NORM: normalize(lCData); break;     // full normalization pass
    case MECAB_WTYPE: wordType(lCData); break;     // word-type tagging only
    case MECAB_TOKENIZE: tokenizer(lCData); break; // tokenization only
    default: break;                                // unknown type: silently ignored
    }
}
83 
84 
85 void MecabPlugin::tokenizer(LCoreData& lCData)
86 {
87  std::string raw_content = lCData.getContent();
88  const char *result = _tagger->parse(raw_content.c_str());
92  if (result)
93  {
97  Poco::StringTokenizer token1(result, "\n", Poco::StringTokenizer::TOK_TRIM | Poco::StringTokenizer::TOK_IGNORE_EMPTY );
101  Poco::StringTokenizer::Iterator string_iterator;
102 
106  for( string_iterator = token1.begin(); string_iterator != token1.end(); ++string_iterator )
107  {
111  Poco::StringTokenizer token2(*string_iterator, "\t,", Poco::StringTokenizer::TOK_TRIM | Poco::StringTokenizer::TOK_IGNORE_EMPTY );
112 
116  size_t offset = 0;
117  if ( isTokenValid( token2 ) )
118  {
122  offset = raw_content.find(token2[0], offset);
123  size_t len = token2[0].length();
127  lCData._mapOffsets[offset] = len;
128  lCData._mapTokens[offset] = token2[0];
129  }
130  }
131  }
132 #ifdef _DEBUG_
133  std::cout << std::endl << "MeCab plugin: tokenizer" << std::endl << std::endl;
137  std::map<WORD_CONTENT_OFFSET, WORD_LENGTH>::const_iterator it1;
138  std::cout << "token \t offset \t length" << std::endl;
139  std::cout << "----- \t ------ \t ------" << std::endl;
140  for( it1=lCData._mapOffsets.begin(); it1!=lCData._mapOffsets.end(); ++it1)
141  {
142  std::cout << lCData._mapTokens[it1->first] << '\t'
143  << it1->first << '\t'
144  << it1->second << std::endl;
145  }
146 #endif
147 }
148 
149 
150 void MecabPlugin::wordType(LCoreData& lCData)
151 {
152  std::map<WORD_CONTENT_OFFSET, WORD_LENGTH>::const_iterator it;
156  for ( it=lCData._mapOffsets.begin(); it!=lCData._mapOffsets.end(); ++it )
157  {
158  const char *result = _tagger->parse(lCData._mapTokens[it->first].c_str());
162  if (result)
163  {
167  Poco::StringTokenizer token1(result, "\n", Poco::StringTokenizer::TOK_TRIM | Poco::StringTokenizer::TOK_IGNORE_EMPTY );
171  Poco::StringTokenizer::Iterator string_iterator;
172 
176  for( string_iterator = token1.begin(); string_iterator != token1.end(); ++string_iterator )
177  {
181  Poco::StringTokenizer token2(*string_iterator, "\t,", Poco::StringTokenizer::TOK_TRIM | Poco::StringTokenizer::TOK_IGNORE_EMPTY );
182 
186  if ( isTokenValid( token2 ) )
187  {
191  lCData._mapWordType[it->first] = getWordType(token2[2]);
192  }
193  }
194  }
195  }
196 #ifdef _DEBUG_
197  std::cout << std::endl << "MeCab plugin: word type" << std::endl << std::endl;
201  std::map<WORD_CONTENT_OFFSET, WORD_LENGTH>::const_iterator it1;
202  std::cout << "token \t WordType" << std::endl;
203  std::cout << "----- \t ------ \t ------" << std::endl;
204  for( it1=lCData._mapOffsets.begin(); it1!=lCData._mapOffsets.end(); ++it1)
205  {
206  //std::cout << lCData._mapTokens[it1->first] << '\t'
207  //<< lCData._mapWordType[it1->first] << std::endl;
208  }
209 #endif
210 }
211 
212 
// Full normalization pass: runs MeCab over the whole content and fills the
// token offset/length, POS mask, normalized-form and word-type maps.
void MecabPlugin::normalize(LCoreData& lCData)
{
    const char* char_src = lCData._content.c_str();

    // c_str() never returns NULL, so this guard is effectively always true.
    if ( char_src )
    {
        const char *result = _tagger->parse(char_src);

        if (result)
        {
            // One MeCab result line per morpheme.
            Poco::StringTokenizer token1(result, "\n", Poco::StringTokenizer::TOK_TRIM | Poco::StringTokenizer::TOK_IGNORE_EMPTY );

            // Running offset: cumulative sum of token lengths so far.
            // NOTE(review): this assumes tokens are contiguous in the source
            // (no bytes skipped by MeCab between morphemes); offsets may
            // drift on content with separators -- confirm against callers.
            long len = 0;

            Poco::StringTokenizer::Iterator string_iterator;

            for( string_iterator = token1.begin(); string_iterator != token1.end(); ++string_iterator )
            {
                Poco::StringTokenizer token2(*string_iterator, "\t,", Poco::StringTokenizer::TOK_TRIM | Poco::StringTokenizer::TOK_IGNORE_EMPTY );

                if ( isTokenValid( token2 ) )
                {
                    std::string word = token2[0];    // surface form

                    //std::string pos = "undefined";
                    std::string pos = token2[2];     // part-of-speech field

                    std::string norm = token2[0];    // default: surface form

                    // A 10-field result line carries a finer POS in field 1
                    // and a base/normalized form in field 7.
                    if ( token2.count() == 10 )
                    {
                        if ( token2[9] != "*" )
                        {
                            pos = token2[1];

                            norm = token2[7]; //( token2[7]=="*" ) ? token2[0] : token2[7];
                        }
                    }
                    lCData._mapOffsets[len] = word.length();
                    lCData._mapTokens[len] = word;
                    // NOTE(review): _toPOS[pos] yields an index (0..15), not
                    // a bit mask -- OR-ing indices together here looks wrong,
                    // and an unknown pos silently inserts a 0 entry into
                    // _toPOS via operator[]. The bitset built below is never
                    // stored (the assignment is commented out). Verify the
                    // intended POS-mask semantics before touching this.
                    lCData._mapPosMasks[len] |= _toPOS[pos];
                    POSMaskBitset<POS_NUM> bset;
                    bset.setMask( _toPOS[pos] );
                    //lCData._mapPosMasks[len] = bset;

                    lCData._mapNormalizedForms[ len ] = norm;

                    lCData._mapWordType[len] = getWordType(token2[2]);

                    len += word.length();
                }
            }
        }
    }
#ifdef _DEBUG_
    std::cout << std::endl << "MeCab plugin:" << std::endl << std::endl;
    std::map<WORD_CONTENT_OFFSET, WORD_LENGTH>::const_iterator it1;
    std::cout << "token \t offset \t length" << std::endl;
    std::cout << "----- \t ------ \t ------" << std::endl;
    for( it1=lCData._mapOffsets.begin(); it1!=lCData._mapOffsets.end(); ++it1)
    {
        std::cout << lCData._mapTokens[it1->first] << '\t'
                  << it1->first << '\t'
                  << it1->second << std::endl;
    }
    std::cout << "Tokens count: " << lCData._mapOffsets.size() << std::endl << std::endl;
#endif
}
349 
350 
351 } /* namespace component */
352 
353 } /* namespace HCE */