_toPOS["undefined"] = 15;

// Map MeCab POS subcategory labels to internal word types.
_toWordType["固有名詞"] = WORD_TYPE::WORD;        // proper noun
_toWordType["数"] = WORD_TYPE::NUMBER;            // numeral
_toWordType["サ変接続"] = WORD_TYPE::PUNCTUATION; // sa-hen verbal noun
_toWordType[""] = WORD_TYPE::DELIMITER;
_toWordType[""] = WORD_TYPE::UNDEFINED;           // note: overwrites the previous "" entry

// Create a MeCab tagger with the default dictionary and options.
_tagger = MeCab::createTagger("");
// Release the tagger; createTagger() hands ownership to the caller.
if (_tagger)
    delete _tagger;
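
// For reference: a minimal standalone sketch of the same tagger lifecycle
// (create, parse, delete) using only the stock MeCab C++ API. The sample
// sentence is arbitrary.
#include <iostream>
#include <mecab.h>

int main()
{
    // createTagger returns 0 on failure (e.g. a missing dictionary).
    MeCab::Tagger* tagger = MeCab::createTagger("");
    if (!tagger)
        return 1;

    // parse() returns a buffer owned by the tagger: one line per token,
    // "<surface>\t<features...>", terminated by an "EOS" line.
    const char* result = tagger->parse("太郎は花子が読んでいる本を次郎に渡した");
    if (result)
        std::cout << result;

    delete tagger;  // the caller owns the tagger
    return 0;
}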
void MecabPlugin::tokenizer(LCoreData& lCData)
{
    const std::string& raw_content = lCData._content;
    const char* result = _tagger->parse(raw_content.c_str());

    // One MeCab output line per token: "<surface>\t<feature1>,<feature2>,...".
    Poco::StringTokenizer token1(result, "\n",
        Poco::StringTokenizer::TOK_TRIM | Poco::StringTokenizer::TOK_IGNORE_EMPTY);

    Poco::StringTokenizer::Iterator string_iterator;
    size_t offset = 0;
    for (string_iterator = token1.begin(); string_iterator != token1.end(); ++string_iterator)
    {
        // Split each line into the surface form followed by its feature fields.
        Poco::StringTokenizer token2(*string_iterator, "\t,",
            Poco::StringTokenizer::TOK_TRIM | Poco::StringTokenizer::TOK_IGNORE_EMPTY);

        if (isTokenValid(token2))
        {
            // Locate the surface form in the raw content, record its
            // offset and length, then advance past it.
            offset = raw_content.find(token2[0], offset);
            size_t len = token2[0].length();
            lCData._mapOffsets[offset] = len;
            lCData._mapTokens[offset] = token2[0];
            offset += len;
        }
    }

    std::cout << std::endl << "MeCab plugin: tokenizer" << std::endl << std::endl;

    std::map<WORD_CONTENT_OFFSET, WORD_LENGTH>::const_iterator it1;
    std::cout << "token \t offset \t length" << std::endl;
    std::cout << "----- \t ------ \t ------" << std::endl;
    for (it1 = lCData._mapOffsets.begin(); it1 != lCData._mapOffsets.end(); ++it1)
        std::cout << lCData._mapTokens[it1->first] << '\t'
                  << it1->first << '\t'
                  << it1->second << std::endl;
}
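
// To make the two-stage split concrete: a self-contained sketch of how a
// single MeCab output line decomposes under the "\t," separator set. The
// sample line mimics MeCab's default output format.
#include <iostream>
#include <string>
#include <Poco/StringTokenizer.h>

int main()
{
    const std::string line = "東京\t名詞,固有名詞,地域,一般,*,*,東京,トウキョウ,トウキョウ";

    // Splitting on both '\t' and ',' flattens the surface form and the
    // feature fields into one indexable list.
    Poco::StringTokenizer fields(line, "\t,",
        Poco::StringTokenizer::TOK_TRIM | Poco::StringTokenizer::TOK_IGNORE_EMPTY);

    // fields[0] is the surface form ("東京"); fields[2] is the POS
    // subcategory ("固有名詞") that wordType() passes to getWordType().
    for (std::size_t i = 0; i < fields.count(); ++i)
        std::cout << i << ": " << fields[i] << std::endl;
    return 0;
}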
void MecabPlugin::wordType(LCoreData& lCData)
{
    std::map<WORD_CONTENT_OFFSET, WORD_LENGTH>::const_iterator it;

    // Re-parse each stored token and classify it from its POS fields.
    for (it = lCData._mapOffsets.begin(); it != lCData._mapOffsets.end(); ++it)
    {
        const char* result = _tagger->parse(lCData._mapTokens[it->first].c_str());

        Poco::StringTokenizer token1(result, "\n",
            Poco::StringTokenizer::TOK_TRIM | Poco::StringTokenizer::TOK_IGNORE_EMPTY);

        Poco::StringTokenizer::Iterator string_iterator;
        for (string_iterator = token1.begin(); string_iterator != token1.end(); ++string_iterator)
        {
            Poco::StringTokenizer token2(*string_iterator, "\t,",
                Poco::StringTokenizer::TOK_TRIM | Poco::StringTokenizer::TOK_IGNORE_EMPTY);

            if (isTokenValid(token2))
            {
                // token2[2] is the POS subcategory (e.g. 固有名詞, 数, サ変接続).
                lCData._mapWordType[it->first] = getWordType(token2[2]);
            }
        }
    }

    std::cout << std::endl << "MeCab plugin: word type" << std::endl << std::endl;

    std::map<WORD_CONTENT_OFFSET, WORD_LENGTH>::const_iterator it1;
    std::cout << "token \t WordType" << std::endl;
    std::cout << "----- \t --------" << std::endl;
    for (it1 = lCData._mapOffsets.begin(); it1 != lCData._mapOffsets.end(); ++it1)
        std::cout << lCData._mapTokens[it1->first] << '\t'
                  << static_cast<int>(lCData._mapWordType[it1->first]) << std::endl;
}
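
// getWordType() itself is not shown in this listing. A plausible sketch,
// assuming it is a plain lookup in the _toWordType map built in the
// constructor, with UNDEFINED as the fallback for unknown labels:
WORD_TYPE MecabPlugin::getWordType(const std::string& pos)
{
    std::map<std::string, WORD_TYPE>::const_iterator it = _toWordType.find(pos);
    return it != _toWordType.end() ? it->second : WORD_TYPE::UNDEFINED;
}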
void MecabPlugin::normalize(LCoreData& lCData)
{
    const char* char_src = lCData._content.c_str();
    const char* result = _tagger->parse(char_src);

    Poco::StringTokenizer token1(result, "\n",
        Poco::StringTokenizer::TOK_TRIM | Poco::StringTokenizer::TOK_IGNORE_EMPTY);

    Poco::StringTokenizer::Iterator string_iterator;
    size_t len = 0;  // running offset into the content
    for (string_iterator = token1.begin(); string_iterator != token1.end(); ++string_iterator)
    {
        Poco::StringTokenizer token2(*string_iterator, "\t,",
            Poco::StringTokenizer::TOK_TRIM | Poco::StringTokenizer::TOK_IGNORE_EMPTY);

        if (isTokenValid(token2))
        {
            std::string word = token2[0];  // surface form
            std::string pos  = token2[2];  // POS subcategory
            std::string norm = token2[0];  // normalized form defaults to the surface

            // A full feature set has ten fields; use the last one as the
            // normalized form unless it is the placeholder "*".
            if (token2.count() == 10)
            {
                if (token2[9] != "*")
                    norm = token2[9];
            }

            lCData._mapOffsets[len] = word.length();
            lCData._mapTokens[len] = word;

            lCData._mapPosMasks[len] |= _toPOS[pos];

            POSMaskBitset<POS_NUM> bset;
            bset.setMask(_toPOS[pos]);

            lCData._mapNormalizedForms[len] = norm;
            lCData._mapWordType[len] = getWordType(token2[2]);

            len += word.length();
        }
    }

    std::cout << std::endl << "MeCab plugin:" << std::endl << std::endl;

    std::map<WORD_CONTENT_OFFSET, WORD_LENGTH>::const_iterator it1;
    std::cout << "token \t offset \t length" << std::endl;
    std::cout << "----- \t ------ \t ------" << std::endl;
    for (it1 = lCData._mapOffsets.begin(); it1 != lCData._mapOffsets.end(); ++it1)
        std::cout << lCData._mapTokens[it1->first] << '\t'
                  << it1->first << '\t'
                  << it1->second << std::endl;

    std::cout << "Tokens count: " << lCData._mapOffsets.size() << std::endl << std::endl;
}