HCE project C++ developers source code library  1.1.1
HCE project developer library
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
Refine.cpp
Go to the documentation of this file.
1 
14 
15 #include <iostream>
16 #include <cld/compact_lang_det.h>
17 
18 #include "Refine.hpp"
19 #include "OutDataRefine.hpp"
20 #include "RWords.hpp"
21 #include "CWords.hpp"
22 #include "LCoreData.hpp"
23 #include "HCETimer.hpp"
24 
25 
26 namespace HCE
27 {
28 
29 namespace component
30 {
31 
32 
// Constructs the Refine component and fills _processInfoTable with the
// default sequence of ProcessInfo entries. process() falls back to this
// table when the incoming request does not carry its own.
//
// inType is not referenced in the visible lines of the body.
// _lCore is default-constructed.
//
// SET_PROCESS_INFO_L is a macro defined elsewhere; presumably it fills
// processInfo with a (stage, method, language-mask) triple -- TODO confirm
// against the macro definition.
//
// NOTE(review): this text comes from a generated source listing whose line
// numbering has gaps. Three push_back calls below have no visible
// SET_PROCESS_INFO_L in front of them; as shown, each would append another
// copy of whatever processInfo held last. Verify against the real source
// file before changing anything here.
33 Refine::Refine(ComponentType inType) : _lCore()
34 {
38  ProcessInfo processInfo;
39 
55 
56 
57 
 // Entry: TAGS / TAG_REDUCE, language mask LM_ALL.
58  SET_PROCESS_INFO_L(processInfo, TAGS, TAG_REDUCE, LM_ALL )
59  _processInfoTable.push_back( processInfo );
60 
 // Disabled SPLIT variants (ICU_BOUNDARIES, BOOST_SPLIT) kept for reference.
69  //SET_PROCESS_INFO_L(processInfo, SPLIT, ICU_BOUNDARIES, LM_ALL)
70  //_processInfoTable.push_back( processInfo );
72  //SET_PROCESS_INFO_L(processInfo, SPLIT, BOOST_SPLIT, LM_ALL)
73  //_processInfoTable.push_back( processInfo );
 // NOTE(review): no visible assignment to processInfo precedes this push
 // (listing gap just before it) -- presumably the active SPLIT entry; confirm.
76  _processInfoTable.push_back( processInfo );
78  //SET_PROCESS_INFO_L(processInfo, NORM, MECAB_NORM, LM_ALL )
79  //_processInfoTable.push_back( processInfo );
80 
 // NOTE(review): no visible assignment to processInfo precedes this push
 // (listing gap just before it); confirm which entry is intended.
85  _processInfoTable.push_back( processInfo );
86 
 // Disabled NORM / MECAB_NORM variants (LM_JAPANESE and LM_ALL).
90  //SET_PROCESS_INFO_L(processInfo, NORM, MECAB_NORM, LM_JAPANESE )
91  //SET_PROCESS_INFO_L(processInfo, NORM, MECAB_NORM, LM_ALL )
92  //_processInfoTable.push_back( processInfo );
93 
 // NOTE(review): no visible assignment to processInfo precedes this push
 // (listing gap just before it); confirm which entry is intended.
98  _processInfoTable.push_back( processInfo );
99 
 // Entry: POS / TURGLEM_POS, language mask LM_ALL.
103  SET_PROCESS_INFO_L(processInfo, POS, TURGLEM_POS, LM_ALL )
104  _processInfoTable.push_back( processInfo );
105 
 // Entry: HCRC / FAKE_HCRC, language mask LM_ALL.
109  SET_PROCESS_INFO_L(processInfo, HCRC, FAKE_HCRC, LM_ALL )
110  _processInfoTable.push_back( processInfo );
111 }
112 
113 
115 
116 
// Runs the refine pipeline over one input and returns the resulting words.
//
// Steps visible in the body:
//   1. Build an LCoreData from the input content and copy the normalization
//      id, delimiters and granularity level into it.
//   2. Pick the process-info table: the one supplied with the input if it is
//      non-empty, otherwise this->_processInfoTable.
//   3. For every entry of that table, store it in lCData._processInfo and
//      call _lCore.selectMethod(lCData).
//   4. For every (offset, length) pair in lCData._mapOffsets build one CWords
//      and one RWords from the per-offset maps held by lCData.
//   5. Return a new OutDataRefine carrying both vectors, a language mask of 0
//      and a local error code of 0.
//
// inData is accessed exclusively through inData.cast<InDataRefine>().
// NOTE(review): the result of the cast is never checked here; what happens
// when inData does not hold an InDataRefine depends on Poco::SharedPtr --
// confirm against the Poco documentation.
//
// NOTE(review): this text comes from a generated source listing whose line
// numbering has gaps inside this body; verify against the real source file
// before restructuring.
117 Poco::SharedPtr<DataBase> Refine::process(const Poco::SharedPtr<DataBase> inData)
118 {
122 #ifdef _DEBUG_
123  HCE::HCETimer hceTimer("Parsing content time: ");
124 #endif
125 
 // Working state handed to _lCore for every processing step.
129  LCoreData lCData( inData.cast<InDataRefine>()->getContent() );
130  lCData._normalizationId = inData.cast<InDataRefine>()->getNormalizationId();
131  lCData._delimiters = inData.cast<InDataRefine>()->getDelimiters();
132  lCData._granularityLevel= inData.cast<InDataRefine>()->getGranularityLevel();
 // Local, by-value copy of the table to run: the caller-supplied one when it
 // has at least one entry, the component default otherwise.
133  std::vector<ProcessInfo> ProcessInfoTable = \
134  ( inData.cast<InDataRefine>()->getProcessInfoTable().size() ) ? \
135  inData.cast<InDataRefine>()->getProcessInfoTable() :\
136  this->_processInfoTable;
137 
141  //BOOST_FOREACH( ProcessInfo& processInfo, ProcessInfoTable )
142  std::vector<ProcessInfo>::iterator PI_iter;
143  for ( PI_iter = ProcessInfoTable.begin(); PI_iter != ProcessInfoTable.end(); ++PI_iter )
144  {
145 #ifdef _DEBUG_
146  std::cout << "------------------------" << std::endl;
147 #endif
 // Hand the current step to the core, run it, then copy the resulting
 // status back into the table entry.
 // NOTE(review): ProcessInfoTable is a local copy and is not read again in
 // the visible lines below, so the stored _status has no visible consumer.
148  lCData._processInfo = *PI_iter; //processInfo;
149  _lCore.selectMethod(lCData);
150  /*processInfo*/PI_iter->_status = lCData._processInfo._status;
151  }
155  //_log_ << "Tokens count" << lCData._mapOffsets.size() << _log_end_;
156 
 // Output container plus the two word vectors that will be stored in it.
160  Poco::SharedPtr<HCE::DataBase> outDataRefine(new HCE::OutDataRefine(HCE::CT_REFINE) );
161  std::vector < CWords > cWords;
162  std::vector < RWords > rWords;
163  std::map<WORD_CONTENT_OFFSET, WORD_LENGTH>::const_iterator it;
164 #ifdef _DEBUG_
165  std::cout << "token \t norm \t offset \t length \t pos \t hCrc64" << std::endl;
166  std::cout << "----- \t ---- \t ------ \t ------ \t --- \t ------" << std::endl;
167 #endif
168 
 // One iteration per entry of _mapOffsets: it->first is used as the word
 // offset and as the key into every other lCData map, it->second as the
 // initial word length.
 // NOTE(review): all per-offset lookups use operator[]; if these members are
 // std::map, a missing key is default-inserted rather than reported --
 // confirm the member types in LCoreData.
172  for ( it=lCData._mapOffsets.begin(); it!=lCData._mapOffsets.end(); ++it )
173  {
 // Content-side word record.
177  CWords cword;
191 
192 
193 
194  cword.setBlack( lCData._mapBlack[it->first] );
195 
 // The morphology mask is used for the CWords "sim class" here and for the
 // RWords "morph change grad mask" further down.
199  cword.setSimClass( lCData._mapMorphology[it->first].getMask() );
200 
204  //cword.setHCrc( lCData._mapHCRC64[ it->first ] );
205 
209  cword.setOffset( it->first );
210 
214  cword.setSentenceNumber( lCData._mapSentences[ it->first ] );
215 
219  cword.setLingIntegrity( lCData._mapLingIntegrity[ it->first ] );
220 
224  cword.setInitWordLen( it->second );
225 
229  cword.setOriginWord( lCData._mapTokens[it->first] );
230 
234  cword.setNormWord( lCData._mapNormalizedForms[it->first] );
235 
 // The POS mask value is passed through as stored (the getMask() variant is
 // disabled).
239  //cword.setPosMask ( lCData._mapPosMasks[ it->first ].getMask() );
240  cword.setPosMask ( lCData._mapPosMasks[ it->first ]);
241 
245  cword.setWordType(lCData._mapWordType[ it->first ]);
246 
250  cWords.push_back( cword );
251 
 // Reduced word record: normalized form, CRC64 value, POS mask and
 // morphology mask for the same offset.
255  RWords rword;
264 
265 
266 
267  rword.setNormWord ( lCData._mapNormalizedForms[it->first] );
268  rword.setCrc64 ( lCData._mapHCRC64[ it->first ] );
269  //rword.setPosMask ( lCData._mapPosMasks[ it->first ].getMask() );
270  rword.setPosMask ( lCData._mapPosMasks[ it->first ] );
271  rword.setMorphChangeGradMask ( lCData._mapMorphology[ it->first ].getMask() );
272  rWords.push_back(rword);
273 
 // Debug dump of the row; the "length" column is read back from
 // _mapOffsets with the current key.
274 #ifdef _DEBUG_
275  std::cout << lCData._mapTokens[it->first] << '\t'
276  << lCData._mapNormalizedForms[it->first] << '\t'
277  << it->first << '\t'
278  << lCData._mapOffsets[it->first] << '\t'
279  //<< lCData._mapPosMasks[it->first].getMask() << '\t'
280  << lCData._mapPosMasks[it->first]<< '\t'
281  << lCData._mapHCRC64[it->first] << std::endl;
282 #endif
283  }
 // Language mask and local error code are always reported as 0 by this
 // function; nothing above computes either value.
284  unsigned int langMask = 0;
285  unsigned char err = 0;
286  outDataRefine.cast<OutDataRefine>()->setCWords(cWords);
287  outDataRefine.cast<OutDataRefine>()->setRWords(rWords);
288  outDataRefine.cast<OutDataRefine>()->setLangMask(langMask);
289  outDataRefine.cast<OutDataRefine>()->setLocalErrorCode(err);
290 
291  return outDataRefine;
292 }
293 
294 
295 } /* namespace component */
296 
297 } /* namespace HCE */