highlighter application  1.1
HCE project utils : highlighter
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
TurglemPlugin.cpp
Go to the documentation of this file.
1 
6 #include "TurglemPlugin.hpp"
7 
8 
9 namespace HCE
10 {
11 
12 namespace component
13 {
14 
15 
// Body of a member definition whose signature line is not present in this
// listing (presumably the TurglemPlugin constructor -- TODO confirm against
// the original file).
//
// Visible effects:
//  - appends one ProcessInfo object to _exported;
//  - loads the English and Russian lemmatizers stored in _lem from fixed
//    dictionary / paradigm / prediction files under /usr/share/turglem.
17 {
21  ProcessInfo processInfo;
// NOTE(review): the listing skips original line 22, so any setup of
// processInfo before it is pushed is not visible here.
23  _exported.push_back( processInfo );
24 
// The dictionary locations are hard-coded absolute paths.
// NOTE(review): whatever load_lemmatizer reports on failure is not checked
// in the visible code -- confirm whether a missing dictionary must be handled.
28  _lem[LM_ENGLISH].load_lemmatizer("/usr/share/turglem/english/dict_english.auto",
29  "/usr/share/turglem/english/paradigms_english.bin",
30  "/usr/share/turglem/english/prediction_english.auto");
31  _lem[LM_RUSSIAN].load_lemmatizer("/usr/share/turglem/russian/dict_russian.auto",
32  "/usr/share/turglem/russian/paradigms_russian.bin",
33  "/usr/share/turglem/russian/prediction_russian.auto");
34 }
35 
36 
// Empty body of a member definition whose signature line is not present in
// this listing (presumably the TurglemPlugin destructor -- TODO confirm).
// No statements are executed here.
38 {
39 }
40 
41 
// Body of the token-processing routine; its signature line is not present in
// this listing (the name and parameter list must be taken from the original
// file). The code works on an object named lCData.
//
// For every entry of lCData._mapTokens it:
//  1. reads the token's language mask from lCData._mapLanguagesMasks;
//  2. walks the Language values from ENGLISH up to (not including)
//     MAX_NUM_LANG and, for each one whose bit is set in the mask and for
//     which _lem holds a lemmatizer, lemmatizes the token text;
//  3. for each lemmatization result, ORs (1 << part-of-speech value) into
//     lCData._mapPosMasks for that token.
// When _DEBUG_ is defined, a token / POS-mask table is printed to std::cout.
43 {
47  std::map<WORD_CONTENT_OFFSET, SSTRING>::const_iterator it;
48  for ( it=lCData._mapTokens.begin(); it!=lCData._mapTokens.end(); ++it )
49  {
// it->first is the token's key (a WORD_CONTENT_OFFSET), it->second its text.
// NOTE(review): if _mapLanguagesMasks is a std::map, operator[] inserts a
// default entry for a missing key -- confirm that is intended.
53  LanguageMask languageMask = lCData._mapLanguagesMasks[it->first];
// NOTE(review): ++lang on an enumeration needs a user-provided operator++;
// presumably declared elsewhere in the project.
54  for ( Language lang=ENGLISH; lang!=MAX_NUM_LANG/*THAI*/; ++lang )
55  {
// Only languages whose bit is set in this token's mask are considered.
59  if ( languageMask&lang )
60  {
// Skip languages for which no lemmatizer is stored in _lem.
64  std::map<Language, tl::lemmatizer>::const_iterator lit;
65  lit = _lem.find( lang );
66  if ( lit!=_lem.end() )
67  {
// sz_lem stays 0 (so nothing is recorded) for any language not handled by
// the switch below.
71  size_t sz_lem = 0;
72  tl::lem_result lr;
// NOTE(review): the loop variable starts from ENGLISH, while the case labels
// here and the keys used when the lemmatizers are loaded are LM_ENGLISH /
// LM_RUSSIAN -- confirm that both sets of constants share the same values.
73  switch ( lang )
74  {
75  case LM_ENGLISH:
76  sz_lem = lit->second.lemmatize<english_utf8_adapter>(it->second.c_str(), lr);
77  break;
78  case LM_RUSSIAN:
79  sz_lem = lit->second.lemmatize<russian_utf8_adapter>(it->second.c_str(), lr);
80  break;
81  default: break;
82  }
83  if (sz_lem)
84  {
// One iteration per result reported by lemmatize().
85  for (size_t i = 0; i < sz_lem; i++)
86  {
87  //int src = -1;
88  POSMask pos;
89  //int paradigm = -1;
// nform receives the text returned by get_text for result i; in the lines
// visible here it is only consumed by commented-out code further down.
90  std::string nform;
91  switch ( lit->first )
92  {
93  case LM_ENGLISH:
94  nform = lit->second.get_text<english_utf8_adapter>(lr, i, 0);
95  break;
96  case LM_RUSSIAN:
97  nform = lit->second.get_text<russian_utf8_adapter>(lr, i, 0);
98  break;
99  default: break;
100  }
// The part-of-speech value returned by the lemmatizer is cast to POSMask.
104  pos = static_cast<POSMask>(lit->second.get_part_of_speech(lr, i, 0));
105 
109  //paradigm = lit->second.get_paradigm(lr, i);
110 
114  //src = lit->second.get_src_form(lr, i);
115 
119 
// Accumulate one bit per part-of-speech value seen for this token.
// NOTE(review): pos is used as a shift count on the int literal 1 -- confirm
// its range stays below the width of int, and that a value typed POSMask is
// really a bit index rather than an already-built mask.
123  lCData._mapPosMasks[it->first] |= 1 << pos;
// NOTE(review): bset has no declaration in the visible lines (the listing
// skips original lines 124-127); its only other visible use is commented out.
128  bset.setMask(pos);
129  //lCData._mapPosMasks[it->first] = bset;
130 
131  //lCData._mapNormalizedForms[ it->first ] = nform;
132  }
133  }
134  }
135  }
136  }
137  }
// Debug-only dump: for each key of lCData._mapOffsets, print the token text
// and its accumulated POS mask, followed by the number of offsets.
138 #ifdef _DEBUG_
139  std::cout << std::endl << "TurglemPlugin normalize" << std::endl << std::endl;
143  std::map<WORD_CONTENT_OFFSET, WORD_LENGTH>::const_iterator it1;
144  std::cout << "token \t pos" << std::endl;
145  std::cout << "----- \t ---" << std::endl;
146  for( it1=lCData._mapOffsets.begin(); it1!=lCData._mapOffsets.end(); ++it1)
147  {
148  std::cout << lCData._mapTokens[it1->first] << '\t'
149  //<< lCData._mapPosMasks[it1->first].getMask()
150  << lCData._mapPosMasks[it1->first]
151  << std::endl;
152  }
153  std::cout << "Tokens count: " << lCData._mapOffsets.size() << std::endl << std::endl;
154 #endif
155 }
156 
157 
158 } /* namespace component */
159 
160 } /* namespace HCE */
161