highlighter application  1.1
HCE project utils : highlighter
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
CldPlugin.cpp
Go to the documentation of this file.
1 
14 #include <cld/ext_lang_enc.h>
15 #include <iostream>
16 #include <cld/encodings/proto/encodings.pb.h>
17 
18 #include "CldPlugin.hpp"
19 #include "CWords.hpp"
20 
21 namespace HCE
22 {
23 
24 namespace component
25 {
26 
27 
// NOTE(review): the signature line of this definition is missing from this
// listing; the body only initialises members, so it is presumably the
// CldPlugin constructor -- confirm against CldPlugin.hpp.
//
// Populates three lookup tables (toLang, _adjLang, toLangByString) and then
// appends one default-constructed ProcessInfo to _exported.
29 {
// Start with every language id in [0, MAX_NUM_LANG) mapped to LM_NONE.
34  for ( unsigned i=0; i<MAX_NUM_LANG; ++i)
35  {
36  toLang[ static_cast<Language>(i) ] = LM_NONE;
37  }
38 
// Override selected ids with a concrete language mask. The ids are raw integers
// cast to Language. NOTE(review): their symbolic names are not visible here --
// check them against the Language enum of the CLD version in use.
43  toLang[ static_cast<Language>(0) ] = LM_ENGLISH;
44  toLang[ static_cast<Language>(7) ] = LM_ENGLISH;
45  toLang[ static_cast<Language>(31) ] = LM_ENGLISH;
46  toLang[ static_cast<Language>(32) ] = LM_ENGLISH;
47  toLang[ static_cast<Language>(40) ] = LM_ENGLISH;
48  toLang[ static_cast<Language>(8) ] = LM_JAPANESE;
49  toLang[ static_cast<Language>(13) ] = LM_RUSSIAN;
50  toLang[ static_cast<Language>(27) ] = LM_RUSSIAN;
51 
// Ids that map to ::RUSSIAN in _adjLang. Presumably consumed by
// adjustLanguage(), which is not defined in this listing -- confirm.
52  _adjLang[ static_cast<Language>(13) ] = ::RUSSIAN;
53  _adjLang[ static_cast<Language>(27) ] = ::RUSSIAN;
54  _adjLang[ static_cast<Language>(29) ] = ::RUSSIAN;
55  _adjLang[ static_cast<Language>(34) ] = ::RUSSIAN;
56  _adjLang[ static_cast<Language>(36) ] = ::RUSSIAN;
57  _adjLang[ static_cast<Language>(37) ] = ::RUSSIAN;
58  _adjLang[ static_cast<Language>(47) ] = ::RUSSIAN;
59  _adjLang[ static_cast<Language>(50) ] = ::RUSSIAN;
60 
// CATALAN (referenced by its enum name rather than a raw id) gets the English mask.
64  toLang[ static_cast<Language>(CATALAN) ] = LM_ENGLISH;
65 
// String-keyed table: each language code on the left is mapped to one of
// "en", "ja" or "ru". In the visible part of this file the table is only
// referenced from commented-out code.
70  toLangByString["en"] = "en";
71  toLangByString["ms"] = "en";
72  toLangByString["gl"] = "en";
73  toLangByString["it"] = "en";
74  toLangByString["fil"] = "en";
75  toLangByString["sv"] = "en";
76  toLangByString["ja"] = "ja";
77  toLangByString["zh"] = "ja";
78  toLangByString["zh-TW"] = "ja";
// Codes mapped to "ru".
// NOTE(review): "ur" is the ISO 639-1 code for Urdu; mapping it to "ru" looks
// suspicious -- confirm it is intended.
82  toLangByString["ru"] = "ru";
83  toLangByString["be"] = "ru";
84  toLangByString["bg"] = "ru";
85  toLangByString["mk"] = "ru";
86  toLangByString["sr"] = "ru";
87  toLangByString["uk"] = "ru";
88  toLangByString["ur"] = "ru";
// String-table counterpart of the CATALAN entry above.
92  toLangByString["ca"] = "en";
93 
// Append one default-constructed ProcessInfo to _exported.
97  ProcessInfo processInfo;
99  _exported.push_back( processInfo );
100 }
101 
102 
104 
105 
106 template <class T>
107 inline
108 const Language detectLanguage(const T& token)
109 {
110  const char* src = token.c_str();
111  bool is_plain_text = true;
112  bool do_allow_extended_languages = true;
113  bool do_pick_summary_language = false;
114  bool do_remove_weak_matches = false;
115  bool is_reliable;
116  const char* tld_hint = NULL;
117  int encoding_hint = UNKNOWN_ENCODING;
118  Language language_hint = UNKNOWN_LANGUAGE;
119 
120  double normalized_score3[3];
121  Language language3[3];
122  int percent3[3];
123  int text_bytes;
124 
125  Language lang;
126  lang = CompactLangDet::DetectLanguage(0,
127  src, strlen(src),
128  is_plain_text,
129  do_allow_extended_languages,
130  do_pick_summary_language,
131  do_remove_weak_matches,
132  tld_hint,
133  encoding_hint,
134  language_hint,
135  language3,
136  percent3,
137  normalized_score3,
138  &text_bytes,
139  &is_reliable);
140  return lang;
141 }
142 
143 
// NOTE(review): the signature line of this definition is missing from this
// listing; the body works on an object named lCData -- confirm its type and
// the function name against CldPlugin.hpp.
//
// Detects the language of the whole content, then of every token in
// lCData._mapTokens. For each token offset it fills lCData._mapWordType,
// lCData._mapLanguages, lCData._mapLanguagesMasks and
// lCData._mapVectorsOfLanguages.
145 {
// Language of the complete content, kept in _basis_lang_mask.
// NOTE(review): the stored value is the Language returned by detectLanguage(),
// not an entry of toLang -- confirm that is what the mask expression below expects.
149  _basis_lang_mask = detectLanguage(lCData._content);
150 #ifdef _DEBUG_
151  std::cout << "basic language: " << _basis_lang_mask<<std::endl;
152 #endif
153  std::map<WORD_CONTENT_OFFSET, SSTRING>::const_iterator it;
// it->first is the token's offset key, it->second the token text.
154  for ( it=lCData._mapTokens.begin(); it!=lCData._mapTokens.end(); ++it )
155  {
159  Language lang = detectLanguage( it->second );
// adjustLanguage() is not defined in this listing; presumably it rewrites lang
// in place using _adjLang -- confirm.
163  adjustLanguage(lang);
// Disabled earlier variant that stored a language string looked up in toLangByString.
167  //SSTRING iso639_1 = LanguageCode(lang);
168  //std::string lstr = toLangByString[iso639_1];
169  //lCData._mapLanguages[ it->first ] = (lstr=="") ? "en" : lstr;
170 
// Undetected tokens default to DELIMITER. langAdditionDetection() then gets
// the stored word type by reference: if the token contains a digit it
// overwrites the type with NUMBER or WORD and returns ::ENGLISH, otherwise
// it returns UNKNOWN_LANGUAGE and the type stays DELIMITER.
// Every detected token is a WORD.
171  if(lang==UNKNOWN_LANGUAGE) {
172  lCData._mapWordType[it->first ] = WORD_TYPE::DELIMITER;
173  lang = langAdditionDetection(it->second, lCData._mapWordType[it->first ]);
174  } else {
175  lCData._mapWordType[it->first ] = WORD_TYPE::WORD;
176  }
// Per-offset language: whatever correctLanguage() (not defined in this listing) returns for lang.
180  lCData._mapLanguages[ it->first ] = correctLanguage(lang);
// OR into the per-offset mask. << binds tighter than |, so the value OR-ed in
// is toLang[lang] | (_basis_lang_mask << 1).
181  lCData._mapLanguagesMasks[ it->first ] |= toLang[lang] | _basis_lang_mask<<1;
// Group token offsets by toLang[lang] (cast to Language for use as the key).
185  lCData._mapVectorsOfLanguages[ static_cast<Language>( toLang[lang] ) ].push_back( it->first );
186 
187 
188  }
// Debug-only dump: one line per key of lCData._mapOffsets with the token text
// and the language stored for that offset, followed by the number of entries.
189 #ifdef _DEBUG_
190  std::cout << std::endl << "CLD plugin:" << std::endl << std::endl;
194  std::map<WORD_CONTENT_OFFSET, WORD_LENGTH>::const_iterator it1;
195  std::cout << "token \t language" << std::endl;
196  std::cout << "----- \t ----" << std::endl;
197  for( it1=lCData._mapOffsets.begin(); it1!=lCData._mapOffsets.end(); ++it1)
198  {
199  std::cout << lCData._mapTokens[it1->first] << '\t'
200  << lCData._mapLanguages[it1->first] << std::endl;
201  }
202  std::cout << "Tokens count: " << lCData._mapOffsets.size() << std::endl << std::endl;
203 #endif
204 }
205 
210 Language CldPlugin::langAdditionDetection(const std::string &str, WORD_TYPE &wType)
211 {
212  Language ret = UNKNOWN_LANGUAGE;
213  for(unsigned int i = 0; i < str.size(); i++)
214  {
215  if(str[i] >= '0' && str[i] <= '9')
216  {
217  ret = ::ENGLISH;
218  wType = (i == 0) ? WORD_TYPE::NUMBER : WORD_TYPE::WORD;
219  break;
220  }
221  }
222  return ret;
223 }
224 
225 
226 } /* namespace component */
227 
228 } /* namespace HCE */