14 #include <cld/ext_lang_enc.h>
16 #include <cld/encodings/proto/encodings.pb.h>
// Language-folding table: each entry maps a language code (the key) to the
// code it is treated as (the value). Only three target values appear in the
// entries below: "en", "ja" and "ru".
// NOTE(review): presumably the targets are the "basic" languages handled by
// later processing -- confirm against the code that reads toLangByString.
//
// Entries folded into "en".
70 toLangByString[
"en"] =
"en";
71 toLangByString[
"ms"] =
"en";
72 toLangByString[
"gl"] =
"en";
73 toLangByString[
"it"] =
"en";
74 toLangByString[
"fil"] =
"en";
75 toLangByString[
"sv"] =
"en";
// Entries folded into "ja"; note that both "zh" and "zh-TW" land here.
76 toLangByString[
"ja"] =
"ja";
77 toLangByString[
"zh"] =
"ja";
78 toLangByString[
"zh-TW"] =
"ja";
// Entries folded into "ru".
82 toLangByString[
"ru"] =
"ru";
83 toLangByString[
"be"] =
"ru";
84 toLangByString[
"bg"] =
"ru";
85 toLangByString[
"mk"] =
"ru";
86 toLangByString[
"sr"] =
"ru";
87 toLangByString[
"uk"] =
"ru";
// NOTE(review): "ur" is the ISO 639-1 code for Urdu, which does not appear to
// belong with the other keys of this group (they look like Cyrillic-script
// languages). Confirm that folding it into "ru" is intended and not a typo.
88 toLangByString[
"ur"] =
"ru";
// "ca" is folded into "en" as well, but is listed apart from the other "en"
// entries above.
92 toLangByString[
"ca"] =
"en";
// Prepare the inputs for a CompactLangDet::DetectLanguage call on `token`.
// src aliases the character buffer returned by token.c_str().
110 const char* src = token.c_str();
// Detection switches. The three do_* flags are handed to DetectLanguage in
// the call below; is_plain_text is declared here but its use is not visible.
111 bool is_plain_text =
true;
112 bool do_allow_extended_languages =
true;
113 bool do_pick_summary_language =
false;
114 bool do_remove_weak_matches =
false;
// Hint variables are initialised to NULL / UNKNOWN_ENCODING /
// UNKNOWN_LANGUAGE, i.e. apparently no hint is supplied to the detector.
116 const char* tld_hint = NULL;
117 int encoding_hint = UNKNOWN_ENCODING;
118 Language language_hint = UNKNOWN_LANGUAGE;
// Storage for three scores.
// NOTE(review): presumably an output of DetectLanguage; the argument that
// would show this is not visible here -- confirm.
120 double normalized_score3[3];
// Run the detector and keep its result in lang. Only part of the argument
// list is visible here.
126 lang = CompactLangDet::DetectLanguage(0,
129 do_allow_extended_languages,
130 do_pick_summary_language,
131 do_remove_weak_matches,
// Trace output: write the current value of _basis_lang_mask to stdout.
151 std::cout <<
"basic language: " << _basis_lang_mask<<std::endl;
// Read-only iterator over a map keyed by word content offset with string
// values. The loop that advances it is not visible here.
153 std::map<WORD_CONTENT_OFFSET, SSTRING>::const_iterator it;
// adjustLanguage is defined elsewhere; its return value (if any) is not used.
// NOTE(review): presumably it rewrites lang in place -- confirm.
163 adjustLanguage(lang);
// Fallback: if lang equals UNKNOWN_LANGUAGE, call langAdditionDetection with
// the current entry's string (it->second) and the word-type slot stored under
// the same offset (it->first), and take its result as the language.
// NOTE(review): if _mapWordType is a std::map, operator[] inserts a
// default-constructed entry when the key is absent -- confirm this is intended.
171 if(lang==UNKNOWN_LANGUAGE) {
173 lang = langAdditionDetection(it->second, lCData.
_mapWordType[it->first ]);
// Debug dump to stdout: a "CLD plugin:" banner surrounded by blank lines,
// a two-column header, token rows, and finally the token count.
190 std::cout << std::endl <<
"CLD plugin:" << std::endl << std::endl;
// Read-only iterator over a map from word content offset to word length;
// its key (it1->first) is used below to index lCData._mapTokens.
194 std::map<WORD_CONTENT_OFFSET, WORD_LENGTH>::const_iterator it1;
// Column header and separator line.
195 std::cout <<
"token \t language" << std::endl;
196 std::cout <<
"----- \t ----" << std::endl;
// Print the token stored under the current offset, followed by a tab.
// NOTE(review): the remainder of this statement (presumably the language
// column) and the loop around it are not visible here.
199 std::cout << lCData.
_mapTokens[it1->first] <<
'\t'
// The count reported is the size of lCData._mapOffsets.
202 std::cout <<
"Tokens count: " << lCData.
_mapOffsets.size() << std::endl << std::endl;
// Additional detection step that inspects str character by character and
// reports a classification through wType.
// @param str    text to inspect.
// @param wType  out-parameter; assigned WORD_TYPE::NUMBER or WORD_TYPE::WORD
//               when an ASCII digit is encountered (see below).
// @return a Language value; the return statement is not visible here.
210 Language CldPlugin::langAdditionDetection(
const std::string &str,
WORD_TYPE &wType)
// Walk over every character of str by index.
213 for(
unsigned int i = 0; i < str.size(); i++)
// True when the current character is an ASCII digit '0'..'9'.
215 if(str[i] >=
'0' && str[i] <=
'9')
// A digit in the first position marks the string as NUMBER; a digit at any
// later position marks it as WORD.
// NOTE(review): whether the scan stops after this assignment is not visible
// here; if it continues, a later digit would overwrite NUMBER with WORD.
218 wType = (i == 0) ? WORD_TYPE::NUMBER : WORD_TYPE::WORD;