3 HCE project, Python bindings, Distributed Tasks Manager application. 4 The ScraperLangDetector class contains the main language-detection functionality. 7 @file ScraperLangDetector.py 8 @author Alexander Vybornyh <alexander.hce.cluster@gmail.com> 9 @link: http://hierarchical-cluster-engine.com/ 10 @copyright: Copyright © 2013-2017 IOIX Ukraine 11 @license: http://hierarchical-cluster-engine.com/license/ 22 MSG_ERROR_LANG_DETECT =
"Language detection failed. Error: %s" 26 PROPERTY_OPTION_PREFIX =
"prefix" 27 PROPERTY_OPTION_SUFFIX =
"suffix" 28 PROPERTY_OPTION_TAGS =
"tags" 29 PROPERTY_OPTION_MAPS =
"maps" 30 PROPERTY_OPTION_SIZE =
"size" 32 DEFAULT_VALUE_OPTION_PREFIX =
"" 33 DEFAULT_VALUE_OPTION_SUFFIX =
"_lang" 34 DEFAULT_VALUE_OPTION_TAGS = []
35 DEFAULT_VALUE_OPTION_MAPS = {
"en": [
"fr",
"nl",
"ro",
"af",
"ca",
"it",
"da",
"tl",
"et",
"cy",
"sv",
"id",
"es",
"*" ], \
36 "ja": [
"ja",
"zh",
"za" ], \
37 "ru": [
"ru",
"uk" ], \
40 DEFAULT_VALUE_OPTION_SIZE = 1024
41 DEFAULT_VALUE_SUMMARY_LANG =
"en" 43 DEFAULT_VALUE_LANG_MAPPING =
'*' 45 DEFAULT_VALUE_TAGS_NAMES = [CONSTS.TAG_MEDIA, CONSTS.TAG_TITLE, CONSTS.TAG_LINK, CONSTS.TAG_DESCRIPTION, CONSTS.TAG_PUB_DATE, CONSTS.TAG_DC_DATE, \
46 CONSTS.TAG_AUTHOR, CONSTS.TAG_CONTENT_UTF8_ENCODED, CONSTS.TAG_KEYWORDS]
48 TAGS_EXTENDED_VALUE_ALL =
"*" 49 TAGS_EXTENDED_VALUE_SUMMARY =
"&" 51 SCRAPER_RESULT_TAG_OPTION_DATA =
"data" 52 SCRAPER_RESULT_TAG_OPTION_LANG =
"lang" 53 SCRAPER_RESULT_TAG_OPTION_SUMMARY_LANG =
"summary_lang" 64 if isinstance(scraperLangDetectProperty, dict):
104 if incomeBuf
is not None and incomeBuf !=
"":
106 from langdetect
import detect
107 ret = detect(incomeBuf.decode(
'utf-8')).replace(
'-',
',')
108 except Exception, err:
110 log.error(ScraperLangDetector.MSG_ERROR_LANG_DETECT, str(err))
111 log.debug(Utils.getTracebackInfo())
125 if response
is not None and tagName
in response.tags:
126 if isinstance(response.tags[tagName], basestring):
127 ret = response.tags[tagName]
129 elif isinstance(response.tags[tagName], dict)
and \
130 ScraperLangDetector.SCRAPER_RESULT_TAG_OPTION_DATA
in response.tags[tagName]:
131 if isinstance(response.tags[tagName][ScraperLangDetector.SCRAPER_RESULT_TAG_OPTION_DATA], basestring):
132 ret = response.tags[tagName][ScraperLangDetector.SCRAPER_RESULT_TAG_OPTION_DATA]
134 elif isinstance(response.tags[tagName][ScraperLangDetector.SCRAPER_RESULT_TAG_OPTION_DATA], list):
136 for elem
in response.tags[tagName][ScraperLangDetector.SCRAPER_RESULT_TAG_OPTION_DATA]:
151 buff = text
if len(text) <= self.
size else text[:self.
size]
160 log.debug(
"buffer len = %s was trancated to len = %s used limit = %s", str(len(text)), str(len(buff)), str(self.
size))
180 lang = ScraperLangDetector.langDetect(text,
False, log)
182 log.debug(
"for '%s' was detected '%s'", str(tagName), str(lang))
184 if lang
is not None and isinstance(response.tags[tagName], dict):
186 response.tags[tagName][fieldName] = lang
211 if default
is not None:
223 return len([s
for s
in src
if val
in s]) > 0
232 if response
is not None:
236 for tagName
in response.tags:
243 localTextResult =
None 244 for tagName
in response.tags:
247 if localTextValue
is not None:
248 localTextResult += localTextValue
249 localTextResult +=
' ' 250 localTextResult = localTextResult.strip()
252 for tagName
in response.tags:
256 elif isinstance(self.
tagsList, list):
292 elif isinstance(self.
tagsList, list):
295 for tagName
in tagsList:
310 if response
is not None:
311 for tagName, tagValue
in response.tags.items():
315 log.debug(
"Summary lang '%s' was extracted from field '%s'", str(summaryLang), str(tagName))
def __isExistValue(self, src, val)
def __langMapping(self, lang)
def __retTagsText(self, tagName, response)
string TAGS_EXTENDED_VALUE_SUMMARY
def process(self, response, log=None)
list DEFAULT_VALUE_OPTION_TAGS
string PROPERTY_OPTION_PREFIX
def __makeTagName(self, tagName)
string DEFAULT_VALUE_LANG_MAPPING
string PROPERTY_OPTION_MAPS
string DEFAULT_VALUE_SUMMARY_LANG
string DEFAULT_VALUE_OPTION_PREFIX
def __truncateBuffer(self, text, log=None)
def getSummaryLang(self, response, log=None)
def __setLangField(self, text, tagName, fieldName, response, log=None)
string DEFAULT_VALUE_OPTION_SUFFIX
string TAGS_EXTENDED_VALUE_ALL
string PROPERTY_OPTION_SUFFIX
def __init__(self, scraperLangDetectProperty)
list DEFAULT_VALUE_TAGS_NAMES
def langDetect(incomeBuf, convertToFullName=True, log=None)
int DEFAULT_VALUE_OPTION_SIZE
def getLangTagsNames(self)
string PROPERTY_OPTION_TAGS
string SCRAPER_RESULT_TAG_OPTION_LANG
string SCRAPER_RESULT_TAG_OPTION_SUMMARY_LANG
dictionary DEFAULT_VALUE_OPTION_MAPS
string PROPERTY_OPTION_SIZE