# -*- coding: utf-8 -*-
"""
HCE project, Python bindings, Crawler application.
ScraperLangDetector tests.

@file ftest_ScraperLangDetector.py
@author Alexander Vybornyh <alexander.hce.cluster@gmail.com>
@link: http://hierarchical-cluster-engine.com/
@copyright: Copyright © 2017 IOIX Ukraine
@license: http://hierarchical-cluster-engine.com/license/
"""

# NOTE(review): the import section of this file is not visible in this
# excerpt; `logging` must already be imported before this point.

# Logger named 'test', accepting everything from DEBUG upwards.
log = logging.getLogger('test')
log.setLevel(logging.DEBUG)

# Stream handler, also at DEBUG level.
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)

# Record layout: "<timestamp> - <level name> - <message>", with the
# timestamp rendered as YYYY-MM-DD HH:MM:SS.
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s',
                              '%Y-%m-%d %H:%M:%S')
ch.setFormatter(formatter)

# NOTE(review): nothing visible here attaches `ch` to `log`; confirm that a
# `log.addHandler(ch)` call exists in the lines elided from this excerpt.
def run(self, response):
    """Pass `response` to the language detector and log the reported tag names.

    Calls `langDetector.process(response, self.logger)`, reads back the
    results of `getLangTags()` and `getLangTagsNames()`, and writes the tag
    names to `self.logger` at DEBUG level.

    @param response: object handed unchanged to `langDetector.process`
    """
    # NOTE(review): the statement that creates `langDetector` (original
    # lines 55-56) is missing from this excerpt; it has to be restored
    # ahead of the call below for this method to run.
    langDetector.process(response, self.logger)

    # NOTE(review): `langTagsDict` is not used in the visible lines;
    # original line 60 is elided and may consume it, so the call is kept.
    langTagsDict = langDetector.getLangTags()
    langTagsNames = langDetector.getLangTagsNames()

    self.logger.debug("langTagsNames: %s", varDump(langTagsNames))
if __name__ == '__main__':
    # Four alternative "SCRAPER_LANG_DETECT" configurations. Each assignment
    # rebinds `properties`, so only the LAST one is in effect when the test
    # runs; the earlier ones are kept as ready-made variants.

    # Variant 1: prefix + suffix, three tags.
    properties = {
        "SCRAPER_LANG_DETECT": {
            "prefix": "lang_",
            "suffix": "_lang",
            "tags": ["title", "content_encoded", "description"],
        }
    }

    # Variant 2: suffix only, three tags.
    properties = {
        "SCRAPER_LANG_DETECT": {
            "suffix": "_lang",
            "tags": ["title", "content_encoded", "description"],
        }
    }

    # Variant 3: suffix, a single tag, a "maps" table and a "size" value.
    properties = {
        "SCRAPER_LANG_DETECT": {
            "suffix": "_lang",
            "tags": ["content_encoded"],
            "maps": {
                "en": ["fr", "es", "*"],
                "ja": ["ja-123", "zh", "za"],
                "ru": ["ru", "uk"],
                "pl": ["pl"],
                "de": ["de"],
            },
            "size": 100,
        }
    }

    # Variant 4 (the one actually used): a single tag, nothing else.
    properties = {
        "SCRAPER_LANG_DETECT": {
            "tags": ["content_encoded"],
        }
    }

    # NOTE(review): the statement that creates `response` (original lines
    # 71-72) is missing from this excerpt and must be restored here.

    # Sample 'title' tag with English text.
    response.tags['title'] = {
        'xpath': '',
        'extractor': 'GooseExtractor',
        'lang_suffix': '_language',
        'lang': 'en',
        'type': None,
        'data': 'None of the victims really wanted to die',
        'name': 'title',
    }

    # Sample 'content_encoded' tag: the text is Japanese while 'lang' is
    # 'en' -- presumably on purpose, to exercise detection; confirm.
    response.tags['content_encoded'] = {
        'xpath': '',
        'extractor': 'GooseExtractor',
        'lang_suffix': '_language',
        'lang': 'en',
        'type': None,
        'data': '東京都江東区の路上で職業不詳太田智子さん(47)の遺体が見つかった事件で、警視庁捜査1課は3日、死体遺棄容疑で、交際相手の大田区職員上田一美容疑者(55)=大田区久が原=を逮捕した。「首をネクタイで絞めて殺し、死体を捨てた」と話しているといい、殺人容疑でも捜査する。',
        'name': 'content_encoded',
    }

    # NOTE(review): the statement that creates `testScraperLangDetector`
    # (original lines 75-77) is missing from this excerpt and must be
    # restored before the call below.
    testScraperLangDetector.run(response)
# Signatures referenced by this test (their bodies are not part of this excerpt):
#   def __init__(self, properties, log)
#   def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)