HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
ftest_ScraperLangDetector.py
Go to the documentation of this file.
1 #!/usr/bin/python
2 # coding: utf-8
3 """
4 HCE project, Python bindings, Crawler application.
5 ScraperLangDetector tests.
6 
7 @package: dc
8 @file ftest_ScraperLangDetector.py
9 @author Alexander Vybornyh <alexander.hce.cluster@gmail.com>
10 @link: http://hierarchical-cluster-engine.com/
11 @copyright: Copyright &copy; 2017 IOIX Ukraine
12 @license: http://hierarchical-cluster-engine.com/license/
13 @since: 0.1
14 """
15 
16 import os
17 import sys
18 import logging
19 
20 from dc_processor.ScraperLangDetector import ScraperLangDetector
21 from dc_processor.scraper_result import Result
22 import dc_processor.Constants as CONSTS
23 from app.Utils import varDump
24 
def getLogger():
  """Return the 'test' logger configured for DEBUG output on the console.

  The logger level is set to DEBUG and a single StreamHandler with the
  format '<time> - <level> - <message>' is attached.

  @return: the configured logging.Logger instance named 'test'
  """
  log = logging.getLogger('test')
  log.setLevel(logging.DEBUG)

  # logging.getLogger() hands back the same object for the same name, so the
  # console handler is attached only when the logger has none yet; adding one
  # on every call would print each message once per call made so far.
  if not log.handlers:
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    ch.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s', '%Y-%m-%d %H:%M:%S'))
    log.addHandler(ch)

  return log
44 
45 
class TestScraperLangDetector(object):
  """Functional test driver that runs ScraperLangDetector over a response.

  NOTE(review): the class declaration line is absent from this listing; the
  name is taken from the instantiation in the __main__ section and the
  `object` base is an assumption - confirm against the original file.
  """

  # # Initialization
  def __init__(self, properties, log):
    """Store the test configuration.

    @param properties: dict of properties; run() reads the entry stored
                       under CONSTS.LANG_PROP_NAME
    @param log: logger handed to the detector and used for debug output
    """
    self.properties = properties
    self.logger = log


  # # Test processing method
  def run(self, response):
    """Run the language detector on the response and log what it reports.

    @param response: object passed to ScraperLangDetector.process()
                     (a scraper Result in the __main__ section)
    """
    langDetector = ScraperLangDetector(self.properties[CONSTS.LANG_PROP_NAME])
    langDetector.process(response, self.logger)

    # Read both results before logging, in the same order as before.
    langTagsDict = langDetector.getLangTags()
    langTagsNames = langDetector.getLangTagsNames()
    self.logger.debug("langTagsDict: %s", varDump(langTagsDict))
    self.logger.debug("langTagsNames: %s", varDump(langTagsNames))
    # self.logger.debug("response: %s", varDump(response, stringifyType=0))
63 
64 
if __name__ == '__main__':

  # Detector configurations this test can be run with. They used to be four
  # consecutive assignments to `properties`, of which only the last one ever
  # took effect; naming them keeps the alternatives without the dead stores.
  configurations = {
    "prefix_and_suffix": {"prefix":"lang_", "suffix":"_lang", "tags":["title", "content_encoded", "description"]},
    "suffix_only": {"suffix":"_lang", "tags":["title", "content_encoded", "description"]},
    "maps_and_size": {"suffix":"_lang", "tags":["content_encoded"], "maps":{"en":["fr", "es", "*"], "ja":["ja-123", "zh", "za"], "ru":["ru", "uk"], "pl":["pl"], "de":["de"]}, "size":100},
    "tags_only": {"tags":["content_encoded"]},
  }

  # "tags_only" is the configuration that was effectively in use before.
  properties = {"SCRAPER_LANG_DETECT": configurations["tags_only"]}

  # Sample scraper result: an English title and a Japanese article body.
  response = Result(None, None)
  response.tags['title'] = {'xpath': '', 'extractor': 'GooseExtractor', 'lang_suffix': '_language', 'lang': 'en', 'type': None, 'data': 'None of the victims really wanted to die', 'name': 'title'}
  response.tags['content_encoded'] = {'xpath': '', 'extractor': 'GooseExtractor', 'lang_suffix': '_language', 'lang': 'en', 'type': None, 'data': '東京都江東区の路上で職業不詳太田智子さん(47)の遺体が見つかった事件で、警視庁捜査1課は3日、死体遺棄容疑で、交際相手の大田区職員上田一美容疑者(55)=大田区久が原=を逮捕した。「首をネクタイで絞めて殺し、死体を捨てた」と話しているといい、殺人容疑でも捜査する。', 'name': 'content_encoded'}
  # Alternative very short sample:
  # response.tags['content_encoded'] = {'xpath': '', 'extractor': 'GooseExtractor', 'lang_suffix': '_language', 'lang': 'en', 'type': None, 'data': '東京', 'name': 'content_encoded'}

  testScraperLangDetector = TestScraperLangDetector(properties=properties, log=getLogger())
  testScraperLangDetector.run(response)
79 
80 
81 
82 
83 
84 
85 
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)
Definition: Utils.py:410