HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
ScraperLangDetector.py
Go to the documentation of this file.
1 # coding: utf-8
2 """
3 HCE project, Python bindings, Distributed Tasks Manager application.
4 ScraperLangDetector class: implements the main language detection functionality.
5 
6 @package: dc_processor
7 @file ScraperLangDetector.py
8 @author Alexander Vybornyh <alexander.hce.cluster@gmail.com>
9 @link: http://hierarchical-cluster-engine.com/
10 @copyright: Copyright &copy; 2013-2017 IOIX Ukraine
11 @license: http://hierarchical-cluster-engine.com/license/
12 @since: 0.1
13 """
14 
15 import app.Utils as Utils
16 import dc_processor.Constants as CONSTS
17 
18 
19 
# # Detects the language of the scraper result tags
#
# Reads the text of the tags of a scraper result, detects its language with the
# `langdetect` package, maps the detected code through the configured `maps`
# table and stores the mapped code back into the tag dictionaries.
class ScraperLangDetector(object):
    # # Constants used in class
    MSG_ERROR_LANG_DETECT = "Language detection failed. Error: %s"

    # # Properties options constants
    PROPERTY_OPTION_PREFIX = "prefix"
    PROPERTY_OPTION_SUFFIX = "suffix"
    PROPERTY_OPTION_TAGS = "tags"
    PROPERTY_OPTION_MAPS = "maps"
    PROPERTY_OPTION_SIZE = "size"

    DEFAULT_VALUE_OPTION_PREFIX = ""
    DEFAULT_VALUE_OPTION_SUFFIX = "_lang"
    DEFAULT_VALUE_OPTION_TAGS = []
    # Target language -> list of detected codes mapped onto it; the list holding
    # DEFAULT_VALUE_LANG_MAPPING ('*') supplies the fallback target language.
    DEFAULT_VALUE_OPTION_MAPS = {
        "en": ["fr", "nl", "ro", "af", "ca", "it", "da", "tl", "et", "cy", "sv", "id", "es", "*"],
        "ja": ["ja", "zh", "za"],
        "ru": ["ru", "uk"],
        "pl": ["pl"],
        "de": ["de"],
    }
    DEFAULT_VALUE_OPTION_SIZE = 1024
    DEFAULT_VALUE_SUMMARY_LANG = "en"

    DEFAULT_VALUE_LANG_MAPPING = '*'

    DEFAULT_VALUE_TAGS_NAMES = [CONSTS.TAG_MEDIA, CONSTS.TAG_TITLE, CONSTS.TAG_LINK, CONSTS.TAG_DESCRIPTION,
                                CONSTS.TAG_PUB_DATE, CONSTS.TAG_DC_DATE, CONSTS.TAG_AUTHOR,
                                CONSTS.TAG_CONTENT_UTF8_ENCODED, CONSTS.TAG_KEYWORDS]

    # Special values of the "tags" option
    TAGS_EXTENDED_VALUE_ALL = "*"
    TAGS_EXTENDED_VALUE_SUMMARY = "&"

    # Keys used inside a tag dictionary of the scraper result
    SCRAPER_RESULT_TAG_OPTION_DATA = "data"
    SCRAPER_RESULT_TAG_OPTION_LANG = "lang"
    SCRAPER_RESULT_TAG_OPTION_SUMMARY_LANG = "summary_lang"

    # # Initialization
    #
    # @param scraperLangDetectProperty - dictionary with optional keys "prefix", "suffix",
    #        "tags", "maps" and "size"; any other type leaves all defaults in place
    def __init__(self, scraperLangDetectProperty):
        # Start from the class-level defaults so that every attribute exists even
        # when an option is omitted. Mutable defaults are copied so an instance
        # never shares (and cannot modify) the class-level objects.
        self.prefix = self.DEFAULT_VALUE_OPTION_PREFIX
        self.suffix = self.DEFAULT_VALUE_OPTION_SUFFIX
        self.tagsList = list(self.DEFAULT_VALUE_OPTION_TAGS)
        self.maps = dict(self.DEFAULT_VALUE_OPTION_MAPS)
        self.size = self.DEFAULT_VALUE_OPTION_SIZE
        # tag name -> mapped language, filled while processing
        self.detectedLangs = {}

        if isinstance(scraperLangDetectProperty, dict):
            options = scraperLangDetectProperty

            # set prefix options
            if self.PROPERTY_OPTION_PREFIX in options:
                self.prefix = options[self.PROPERTY_OPTION_PREFIX]

            # set suffix options
            if self.PROPERTY_OPTION_SUFFIX in options:
                self.suffix = options[self.PROPERTY_OPTION_SUFFIX]

            # set tags options
            if self.PROPERTY_OPTION_TAGS in options:
                self.tagsList = options[self.PROPERTY_OPTION_TAGS]

            # set maps options
            if self.PROPERTY_OPTION_MAPS in options:
                self.maps = options[self.PROPERTY_OPTION_MAPS]

            # set size options
            if self.PROPERTY_OPTION_SIZE in options:
                self.size = int(options[self.PROPERTY_OPTION_SIZE])

    # # Make tag name use prefix and suffix
    #
    # @param tagName - tag name
    # @return result full tag name
    def __makeTagName(self, tagName):
        return self.prefix + tagName + self.suffix

    # # Check whether the "tags" option selects a special marker
    #
    # @param marker - marker value (TAGS_EXTENDED_VALUE_ALL or TAGS_EXTENDED_VALUE_SUMMARY)
    # @return True if the option is a string equal to the marker or a list containing it
    def __tagsOptionHas(self, marker):
        if isinstance(self.tagsList, basestring):
            return self.tagsList == marker
        if isinstance(self.tagsList, list):
            return marker in self.tagsList
        return False

    # # lang detect
    #
    # @param incomeBuf - income buffer data, UTF-8 encoded byte string or unicode string
    # @param convertToFullName - boolean flag convert to full name (not used by this implementation)
    # @param log - logger instance
    # @return detected lang as string or None otherwise
    @staticmethod
    def langDetect(incomeBuf, convertToFullName=True, log=None):  # pylint: disable=unused-argument
        ret = None

        if incomeBuf is not None and incomeBuf != "":
            try:
                from langdetect import detect
                # Decode byte strings only: calling decode() on a unicode object makes
                # Python 2 encode it to ASCII first, which fails on non-ASCII text.
                text = incomeBuf.decode('utf-8') if isinstance(incomeBuf, bytes) else incomeBuf
                ret = detect(text).replace('-', ',')
            except Exception as err:  # best effort: any failure means "not detected"
                if log is not None:
                    log.error(ScraperLangDetector.MSG_ERROR_LANG_DETECT, str(err))
                    log.debug(Utils.getTracebackInfo())

        return ret

    # # extract tags text data
    #
    # @param tagName - tag name
    # @param response - scraper result instance
    # @return text data for tag name or None if the tag is absent or has no text
    def __retTagsText(self, tagName, response):
        if response is None or tagName not in response.tags:
            return None

        tagValue = response.tags[tagName]
        if isinstance(tagValue, basestring):
            return tagValue

        if isinstance(tagValue, dict) and self.SCRAPER_RESULT_TAG_OPTION_DATA in tagValue:
            data = tagValue[self.SCRAPER_RESULT_TAG_OPTION_DATA]
            if isinstance(data, basestring):
                return data
            if isinstance(data, list):
                # list of text fragments: join with single spaces
                return ' '.join(data).strip()

        return None

    # # truncate buffer
    #
    # @param text - text buffer
    # @param log - logger instance
    # @return truncated text buffer
    def __truncateBuffer(self, text, log=None):
        buff = text[:self.size]

        # Cutting a UTF-8 byte string at an arbitrary offset can split a multi-byte
        # sequence, so trailing bytes are dropped until the buffer decodes cleanly.
        # Unicode strings are sliced by characters and need no such repair.
        if isinstance(buff, bytes):
            while len(buff) > 0:
                try:
                    buff.decode('utf-8')
                    break
                except UnicodeError:
                    buff = buff[:-1]

        if log is not None:
            log.debug("buffer len = %s was trancated to len = %s used limit = %s", str(len(text)), str(len(buff)),
                      str(self.size))

        return buff

    # # set language field in tags
    #
    # @param text - text buffer
    # @param tagName - tag name
    # @param fieldName - field name
    # @param response - scraper result instance
    # @param log - logger instance
    # @return - None
    def __setLangField(self, text, tagName, fieldName, response, log=None):
        if text is None:
            return

        # truncate buffer by limit size
        text = self.__truncateBuffer(text, log)

        # detect language
        lang = ScraperLangDetector.langDetect(text, False, log)
        if log is not None:
            log.debug("for '%s' was detected '%s'", str(tagName), str(lang))

        # the language can be stored only inside a dictionary-like tag value
        if lang is not None and isinstance(response.tags[tagName], dict):
            lang = self.__langMapping(lang)
            response.tags[tagName][fieldName] = lang
            self.detectedLangs[tagName] = lang

    # # language mapping
    #
    # @param lang - language for mapping
    # @return mapped language: the key of the first map entry matching `lang`, otherwise the key
    #         of the entry holding the default marker, otherwise `lang` itself
    def __langMapping(self, lang):
        for key, value in self.maps.items():
            if self.__isExistValue(value, lang):
                return key

        # no direct match: fall back to the entry that holds the default marker
        default = None
        for key, value in self.maps.items():
            if self.__isExistValue(value, self.DEFAULT_VALUE_LANG_MAPPING):
                default = key

        return default if default is not None else lang

    # # check exist value
    #
    # @param src - source list of strings for search
    # @param val - value for search
    # @return True if `val` is contained in at least one element of `src` or False otherwise
    def __isExistValue(self, src, val):
        return any(val in s for s in src)

    # # main processing
    #
    # @param response - scraper result instance
    # @param log - logger instance
    # @return - None
    def process(self, response, log=None):
        if response is None:
            return

        # use all tags
        if self.__tagsOptionHas(self.TAGS_EXTENDED_VALUE_ALL):
            for tagName in response.tags:
                localTextValue = self.__retTagsText(tagName, response)
                self.__setLangField(localTextValue, tagName, self.SCRAPER_RESULT_TAG_OPTION_LANG, response, log)

        # use summary tags
        elif self.__tagsOptionHas(self.TAGS_EXTENDED_VALUE_SUMMARY):
            # Collect the text of every tag first; the summary language is detected
            # on the concatenation of all of them, not on a single tag.
            textParts = []
            for tagName in response.tags:
                localTextValue = self.__retTagsText(tagName, response)
                if localTextValue is not None:
                    textParts.append(localTextValue)
            localTextResult = ' '.join(textParts).strip()

            for tagName in response.tags:
                self.__setLangField(localTextResult, tagName, self.SCRAPER_RESULT_TAG_OPTION_SUMMARY_LANG, response,
                                    log)

        # use list tags
        elif isinstance(self.tagsList, list):
            for tagName in self.tagsList:
                localTextValue = self.__retTagsText(tagName, response)
                self.__setLangField(localTextValue, tagName, self.SCRAPER_RESULT_TAG_OPTION_LANG, response, log)

    # # get detected lang for tags
    #
    # @param - None
    # @return dictionary lang tags and their values
    def getLangTags(self):
        langTagsDict = {}

        for tagName, lang in self.detectedLangs.items():
            langTagsDict[self.__makeTagName(tagName)] = lang

        return langTagsDict

    # # get lang tags names
    #
    # @param - None
    # @return list of the lang tags names
    def getLangTagsNames(self):
        if self.__tagsOptionHas(self.TAGS_EXTENDED_VALUE_ALL) or \
           self.__tagsOptionHas(self.TAGS_EXTENDED_VALUE_SUMMARY):
            # special markers stand for the predefined set of tag names
            tagsList = self.DEFAULT_VALUE_TAGS_NAMES
        elif isinstance(self.tagsList, list):
            tagsList = self.tagsList
        else:
            tagsList = []

        return [self.__makeTagName(tagName) for tagName in tagsList]

    # # get summary lang
    #
    # @param response - scraper result instance
    # @param log - logger instance
    # @return summary lang value as string; DEFAULT_VALUE_SUMMARY_LANG if no tag carries one
    def getSummaryLang(self, response, log=None):
        summaryLang = self.DEFAULT_VALUE_SUMMARY_LANG

        if response is not None:
            for tagName, tagValue in response.tags.items():
                if isinstance(tagValue, dict) and self.SCRAPER_RESULT_TAG_OPTION_SUMMARY_LANG in tagValue:
                    summaryLang = tagValue[self.SCRAPER_RESULT_TAG_OPTION_SUMMARY_LANG]
                    if log is not None:
                        log.debug("Summary lang '%s' was extracted from field '%s'", str(summaryLang), str(tagName))
                    # the first tag carrying a summary language wins
                    break

        return summaryLang
def __setLangField(self, text, tagName, fieldName, response, log=None)
def langDetect(incomeBuf, convertToFullName=True, log=None)