HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
dc_processor.ScraperLangDetector.ScraperLangDetector Class Reference
Inheritance diagram for dc_processor.ScraperLangDetector.ScraperLangDetector:
Collaboration diagram for dc_processor.ScraperLangDetector.ScraperLangDetector:

Public Member Functions

def __init__ (self, scraperLangDetectProperty)
 
def process (self, response, log=None)
 
def getLangTags (self)
 
def getLangTagsNames (self)
 
def getSummaryLang (self, response, log=None)
 

Static Public Member Functions

def langDetect (incomeBuf, convertToFullName=True, log=None)
 

Public Attributes

 prefix
 
 suffix
 
 tagsList
 
 maps
 
 size
 
 detectedLangs
 

Static Public Attributes

string MSG_ERROR_LANG_DETECT = "Language detection failed. Error: %s"
 
string PROPERTY_OPTION_PREFIX = "prefix"
 
string PROPERTY_OPTION_SUFFIX = "suffix"
 
string PROPERTY_OPTION_TAGS = "tags"
 
string PROPERTY_OPTION_MAPS = "maps"
 
string PROPERTY_OPTION_SIZE = "size"
 
string DEFAULT_VALUE_OPTION_PREFIX = ""
 
string DEFAULT_VALUE_OPTION_SUFFIX = "_lang"
 
list DEFAULT_VALUE_OPTION_TAGS = []
 
dictionary DEFAULT_VALUE_OPTION_MAPS
 
int DEFAULT_VALUE_OPTION_SIZE = 1024
 
string DEFAULT_VALUE_SUMMARY_LANG = "en"
 
string DEFAULT_VALUE_LANG_MAPPING = '*'
 
list DEFAULT_VALUE_TAGS_NAMES
 
string TAGS_EXTENDED_VALUE_ALL = "*"
 
string TAGS_EXTENDED_VALUE_SUMMARY = "&"
 
string SCRAPER_RESULT_TAG_OPTION_DATA = "data"
 
string SCRAPER_RESULT_TAG_OPTION_LANG = "lang"
 
string SCRAPER_RESULT_TAG_OPTION_SUMMARY_LANG = "summary_lang"
 

Private Member Functions

def __makeTagName (self, tagName)
 
def __retTagsText (self, tagName, response)
 
def __truncateBuffer (self, text, log=None)
 
def __setLangField (self, text, tagName, fieldName, response, log=None)
 
def __langMapping (self, lang)
 
def __isExistValue (self, src, val)
 

Detailed Description

Definition at line 20 of file ScraperLangDetector.py.

Constructor & Destructor Documentation

◆ __init__()

def dc_processor.ScraperLangDetector.ScraperLangDetector.__init__ (   self,
  scraperLangDetectProperty 
)

Definition at line 56 of file ScraperLangDetector.py.

# Constructor: initialise every option from the class-level DEFAULT_VALUE_OPTION_*
# constants, then override each option that is present in scraperLangDetectProperty.
#
# scraperLangDetectProperty -- options container. It is consulted only when it is a
#   dict; any other type leaves all defaults in place. Recognised keys are the
#   PROPERTY_OPTION_* names ("prefix", "suffix", "tags", "maps", "size").
#
# NOTE(review): tagsList and maps initially reference the class-level default objects
#   rather than copies, so an in-place mutation made through one instance would be seen
#   by every other instance that still uses the defaults.
# NOTE(review): only "size" is converted (with int(), which raises on a non-numeric
#   value); the other options are stored as supplied, without type checks.
# NOTE(review): nesting is inferred -- this listing does not preserve indentation.
56  def __init__(self, scraperLangDetectProperty):
57  self.prefix = self.DEFAULT_VALUE_OPTION_PREFIX
58  self.suffix = self.DEFAULT_VALUE_OPTION_SUFFIX
59  self.tagsList = self.DEFAULT_VALUE_OPTION_TAGS
60  self.maps = self.DEFAULT_VALUE_OPTION_MAPS
61  self.size = self.DEFAULT_VALUE_OPTION_SIZE
62  self.detectedLangs = {}  # tagName -> mapped language, filled in by __setLangField
63 
64  if isinstance(scraperLangDetectProperty, dict):
65  # prefix: prepended to a tag name by __makeTagName
66  if self.PROPERTY_OPTION_PREFIX in scraperLangDetectProperty:
67  self.prefix = scraperLangDetectProperty[self.PROPERTY_OPTION_PREFIX]
68 
69  # suffix: appended to a tag name by __makeTagName
70  if self.PROPERTY_OPTION_SUFFIX in scraperLangDetectProperty:
71  self.suffix = scraperLangDetectProperty[self.PROPERTY_OPTION_SUFFIX]
72 
73  # tags: a list of tag names, or one of the wildcard values '*' / '&' (see process)
74  if self.PROPERTY_OPTION_TAGS in scraperLangDetectProperty:
75  self.tagsList = scraperLangDetectProperty[self.PROPERTY_OPTION_TAGS]
76 
77  # maps: result language -> list of detected codes it stands for (see __langMapping)
78  if self.PROPERTY_OPTION_MAPS in scraperLangDetectProperty:
79  self.maps = scraperLangDetectProperty[self.PROPERTY_OPTION_MAPS]
80 
81  # size: length limit applied by __truncateBuffer, converted with int()
82  if self.PROPERTY_OPTION_SIZE in scraperLangDetectProperty:
83  self.size = int(scraperLangDetectProperty[self.PROPERTY_OPTION_SIZE])
84 
85 
def __init__(self)
constructor
Definition: UIDGenerator.py:19

Member Function Documentation

◆ __isExistValue()

def dc_processor.ScraperLangDetector.ScraperLangDetector.__isExistValue (   self,
  src,
  val 
)
private

Definition at line 222 of file ScraperLangDetector.py.

# Return True when at least one element s of src satisfies `val in s`.
#
# src -- iterable of containers; __langMapping passes one value of self.maps
# val -- value looked up inside each element
#
# NOTE(review): when the elements are strings, `val in s` is a substring test rather
#   than an equality test (e.g. "e" would match "en") -- confirm partial matches are
#   intended.
222  def __isExistValue(self, src, val):
223  return len([s for s in src if val in s]) > 0
224 
225 
Here is the caller graph for this function:

◆ __langMapping()

def dc_processor.ScraperLangDetector.ScraperLangDetector.__langMapping (   self,
  lang 
)
private

Definition at line 194 of file ScraperLangDetector.py.

# Translate a detected language code into a key of self.maps.
#
# Every value of self.maps is tested with __isExistValue; the key of the first value
# that matches `lang` is returned. When nothing matches, a key whose value matches
# DEFAULT_VALUE_LANG_MAPPING ('*') is returned instead; when there is no such key
# either, `lang` is returned unchanged.
#
# NOTE(review): the fallback loop has no `break`, so if several entries contain '*'
#   the last one visited wins, and the visiting order is the dict's iteration order.
# NOTE(review): nesting is inferred -- this listing does not preserve indentation; the
#   description assumes the `default` handling sits inside `if not isExist`.
194  def __langMapping(self, lang):
195  # result; stays equal to the input when no mapping applies
196  ret = lang
197 
198  isExist = False
199  for key, value in self.maps.items():
200  isExist = self.__isExistValue(value, lang)
201  if isExist:
202  ret = key
203  break
204 
205  if not isExist:
206  default = None
207  for key, value in self.maps.items():
208  if self.__isExistValue(value, self.DEFAULT_VALUE_LANG_MAPPING):
209  default = key
210 
211  if default is not None:
212  ret = default
213 
214  return ret
215 
216 
Here is the call graph for this function:
Here is the caller graph for this function:

◆ __makeTagName()

def dc_processor.ScraperLangDetector.ScraperLangDetector.__makeTagName (   self,
  tagName 
)
private

Definition at line 90 of file ScraperLangDetector.py.

# Build the name of the language tag derived from tagName by concatenating
# self.prefix, tagName and self.suffix (all three are combined with `+`).
90  def __makeTagName(self, tagName):
91  return self.prefix + tagName + self.suffix
92 
93 
Here is the caller graph for this function:

◆ __retTagsText()

def dc_processor.ScraperLangDetector.ScraperLangDetector.__retTagsText (   self,
  tagName,
  response 
)
private

Definition at line 121 of file ScraperLangDetector.py.

# Extract the text of response.tags[tagName] that language detection should run on.
#
# Returns:
#   - the tag value itself when it is a basestring;
#   - the "data" entry when the tag is a dict and that entry is a basestring;
#   - the "data" elements, each followed by ' ', concatenated and stripped, when that
#     entry is a list;
#   - None when response is None, the tag is absent, or none of the shapes above applies.
#
# NOTE(review): list elements are appended to a string with `+=`, so they are assumed to
#   be strings themselves -- verify against the producers of response.tags.
# NOTE(review): nesting is inferred -- this listing does not preserve indentation.
121  def __retTagsText(self, tagName, response):
122  # result; None means "no usable text"
123  ret = None
124 
125  if response is not None and tagName in response.tags:
126  if isinstance(response.tags[tagName], basestring):
127  ret = response.tags[tagName]
128 
129  elif isinstance(response.tags[tagName], dict) and \
130  ScraperLangDetector.SCRAPER_RESULT_TAG_OPTION_DATA in response.tags[tagName]:
131  if isinstance(response.tags[tagName][ScraperLangDetector.SCRAPER_RESULT_TAG_OPTION_DATA], basestring):
132  ret = response.tags[tagName][ScraperLangDetector.SCRAPER_RESULT_TAG_OPTION_DATA]
133 
134  elif isinstance(response.tags[tagName][ScraperLangDetector.SCRAPER_RESULT_TAG_OPTION_DATA], list):
135  ret = ""
136  for elem in response.tags[tagName][ScraperLangDetector.SCRAPER_RESULT_TAG_OPTION_DATA]:
137  ret += elem
138  ret += ' '
139  ret = ret.strip()
140 
141  return ret
142 
143 
Here is the caller graph for this function:

◆ __setLangField()

def dc_processor.ScraperLangDetector.ScraperLangDetector.__setLangField (   self,
  text,
  tagName,
  fieldName,
  response,
  log = None 
)
private

Definition at line 173 of file ScraperLangDetector.py.

# Detect the language of `text` and record it for tagName.
#
# text      -- text to analyse; nothing is done when it is None
# tagName   -- key into response.tags
# fieldName -- key under which the language is stored inside the tag's dict
# response  -- object exposing a `tags` mapping
# log       -- optional logger; only debug() is called on it here
#
# The text is shortened by __truncateBuffer and passed to langDetect. When detection
# returns a value and response.tags[tagName] is a dict, the value is translated by
# __langMapping and written to response.tags[tagName][fieldName] and to
# self.detectedLangs[tagName]. Nothing is returned.
#
# NOTE(review): nesting is inferred -- this listing does not preserve indentation; the
#   description assumes both assignments sit inside the final `if`.
173  def __setLangField(self, text, tagName, fieldName, response, log=None):
174 
175  if text is not None:
176  # truncate buffer by limit size
177  text = self.__truncateBuffer(text, log)
178 
179  # detect language
180  lang = ScraperLangDetector.langDetect(text, False, log)
181  if log is not None:
182  log.debug("for '%s' was detected '%s'", str(tagName), str(lang))
183 
184  if lang is not None and isinstance(response.tags[tagName], dict):
185  lang = self.__langMapping(lang)
186  response.tags[tagName][fieldName] = lang
187  self.detectedLangs[tagName] = lang
188 
189 
Here is the call graph for this function:
Here is the caller graph for this function:

◆ __truncateBuffer()

def dc_processor.ScraperLangDetector.ScraperLangDetector.__truncateBuffer (   self,
  text,
  log = None 
)
private

Definition at line 149 of file ScraperLangDetector.py.

# Limit `text` to at most self.size items, then drop trailing items one at a time until
# the remainder can be decoded as UTF-8. The returned buffer may be shorter than
# self.size and may be empty.
#
# text -- buffer to shorten; it must support len(), slicing and decode()
# log  -- optional logger; the original, resulting and limit lengths are reported via debug()
#
# NOTE(review): presumably `text` is a Python 2 byte string and the loop exists to
#   discard a multi-byte sequence split by the cut -- confirm.
# NOTE(review): `except Exception` also catches a missing decode() method; in that case
#   every iteration fails and the buffer is shrunk all the way to empty.
149  def __truncateBuffer(self, text, log=None):
150  # result: text cut to the size limit
151  buff = text if len(text) <= self.size else text[:self.size]
152  while len(buff) > 0:
153  try:
154  buff.decode('utf-8')
155  break
156  except Exception:
157  buff = buff[:-1]
158 
159  if log is not None:
160  log.debug("buffer len = %s was trancated to len = %s used limit = %s", str(len(text)), str(len(buff)), str(self.size))
161 
162  return buff
163 
164 
Here is the caller graph for this function:

◆ getLangTags()

def dc_processor.ScraperLangDetector.ScraperLangDetector.getLangTags (   self)

Definition at line 266 of file ScraperLangDetector.py.

# Return a new dict that maps each generated language tag name (see __makeTagName) to
# the language recorded for the corresponding tag in self.detectedLangs.
266  def getLangTags(self):
267  # result: generated tag name -> language
268  langTagsDict = {}
269 
270  for tagName, lang in self.detectedLangs.items():
271  langTagsDict[self.__makeTagName(tagName)] = lang
272 
273  return langTagsDict
274 
275 
Here is the call graph for this function:

◆ getLangTagsNames()

def dc_processor.ScraperLangDetector.ScraperLangDetector.getLangTagsNames (   self)

Definition at line 280 of file ScraperLangDetector.py.

# Return the list of generated language tag names (see __makeTagName) for the
# configured tags.
#
# When self.tagsList is, or contains, one of the wildcard values '*'
# (TAGS_EXTENDED_VALUE_ALL) or '&' (TAGS_EXTENDED_VALUE_SUMMARY), the names are built
# from DEFAULT_VALUE_TAGS_NAMES. When it is a list without wildcards, they are built from
# its items. For any other value the result is an empty list.
#
# NOTE(review): nesting is inferred -- this listing does not preserve indentation.
280  def getLangTagsNames(self):
281  # result: generated tag names
282  langTagsNames = []
283 
284  tagsList = []
285 
286  if isinstance(self.tagsList, basestring) and self.tagsList == self.TAGS_EXTENDED_VALUE_ALL or \
287  isinstance(self.tagsList, list) and self.TAGS_EXTENDED_VALUE_ALL in self.tagsList or \
288  isinstance(self.tagsList, basestring) and self.tagsList == self.TAGS_EXTENDED_VALUE_SUMMARY or \
289  isinstance(self.tagsList, list) and self.TAGS_EXTENDED_VALUE_SUMMARY in self.tagsList:
290  tagsList = self.DEFAULT_VALUE_TAGS_NAMES
291 
292  elif isinstance(self.tagsList, list):
293  tagsList = self.tagsList
294 
295  for tagName in tagsList:
296  langTagsNames.append(self.__makeTagName(tagName))
297 
298  return langTagsNames
299 
300 
Here is the call graph for this function:

◆ getSummaryLang()

def dc_processor.ScraperLangDetector.ScraperLangDetector.getSummaryLang (   self,
  response,
  log = None 
)

Definition at line 306 of file ScraperLangDetector.py.

# Return the summary language stored in the tags of `response`.
#
# response -- object exposing a `tags` mapping, or None
# log      -- optional logger; the extracted value and its tag name are reported via debug()
#
# The tag values are scanned and the "summary_lang" entry of a dict value that has one
# is returned. DEFAULT_VALUE_SUMMARY_LANG ("en") is returned when response is None or no
# tag carries that entry.
#
# NOTE(review): nesting is inferred -- this listing does not preserve indentation;
#   presumably the `break` belongs to the matching branch, so the scan stops at the first
#   tag that has the entry.
306  def getSummaryLang(self, response, log=None):
307  # result; defaults to DEFAULT_VALUE_SUMMARY_LANG
308  summaryLang = self.DEFAULT_VALUE_SUMMARY_LANG
309 
310  if response is not None:
311  for tagName, tagValue in response.tags.items():
312  if isinstance(tagValue, dict) and self.SCRAPER_RESULT_TAG_OPTION_SUMMARY_LANG in tagValue:
313  summaryLang = tagValue[self.SCRAPER_RESULT_TAG_OPTION_SUMMARY_LANG]
314  if log is not None:
315  log.debug("Summary lang '%s' was extracted from field '%s'", str(summaryLang), str(tagName))
316  break
317 
318  return summaryLang
319 

◆ langDetect()

def dc_processor.ScraperLangDetector.ScraperLangDetector.langDetect (   incomeBuf,
  convertToFullName = True,
  log = None 
)
static

Definition at line 101 of file ScraperLangDetector.py.

# Static helper: detect the language of incomeBuf using detect() from the langdetect
# package, which is imported lazily inside the try block.
#
# incomeBuf         -- text to analyse; it is decoded with 'utf-8' before detection
# convertToFullName -- accepted but not used anywhere in the body
# log               -- optional logger; failures are reported via error() and debug()
#
# Returns the detected code with every '-' replaced by ',', or None when incomeBuf is
# None or "", or when anything inside the try block raises (the exception is logged when
# a logger is given and is never propagated).
#
# NOTE(review): `except Exception, err` is Python 2-only syntax.
# NOTE(review): Utils is referenced here but its import is not visible in this listing.
# NOTE(review): nesting is inferred -- this listing does not preserve indentation.
101  def langDetect(incomeBuf, convertToFullName=True, log=None):
102  ret = None
103 
104  if incomeBuf is not None and incomeBuf != "":
105  try:
106  from langdetect import detect
107  ret = detect(incomeBuf.decode('utf-8')).replace('-', ',')
108  except Exception, err:
109  if log is not None:
110  log.error(ScraperLangDetector.MSG_ERROR_LANG_DETECT, str(err))
111  log.debug(Utils.getTracebackInfo())
112 
113  return ret
114 
115 

◆ process()

def dc_processor.ScraperLangDetector.ScraperLangDetector.process (   self,
  response,
  log = None 
)

Definition at line 231 of file ScraperLangDetector.py.

# Detect languages for the tags of `response` according to self.tagsList. Nothing is
# done when response is None; results are written by __setLangField and nothing is
# returned.
#
#   '*' (as the value or as a list item) -- every tag in response.tags is analysed on its
#        own text and receives a "lang" field;
#   '&' (as the value or as a list item) -- text is collected from the tags, analysed
#        once per tag by __setLangField, and stored as "summary_lang";
#   any other list                       -- only the listed tags are analysed, each
#        receiving a "lang" field.
#
# response -- object exposing a `tags` mapping, or None
# log      -- optional logger, passed through to __setLangField
#
# NOTE(review): in the summary branch localTextResult is reset to "" at the top of every
#   iteration of the collecting loop, so only the last tag's text reaches detection.
#   Moving the reset before the loop looks like the intent -- confirm.
# NOTE(review): nesting is inferred -- this listing does not preserve indentation.
231  def process(self, response, log=None):
232  if response is not None:
233  # use all tags
234  if isinstance(self.tagsList, basestring) and self.tagsList == self.TAGS_EXTENDED_VALUE_ALL or \
235  isinstance(self.tagsList, list) and self.TAGS_EXTENDED_VALUE_ALL in self.tagsList:
236  for tagName in response.tags:
237  localTextValue = self.__retTagsText(tagName, response)
238  self.__setLangField(localTextValue, tagName, self.SCRAPER_RESULT_TAG_OPTION_LANG, response, log)
239 
240  # use summary tags
241  elif isinstance(self.tagsList, basestring) and self.tagsList == self.TAGS_EXTENDED_VALUE_SUMMARY or \
242  isinstance(self.tagsList, list) and self.TAGS_EXTENDED_VALUE_SUMMARY in self.tagsList:
243  localTextResult = None
244  for tagName in response.tags:
245  localTextResult = ""
246  localTextValue = self.__retTagsText(tagName, response)
247  if localTextValue is not None:
248  localTextResult += localTextValue
249  localTextResult += ' '
250  localTextResult = localTextResult.strip()
251 
252  for tagName in response.tags:
253  self.__setLangField(localTextResult, tagName, self.SCRAPER_RESULT_TAG_OPTION_SUMMARY_LANG, response, log)
254 
255  # use list tags
256  elif isinstance(self.tagsList, list):
257  for tagName in self.tagsList:
258  localTextValue = self.__retTagsText(tagName, response)
259  self.__setLangField(localTextValue, tagName, self.SCRAPER_RESULT_TAG_OPTION_LANG, response, log)
260 
261 
Here is the call graph for this function:
Here is the caller graph for this function:

Member Data Documentation

◆ DEFAULT_VALUE_LANG_MAPPING

string dc_processor.ScraperLangDetector.ScraperLangDetector.DEFAULT_VALUE_LANG_MAPPING = '*'
static

Definition at line 43 of file ScraperLangDetector.py.

◆ DEFAULT_VALUE_OPTION_MAPS

dictionary dc_processor.ScraperLangDetector.ScraperLangDetector.DEFAULT_VALUE_OPTION_MAPS
static
Initial value:
= { "en": [ "fr", "nl", "ro", "af", "ca", "it", "da", "tl", "et", "cy", "sv", "id", "es", "*" ], \
"ja": [ "ja", "zh", "za" ], \
"ru": [ "ru", "uk" ], \
"pl": [ "pl" ], \
"de": [ "de" ] }

Definition at line 35 of file ScraperLangDetector.py.

◆ DEFAULT_VALUE_OPTION_PREFIX

string dc_processor.ScraperLangDetector.ScraperLangDetector.DEFAULT_VALUE_OPTION_PREFIX = ""
static

Definition at line 32 of file ScraperLangDetector.py.

◆ DEFAULT_VALUE_OPTION_SIZE

int dc_processor.ScraperLangDetector.ScraperLangDetector.DEFAULT_VALUE_OPTION_SIZE = 1024
static

Definition at line 40 of file ScraperLangDetector.py.

◆ DEFAULT_VALUE_OPTION_SUFFIX

string dc_processor.ScraperLangDetector.ScraperLangDetector.DEFAULT_VALUE_OPTION_SUFFIX = "_lang"
static

Definition at line 33 of file ScraperLangDetector.py.

◆ DEFAULT_VALUE_OPTION_TAGS

list dc_processor.ScraperLangDetector.ScraperLangDetector.DEFAULT_VALUE_OPTION_TAGS = []
static

Definition at line 34 of file ScraperLangDetector.py.

◆ DEFAULT_VALUE_SUMMARY_LANG

string dc_processor.ScraperLangDetector.ScraperLangDetector.DEFAULT_VALUE_SUMMARY_LANG = "en"
static

Definition at line 41 of file ScraperLangDetector.py.

◆ DEFAULT_VALUE_TAGS_NAMES

list dc_processor.ScraperLangDetector.ScraperLangDetector.DEFAULT_VALUE_TAGS_NAMES
static
Initial value:
= [CONSTS.TAG_MEDIA, CONSTS.TAG_TITLE, CONSTS.TAG_LINK, CONSTS.TAG_DESCRIPTION, CONSTS.TAG_PUB_DATE, CONSTS.TAG_DC_DATE, \
CONSTS.TAG_AUTHOR, CONSTS.TAG_CONTENT_UTF8_ENCODED, CONSTS.TAG_KEYWORDS]

Definition at line 45 of file ScraperLangDetector.py.

◆ detectedLangs

dc_processor.ScraperLangDetector.ScraperLangDetector.detectedLangs

Definition at line 62 of file ScraperLangDetector.py.

◆ maps

dc_processor.ScraperLangDetector.ScraperLangDetector.maps

Definition at line 60 of file ScraperLangDetector.py.

◆ MSG_ERROR_LANG_DETECT

string dc_processor.ScraperLangDetector.ScraperLangDetector.MSG_ERROR_LANG_DETECT = "Language detection failed. Error: %s"
static

Definition at line 22 of file ScraperLangDetector.py.

◆ prefix

dc_processor.ScraperLangDetector.ScraperLangDetector.prefix

Definition at line 57 of file ScraperLangDetector.py.

◆ PROPERTY_OPTION_MAPS

string dc_processor.ScraperLangDetector.ScraperLangDetector.PROPERTY_OPTION_MAPS = "maps"
static

Definition at line 29 of file ScraperLangDetector.py.

◆ PROPERTY_OPTION_PREFIX

string dc_processor.ScraperLangDetector.ScraperLangDetector.PROPERTY_OPTION_PREFIX = "prefix"
static

Definition at line 26 of file ScraperLangDetector.py.

◆ PROPERTY_OPTION_SIZE

string dc_processor.ScraperLangDetector.ScraperLangDetector.PROPERTY_OPTION_SIZE = "size"
static

Definition at line 30 of file ScraperLangDetector.py.

◆ PROPERTY_OPTION_SUFFIX

string dc_processor.ScraperLangDetector.ScraperLangDetector.PROPERTY_OPTION_SUFFIX = "suffix"
static

Definition at line 27 of file ScraperLangDetector.py.

◆ PROPERTY_OPTION_TAGS

string dc_processor.ScraperLangDetector.ScraperLangDetector.PROPERTY_OPTION_TAGS = "tags"
static

Definition at line 28 of file ScraperLangDetector.py.

◆ SCRAPER_RESULT_TAG_OPTION_DATA

string dc_processor.ScraperLangDetector.ScraperLangDetector.SCRAPER_RESULT_TAG_OPTION_DATA = "data"
static

Definition at line 51 of file ScraperLangDetector.py.

◆ SCRAPER_RESULT_TAG_OPTION_LANG

string dc_processor.ScraperLangDetector.ScraperLangDetector.SCRAPER_RESULT_TAG_OPTION_LANG = "lang"
static

Definition at line 52 of file ScraperLangDetector.py.

◆ SCRAPER_RESULT_TAG_OPTION_SUMMARY_LANG

string dc_processor.ScraperLangDetector.ScraperLangDetector.SCRAPER_RESULT_TAG_OPTION_SUMMARY_LANG = "summary_lang"
static

Definition at line 53 of file ScraperLangDetector.py.

◆ size

dc_processor.ScraperLangDetector.ScraperLangDetector.size

Definition at line 61 of file ScraperLangDetector.py.

◆ suffix

dc_processor.ScraperLangDetector.ScraperLangDetector.suffix

Definition at line 58 of file ScraperLangDetector.py.

◆ TAGS_EXTENDED_VALUE_ALL

string dc_processor.ScraperLangDetector.ScraperLangDetector.TAGS_EXTENDED_VALUE_ALL = "*"
static

Definition at line 48 of file ScraperLangDetector.py.

◆ TAGS_EXTENDED_VALUE_SUMMARY

string dc_processor.ScraperLangDetector.ScraperLangDetector.TAGS_EXTENDED_VALUE_SUMMARY = "&"
static

Definition at line 49 of file ScraperLangDetector.py.

◆ tagsList

dc_processor.ScraperLangDetector.ScraperLangDetector.tagsList

Definition at line 59 of file ScraperLangDetector.py.


The documentation for this class was generated from the following file: