HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
dc_processor.AuthorType.AuthorType Class Reference
Inheritance diagram for dc_processor.AuthorType.AuthorType:
Collaboration diagram for dc_processor.AuthorType.AuthorType:

Public Member Functions

def __init__ (self, confProp=None, procProp=None, dataString=None, logger=None)
 

Static Public Member Functions

def getDefaultProperties ()
 
def mergeProperties (confProp, procProp)
 
def checkDataStringLimits (propDict, dataString, logger=None)
 
def isGoodWord (word, minAllowedWordLength, logger=None)
 
def removeNoneAlpha (word)
 
def getPairNames (wordsList, minAllowedWordLength, cleanNoneAlpha=False, logger=None)
 
def extractAuthorName (wordsList, minAllowedWordLength, maxAllowedWordLength, logger=None)
 
def makeParsing (propDict, dataString, logger=None)
 
def parse (confProp, procProp, dataString, logger=None)
 

Public Attributes

 author
 

Static Public Attributes

string MAIN_TAG_NAME = 'author'
 
string MIN_WORDS_NAME = 'min_words'
 
string MAX_WORDS_NAME = 'max_words'
 
string MIN_BYTES_NAME = 'min_bytes'
 
string MAX_BYTES_NAME = 'max_bytes'
 
string MAX_CHARS_WORD_NAME = 'max_chars_word'
 
string CLEAN_NONE_ALPHA_NAME = 'clean_none_alpha'
 
string VALUE_NAME = 'value'
 
string MISMATCH_NAME = 'mismatch'
 
string UNDETECTED_NAME = 'undetected'
 
string MISMATCH_VALUE_EMPTY = 'empty'
 
string MISMATCH_VALUE_IGNORE = 'ignore'
 
string MISMATCH_VALUE_VALUE = 'value'
 
string MISMATCH_VALUE_PARSE = 'parse'
 
string UNDETECTED_VALUE_EMPTY = 'empty'
 
string UNDETECTED_VALUE_IGNORE = 'ignore'
 
string UNDETECTED_VALUE_VALUE = 'value'
 
int MIN_WORDS_DEFAULT_VALUE = 1
 
int MAX_WORDS_DEFAULT_VALUE = 8
 
int MIN_BYTES_DEFAULT_VALUE = 3
 
int MAX_BYTES_DEFAULT_VALUE = 32
 
int MAX_CHARS_WORD_DEFAULT_VALUE = MAX_BYTES_DEFAULT_VALUE
 
int CLEAN_NONE_ALPHA_DEFAULT_VALUE = 1
 
string MISMATCH_DEFAULT_VALUE = MISMATCH_VALUE_EMPTY
 
string UNDETECTED_DEFAULT_VALUE = UNDETECTED_VALUE_EMPTY
 
string ERROR_DATA_STRING_TYPE = 'Data string is not string.'
 
string ERROR_CONFIG_PROPERTY_TYPE = 'Config property type is wrong'
 
string ERROR_PROCESSOR_PROPERTY_TYPE = 'Processor property type is wrong'
 
string ERROR_MAIN_TAG_NAME = "Main tag name '" + str(MAIN_TAG_NAME) + "' not found"
 

Detailed Description

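Extracts an author name from a raw text string. The constructor calls the static parse() method, which merges the configuration and processor properties over the built-in defaults, checks the input against the configured byte and word limits, and then searches the text for a name. The result is stored in the author attribute.

Definition at line 20 of file AuthorType.py.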

Constructor & Destructor Documentation

◆ __init__()

def dc_processor.AuthorType.AuthorType.__init__ (   self,
  confProp = None,
  procProp = None,
  dataString = None,
  logger = None 
)

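Calls AuthorType.parse(confProp, procProp, dataString, logger) and stores the returned value in the author attribute.

Definition at line 66 of file AuthorType.py.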

66  def __init__(self, confProp=None, procProp=None, dataString=None, logger=None):
67  self.author = AuthorType.parse(confProp, procProp, dataString, logger)
68 
69 
def __init__(self)
constructor
Definition: UIDGenerator.py:19

Member Function Documentation

◆ checkDataStringLimits()

def dc_processor.AuthorType.AuthorType.checkDataStringLimits (   propDict,
  dataString,
  logger = None 
)
static

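Returns True when len(dataString) lies between the min_bytes and max_bytes properties (both inclusive) and the number of counted words lies between min_words and max_words (both inclusive); otherwise returns False. A whitespace-separated word is counted only if its length is at least min_bytes, so min_bytes serves both as the minimum total length and as the minimum length of a counted word. All four properties are converted with int() before comparison.

Definition at line 145 of file AuthorType.py.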

145  def checkDataStringLimits(propDict, dataString, logger=None):
146  # variable for result
147  ret = False
148 
149  bytesCount = len(dataString)
150  wordsCount = 0
151  for word in dataString.split():
152  if len(word) >= int(propDict[AuthorType.MIN_BYTES_NAME]):
153  wordsCount += 1
154 
155  if logger is not None:
156  logger.debug('bytesCount = ' + str(bytesCount))
157  logger.debug('wordsCount = ' + str(wordsCount))
158 
159  # check limits
160  if bytesCount >= int(propDict[AuthorType.MIN_BYTES_NAME]) and \
161  bytesCount <= int(propDict[AuthorType.MAX_BYTES_NAME]) and \
162  wordsCount >= int(propDict[AuthorType.MIN_WORDS_NAME]) and \
163  wordsCount <= int(propDict[AuthorType.MAX_WORDS_NAME]):
164  ret = True
165 
166  return ret
167 
168 

◆ extractAuthorName()

def dc_processor.AuthorType.AuthorType.extractAuthorName (   wordsList,
  minAllowedWordLength,
  maxAllowedWordLength,
  logger = None 
)
static

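Scans wordsList in order and returns the first author-name candidate found, or None if no word yields one. For each word the following checks are tried in sequence:

1. The word passes isGoodWord() and is not equal to the first word of the list: the word is returned with its non-alphabetic characters replaced by spaces and the result stripped.
2. The word contains '@': the part before the '@' is returned if its length is between minAllowedWordLength and maxAllowedWordLength (inclusive).
3. The word is longer than minAllowedWordLength and contains an upper-case character directly preceded by a letter: the word is split at that position and the two halves, joined with a space, are returned if both pass isGoodWord().
4. The word contains '_': the part before the first underscore is returned, followed by a space and the first alphabetic run of the part after that underscore.

Note: in step 4 the expression removeNoneAlpha(wd[1]).split()[0] raises IndexError when the part after the underscore contains no alphabetic characters (for example a word ending in '_').

Definition at line 264 of file AuthorType.py.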

264  def extractAuthorName(wordsList, minAllowedWordLength, maxAllowedWordLength, logger=None):
265  # variable for result
266  ret = None
267 
268  for word in wordsList:
269  # first word with upper title
270  if AuthorType.isGoodWord(word, int(minAllowedWordLength)) and word != wordsList[0]:
271  ret = (AuthorType.removeNoneAlpha(word).strip())
272  if logger is not None:
273  logger.debug('Found first word with upper title: ' + str(ret))
274  break
275 
276  # extract from email
277  pos = word.find('@')
278  if pos > -1:
279  AuthorName = word[:pos]
280  if len(AuthorName) >= minAllowedWordLength and len(AuthorName) <= maxAllowedWordLength:
281  ret = AuthorName
282  if logger is not None:
283  logger.debug('Found author name in email: ' + str(ret))
284  break
285  else:
286  if logger is not None:
287  logger.debug("Candidate '" + str(AuthorName) + "' for extract from email didn't pass limits")
288 
289  # search two words was concatenated
290  if len(word) > minAllowedWordLength:
291  found = False
292  for index in range(0, len(word)):
293  if index > 0 and word[index - 1].isalpha() and word[index].isupper():
294  first = word[:index]
295  second = word[index:]
296  if AuthorType.isGoodWord(first, int(minAllowedWordLength)) and \
297  AuthorType.isGoodWord(second, int(minAllowedWordLength)):
298  ret = first + ' ' + second
299  if logger is not None:
300  logger.debug('Found author name from two concatinated words: ' + str(ret))
301  found = True
302  else:
303  if logger is not None:
304  logger.debug("Candidate '" + str(word) + \
305  "' for extract from two concatinated words didn't pass validate")
306  break
307  if found:
308  break
309 
310  # search nickname
311  if word.find('_') > -1:
312  wd = word.split('_')
313  if len(wd) > 0:
314  ret = wd[0]
315  if len(wd) > 1:
316  ret += (' ' + AuthorType.removeNoneAlpha(wd[1]).split()[0])
317  if logger is not None:
318  logger.debug('Found author name from nickname: ' + str(ret))
319  break
320 
321  return ret
322 
323 

◆ getDefaultProperties()

def dc_processor.AuthorType.AuthorType.getDefaultProperties ( )
static

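Returns a new dict that maps min_words, max_words, min_bytes, max_bytes, max_chars_word, clean_none_alpha, mismatch and undetected to their *_DEFAULT_VALUE constants. No entry is created for the value key.

Definition at line 75 of file AuthorType.py.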

75  def getDefaultProperties():
76  # variable for result
77  propDict = {}
78  # initialization use default values
79  propDict[AuthorType.MIN_WORDS_NAME] = AuthorType.MIN_WORDS_DEFAULT_VALUE
80  propDict[AuthorType.MAX_WORDS_NAME] = AuthorType.MAX_WORDS_DEFAULT_VALUE
81  propDict[AuthorType.MIN_BYTES_NAME] = AuthorType.MIN_BYTES_DEFAULT_VALUE
82  propDict[AuthorType.MAX_BYTES_NAME] = AuthorType.MAX_BYTES_DEFAULT_VALUE
83  propDict[AuthorType.MAX_CHARS_WORD_NAME] = AuthorType.MAX_CHARS_WORD_DEFAULT_VALUE
84  propDict[AuthorType.CLEAN_NONE_ALPHA_NAME] = AuthorType.CLEAN_NONE_ALPHA_DEFAULT_VALUE
85  propDict[AuthorType.MISMATCH_NAME] = AuthorType.MISMATCH_DEFAULT_VALUE
86  propDict[AuthorType.UNDETECTED_NAME] = AuthorType.UNDETECTED_DEFAULT_VALUE
87 
88  return propDict
89 
90 

◆ getPairNames()

def dc_processor.AuthorType.AuthorType.getPairNames (   wordsList,
  minAllowedWordLength,
  cleanNoneAlpha = False,
  logger = None 
)
static

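Returns the first pair of adjacent words in wordsList, joined with a single space, for which either both words pass isGoodWord(), or one word passes isGoodWord() and the other is entirely upper case. Returns None if no adjacent pair qualifies. When cleanNoneAlpha is true, non-alphabetic characters in both words are first replaced by spaces; the last fragment of the first word and the first fragment of the second word are then used for the test and for the result.

Definition at line 218 of file AuthorType.py.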

218  def getPairNames(wordsList, minAllowedWordLength, cleanNoneAlpha=False, logger=None):
219  # variable for result
220  ret = None
221  first = second = ''
222  for index in range(0, len(wordsList)):
223  if index < len(wordsList) - 1:
224  if logger is not None:
225  logger.debug('cleanNoneAlpha: ' + str(cleanNoneAlpha))
226 
227  if cleanNoneAlpha:
228  first = AuthorType.removeNoneAlpha(wordsList[index])
229  second = AuthorType.removeNoneAlpha(wordsList[index + 1])
230 
231  firstList = first.split()
232  if len(firstList) > 0:
233  first = firstList[-1]
234 
235  secondList = second.split()
236  if len(secondList) > 0:
237  second = secondList[0]
238 
239  else:
240  first = wordsList[index]
241  second = wordsList[index + 1]
242 
243  if logger is not None:
244  logger.debug('first: ' + str(first) + ' second: ' + str(second))
245 
246  if (AuthorType.isGoodWord(first, minAllowedWordLength, logger) and \
247  AuthorType.isGoodWord(second, minAllowedWordLength, logger)) or \
248  (AuthorType.isGoodWord(first, minAllowedWordLength, logger) and second.isupper()) or \
249  (first.isupper() and AuthorType.isGoodWord(second, minAllowedWordLength, logger)):
250  ret = first + ' ' + second
251  break
252 
253  return ret
254 
255 

◆ isGoodWord()

def dc_processor.AuthorType.AuthorType.isGoodWord (   word,
  minAllowedWordLength,
  logger = None 
)
static

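Returns True when len(word) is at least minAllowedWordLength and the word, decoded as UTF-8, is title-cased (istitle()); otherwise returns False. The word is decoded with the Python 2 built-in unicode(word, 'utf-8'), so a byte string is expected. When a logger is supplied, the input and the result are written at debug level.

Definition at line 176 of file AuthorType.py.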

176  def isGoodWord(word, minAllowedWordLength, logger=None):
177 
178  if logger is not None:
179  logger.debug('word: ' + str(word) + ' minAllowedWordLength = ' + str(minAllowedWordLength))
180  logger.debug('word.istitle(): ' + str(bool(unicode(word, 'utf-8').istitle())))
181 
182  # variable for result
183  ret = False
184  if len(word) >= minAllowedWordLength:
185  if unicode(word, 'utf-8').istitle():
186  ret = True
187 
188  if logger is not None:
189  logger.debug('ret = ' + str(ret))
190 
191  return ret
192 
193 

◆ makeParsing()

def dc_processor.AuthorType.AuthorType.makeParsing (   propDict,
  dataString,
  logger = None 
)
static

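Splits dataString on whitespace and searches the resulting word list for an author name in up to three steps, stopping at the first step that produces a result:

1. getPairNames() on the words as they are.
2. getPairNames() with non-alphabetic characters cleaned, only if the clean_none_alpha property is truthy.
3. extractAuthorName().

The min_bytes property is used as the minimum word length in every step, and max_chars_word as the maximum length passed to extractAuthorName(). Returns the name found, or None.

Definition at line 331 of file AuthorType.py.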

331  def makeParsing(propDict, dataString, logger=None):
332  wordsList = dataString.split()
333 
334  # Search pair: name, surname
335  ret = AuthorType.getPairNames(wordsList, int(propDict[AuthorType.MIN_BYTES_NAME]), False, None)
336  if logger is not None:
337  logger.debug('Search author as pair words: ' + str(ret))
338 
339  if ret is None and bool(propDict[AuthorType.CLEAN_NONE_ALPHA_NAME]):
340  ret = AuthorType.getPairNames(wordsList, int(propDict[AuthorType.MIN_BYTES_NAME]), True)
341  if logger is not None:
342  logger.debug('Search author as pair words after clean not alpha: ' + str(ret))
343 
344  if ret is None:
345  ret = AuthorType.extractAuthorName(wordsList, int(propDict[AuthorType.MIN_BYTES_NAME]),
346  int(propDict[AuthorType.MAX_CHARS_WORD_NAME]), logger)
347  if logger is not None:
348  logger.debug('makeParsing return: ' + str(ret))
349 
350  return ret
351 
352 

◆ mergeProperties()

def dc_processor.AuthorType.AuthorType.mergeProperties (   confProp,
  procProp 
)
static

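Builds the effective property dict. It starts from getDefaultProperties(), then applies the 'author' section of confProp, then the 'author' section of procProp, so processor properties override configuration properties. Each argument may be None, a dict, or a JSON string (decoded with json.loads). Raises Exception if an argument is of any other type, or if the 'author' key is missing. Indentation is lost in the listing below; the missing-key check appears to apply only to arguments that are not None.

Definition at line 96 of file AuthorType.py.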

96  def mergeProperties(confProp, procProp):
97 
98  if confProp is not None and not (isinstance(confProp, str) or isinstance(confProp, unicode) or\
99  isinstance(confProp, dict)):
100  raise Exception(AuthorType.ERROR_CONFIG_PROPERTY_TYPE + ': ' + str(type(confProp)))
101 
102  if procProp is not None and not (isinstance(procProp, str) or isinstance(procProp, unicode) or\
103  isinstance(procProp, dict)):
104  raise Exception(AuthorType.ERROR_PROCESSOR_PROPERTY_TYPE + ': ' + str(type(procProp)))
105 
106  # variable for result
107  propDict = AuthorType.getDefaultProperties()
108 
109  # update variables from config file
110  confPropDict = {}
111  if confProp is not None:
112  if not isinstance(confProp, dict):
113  confPropDict = json.loads(confProp)
114  else:
115  confPropDict = confProp
116 
117  if not confPropDict.has_key(AuthorType.MAIN_TAG_NAME):
118  raise Exception(AuthorType.ERROR_MAIN_TAG_NAME)
119 
120  propDict.update(confPropDict[AuthorType.MAIN_TAG_NAME])
121 
122  # update variables from PROCESSOR_PROPERTIES
123  procPropDict = {}
124  if procProp is not None:
125  if not isinstance(procProp, dict):
126  procPropDict = json.loads(procProp)
127  else:
128  procPropDict = procProp
129 
130  if not procPropDict.has_key(AuthorType.MAIN_TAG_NAME):
131  raise Exception(AuthorType.ERROR_MAIN_TAG_NAME)
132 
133  propDict.update(procPropDict[AuthorType.MAIN_TAG_NAME])
134 
135  return propDict
136 
137 

◆ parse()

def dc_processor.AuthorType.AuthorType.parse (   confProp,
  procProp,
  dataString,
  logger = None 
)
static

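Entry point for author extraction. The properties are merged with mergeProperties() and dataString is tested with checkDataStringLimits(). If the test passes, the result of makeParsing() is used. Otherwise the mismatch property selects the result:

- 'empty': an empty string
- 'ignore': dataString unchanged
- 'value': the value property
- 'parse': the result of makeParsing()
- any other setting: an empty string

If the result is None at that point, the undetected property is applied in the same way ('empty', 'ignore' or 'value'; any other setting gives an empty string).

No exception propagates to the caller. Any exception raised inside the method, including the one raised when dataString is neither str nor unicode, is logged at debug level (if a logger is supplied) and None is returned.

Definition at line 361 of file AuthorType.py.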

361  def parse(confProp, procProp, dataString, logger=None):
362  # variable for result
363  ret = None
364  try:
365  if logger is not None:
366  logger.debug('input raw data to parse: ' + str(dataString))
367 
368  if not isinstance(dataString, str) and not isinstance(dataString, unicode):
369  raise Exception(AuthorType.ERROR_DATA_STRING_TYPE + ' type: ' + str(type(dataString)))
370 
371  propDict = AuthorType.mergeProperties(confProp, procProp)
372  if logger is not None:
373  logger.debug('merged properties: ' + str(propDict))
374 
375  isGood = AuthorType.checkDataStringLimits(propDict, dataString, logger)
376  if logger is not None:
377  logger.debug('isGood: ' + str(bool(isGood)))
378 
379  # check mismatch
380  if isGood:
381  ret = AuthorType.makeParsing(propDict, str(dataString), logger)
382  else:
383  if propDict[AuthorType.MISMATCH_NAME] == AuthorType.MISMATCH_VALUE_EMPTY:
384  ret = ''
385  elif propDict[AuthorType.MISMATCH_NAME] == AuthorType.MISMATCH_VALUE_IGNORE:
386  ret = dataString
387  elif propDict[AuthorType.MISMATCH_NAME] == AuthorType.MISMATCH_VALUE_VALUE:
388  ret = propDict[AuthorType.VALUE_NAME]
389  elif propDict[AuthorType.MISMATCH_NAME] == AuthorType.MISMATCH_VALUE_PARSE:
390  ret = AuthorType.makeParsing(propDict, dataString, logger)
391  else:
392  ret = ''
393 
394  # check undetected
395  if ret is None:
396  if propDict[AuthorType.UNDETECTED_NAME] == AuthorType.UNDETECTED_VALUE_EMPTY:
397  ret = ''
398  elif propDict[AuthorType.UNDETECTED_NAME] == AuthorType.UNDETECTED_VALUE_IGNORE:
399  ret = dataString
400  elif propDict[AuthorType.UNDETECTED_NAME] == AuthorType.UNDETECTED_VALUE_VALUE:
401  ret = propDict[AuthorType.VALUE_NAME]
402  else:
403  ret = ''
404 
405  except Exception, err:
406  if logger is not None:
407  logger.debug('Error: ' + str(err))
408 
409  return ret

◆ removeNoneAlpha()

def dc_processor.AuthorType.AuthorType.removeNoneAlpha (   word)
static

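Returns a copy of word in which every character for which isalpha() is false has been replaced by a single space. The length of the string is unchanged.

Definition at line 199 of file AuthorType.py.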

199  def removeNoneAlpha(word):
200  wd = []
201  for s in word:
202  if s.isalpha():
203  wd.append(s)
204  else:
205  wd.append(' ')
206 
207  return ''.join(wd)
208 
209 
Definition: join.py:1

Member Data Documentation

◆ author

dc_processor.AuthorType.AuthorType.author

Definition at line 67 of file AuthorType.py.

◆ CLEAN_NONE_ALPHA_DEFAULT_VALUE

int dc_processor.AuthorType.AuthorType.CLEAN_NONE_ALPHA_DEFAULT_VALUE = 1
static

Definition at line 49 of file AuthorType.py.

◆ CLEAN_NONE_ALPHA_NAME

string dc_processor.AuthorType.AuthorType.CLEAN_NONE_ALPHA_NAME = 'clean_none_alpha'
static

Definition at line 30 of file AuthorType.py.

◆ ERROR_CONFIG_PROPERTY_TYPE

string dc_processor.AuthorType.AuthorType.ERROR_CONFIG_PROPERTY_TYPE = 'Config property type is wrong'
static

Definition at line 56 of file AuthorType.py.

◆ ERROR_DATA_STRING_TYPE

string dc_processor.AuthorType.AuthorType.ERROR_DATA_STRING_TYPE = 'Data string is not string.'
static

Definition at line 55 of file AuthorType.py.

◆ ERROR_MAIN_TAG_NAME

string dc_processor.AuthorType.AuthorType.ERROR_MAIN_TAG_NAME = "Main tag name '" + str(MAIN_TAG_NAME) + "' not found"
static

Definition at line 58 of file AuthorType.py.

◆ ERROR_PROCESSOR_PROPERTY_TYPE

string dc_processor.AuthorType.AuthorType.ERROR_PROCESSOR_PROPERTY_TYPE = 'Processor property type is wrong'
static

Definition at line 57 of file AuthorType.py.

◆ MAIN_TAG_NAME

string dc_processor.AuthorType.AuthorType.MAIN_TAG_NAME = 'author'
static

Definition at line 23 of file AuthorType.py.

◆ MAX_BYTES_DEFAULT_VALUE

int dc_processor.AuthorType.AuthorType.MAX_BYTES_DEFAULT_VALUE = 32
static

Definition at line 47 of file AuthorType.py.

◆ MAX_BYTES_NAME

string dc_processor.AuthorType.AuthorType.MAX_BYTES_NAME = 'max_bytes'
static

Definition at line 28 of file AuthorType.py.

◆ MAX_CHARS_WORD_DEFAULT_VALUE

int dc_processor.AuthorType.AuthorType.MAX_CHARS_WORD_DEFAULT_VALUE = MAX_BYTES_DEFAULT_VALUE
static

Definition at line 48 of file AuthorType.py.

◆ MAX_CHARS_WORD_NAME

string dc_processor.AuthorType.AuthorType.MAX_CHARS_WORD_NAME = 'max_chars_word'
static

Definition at line 29 of file AuthorType.py.

◆ MAX_WORDS_DEFAULT_VALUE

int dc_processor.AuthorType.AuthorType.MAX_WORDS_DEFAULT_VALUE = 8
static

Definition at line 45 of file AuthorType.py.

◆ MAX_WORDS_NAME

string dc_processor.AuthorType.AuthorType.MAX_WORDS_NAME = 'max_words'
static

Definition at line 26 of file AuthorType.py.

◆ MIN_BYTES_DEFAULT_VALUE

int dc_processor.AuthorType.AuthorType.MIN_BYTES_DEFAULT_VALUE = 3
static

Definition at line 46 of file AuthorType.py.

◆ MIN_BYTES_NAME

string dc_processor.AuthorType.AuthorType.MIN_BYTES_NAME = 'min_bytes'
static

Definition at line 27 of file AuthorType.py.

◆ MIN_WORDS_DEFAULT_VALUE

int dc_processor.AuthorType.AuthorType.MIN_WORDS_DEFAULT_VALUE = 1
static

Definition at line 44 of file AuthorType.py.

◆ MIN_WORDS_NAME

string dc_processor.AuthorType.AuthorType.MIN_WORDS_NAME = 'min_words'
static

Definition at line 25 of file AuthorType.py.

◆ MISMATCH_DEFAULT_VALUE

string dc_processor.AuthorType.AuthorType.MISMATCH_DEFAULT_VALUE = MISMATCH_VALUE_EMPTY
static

Definition at line 50 of file AuthorType.py.

◆ MISMATCH_NAME

string dc_processor.AuthorType.AuthorType.MISMATCH_NAME = 'mismatch'
static

Definition at line 32 of file AuthorType.py.

◆ MISMATCH_VALUE_EMPTY

string dc_processor.AuthorType.AuthorType.MISMATCH_VALUE_EMPTY = 'empty'
static

Definition at line 35 of file AuthorType.py.

◆ MISMATCH_VALUE_IGNORE

string dc_processor.AuthorType.AuthorType.MISMATCH_VALUE_IGNORE = 'ignore'
static

Definition at line 36 of file AuthorType.py.

◆ MISMATCH_VALUE_PARSE

string dc_processor.AuthorType.AuthorType.MISMATCH_VALUE_PARSE = 'parse'
static

Definition at line 38 of file AuthorType.py.

◆ MISMATCH_VALUE_VALUE

string dc_processor.AuthorType.AuthorType.MISMATCH_VALUE_VALUE = 'value'
static

Definition at line 37 of file AuthorType.py.

◆ UNDETECTED_DEFAULT_VALUE

string dc_processor.AuthorType.AuthorType.UNDETECTED_DEFAULT_VALUE = UNDETECTED_VALUE_EMPTY
static

Definition at line 51 of file AuthorType.py.

◆ UNDETECTED_NAME

string dc_processor.AuthorType.AuthorType.UNDETECTED_NAME = 'undetected'
static

Definition at line 33 of file AuthorType.py.

◆ UNDETECTED_VALUE_EMPTY

string dc_processor.AuthorType.AuthorType.UNDETECTED_VALUE_EMPTY = 'empty'
static

Definition at line 40 of file AuthorType.py.

◆ UNDETECTED_VALUE_IGNORE

string dc_processor.AuthorType.AuthorType.UNDETECTED_VALUE_IGNORE = 'ignore'
static

Definition at line 41 of file AuthorType.py.

◆ UNDETECTED_VALUE_VALUE

string dc_processor.AuthorType.AuthorType.UNDETECTED_VALUE_VALUE = 'value'
static

Definition at line 42 of file AuthorType.py.

◆ VALUE_NAME

string dc_processor.AuthorType.AuthorType.VALUE_NAME = 'value'
static

Definition at line 31 of file AuthorType.py.


The documentation for this class was generated from the following file: