HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
AuthorType.py
Go to the documentation of this file.
1 # coding: utf-8
2 """
3 HCE project, Python bindings, Distributed Tasks Manager application.
4 The AuthorType class contains the main functionality for extracting author data.
5 
6 @package: dc_processor
7 @file AuthorType.py
8 @author Alexander Vybornyh <alexander.hce.cluster@gmail.com>
9 @link: http://hierarchical-cluster-engine.com/
10 @copyright: Copyright &copy; 2013-2015 IOIX Ukraine
11 @license: http://hierarchical-cluster-engine.com/license/
12 @since: 0.1
13 """
14 
15 import json
16 
17 
18 # # Class AuthorType extracts author data from a raw string.
19 #
class AuthorType(object):
    # # Constants used in class
    # Author tag name value
    MAIN_TAG_NAME = 'author'
    # Supported option names
    MIN_WORDS_NAME = 'min_words'
    MAX_WORDS_NAME = 'max_words'
    MIN_BYTES_NAME = 'min_bytes'
    MAX_BYTES_NAME = 'max_bytes'
    MAX_CHARS_WORD_NAME = 'max_chars_word'
    CLEAN_NONE_ALPHA_NAME = 'clean_none_alpha'
    VALUE_NAME = 'value'
    MISMATCH_NAME = 'mismatch'
    UNDETECTED_NAME = 'undetected'
    # 'Mismatch' option values
    MISMATCH_VALUE_EMPTY = 'empty'
    MISMATCH_VALUE_IGNORE = 'ignore'
    MISMATCH_VALUE_VALUE = 'value'
    MISMATCH_VALUE_PARSE = 'parse'
    # 'Undetected' option values
    UNDETECTED_VALUE_EMPTY = 'empty'
    UNDETECTED_VALUE_IGNORE = 'ignore'
    UNDETECTED_VALUE_VALUE = 'value'
    # Default option values
    MIN_WORDS_DEFAULT_VALUE = 1
    MAX_WORDS_DEFAULT_VALUE = 8
    MIN_BYTES_DEFAULT_VALUE = 3
    MAX_BYTES_DEFAULT_VALUE = 32
    MAX_CHARS_WORD_DEFAULT_VALUE = MAX_BYTES_DEFAULT_VALUE
    CLEAN_NONE_ALPHA_DEFAULT_VALUE = 1
    MISMATCH_DEFAULT_VALUE = MISMATCH_VALUE_EMPTY
    UNDETECTED_DEFAULT_VALUE = UNDETECTED_VALUE_EMPTY

    # # Constants of error messages
    ERROR_DATA_STRING_TYPE = 'Data string is not string.'
    ERROR_CONFIG_PROPERTY_TYPE = 'Config property type is wrong'
    ERROR_PROCESSOR_PROPERTY_TYPE = 'Processor property type is wrong'
    ERROR_MAIN_TAG_NAME = "Main tag name '" + str(MAIN_TAG_NAME) + "' not found"

    # String types accepted for the data string and for JSON encoded properties.
    # Resolved once so the type checks work on both Python 2 and Python 3.
    try:
        _STRING_TYPES = (str, unicode)  # noqa: F821 - Python 2 byte and unicode strings
    except NameError:
        _STRING_TYPES = (str,)  # Python 3 has no separate 'unicode' builtin

    # # Constructor
    #
    # Runs parse() once and keeps the result in self.author.
    #
    # @param confProp - properties as JSON already read from config file
    # @param procProp - properties as JSON from PROCESSOR_PROPERTIES
    # @param dataString - string for extract
    # @param logger - instance of logger for log if necessary
    def __init__(self, confProp=None, procProp=None, dataString=None, logger=None):
        self.author = AuthorType.parse(confProp, procProp, dataString, logger)


    # # Get default properties
    #
    # @param - None
    # @return new dictionary with the default value of every supported option
    #         ('value' has no default and is therefore absent)
    @staticmethod
    def getDefaultProperties():
        return {
            AuthorType.MIN_WORDS_NAME: AuthorType.MIN_WORDS_DEFAULT_VALUE,
            AuthorType.MAX_WORDS_NAME: AuthorType.MAX_WORDS_DEFAULT_VALUE,
            AuthorType.MIN_BYTES_NAME: AuthorType.MIN_BYTES_DEFAULT_VALUE,
            AuthorType.MAX_BYTES_NAME: AuthorType.MAX_BYTES_DEFAULT_VALUE,
            AuthorType.MAX_CHARS_WORD_NAME: AuthorType.MAX_CHARS_WORD_DEFAULT_VALUE,
            AuthorType.CLEAN_NONE_ALPHA_NAME: AuthorType.CLEAN_NONE_ALPHA_DEFAULT_VALUE,
            AuthorType.MISMATCH_NAME: AuthorType.MISMATCH_DEFAULT_VALUE,
            AuthorType.UNDETECTED_NAME: AuthorType.UNDETECTED_DEFAULT_VALUE,
        }


    # # Merge properties
    #
    # Starts from the defaults, then applies the 'author' section of confProp and
    # finally the 'author' section of procProp, so procProp has the highest priority.
    #
    # @param confProp - properties as JSON string or dict already read from config file, or None
    # @param procProp - properties as JSON string or dict from PROCESSOR_PROPERTIES, or None
    # @return dictionary with merged properties
    # @raise Exception - if a property set has a wrong type or lacks the main tag name
    @staticmethod
    def mergeProperties(confProp, procProp):
        allowedTypes = AuthorType._STRING_TYPES + (dict,)

        # Both types are validated before anything is parsed
        if confProp is not None and not isinstance(confProp, allowedTypes):
            raise Exception(AuthorType.ERROR_CONFIG_PROPERTY_TYPE + ': ' + str(type(confProp)))

        if procProp is not None and not isinstance(procProp, allowedTypes):
            raise Exception(AuthorType.ERROR_PROCESSOR_PROPERTY_TYPE + ': ' + str(type(procProp)))

        propDict = AuthorType.getDefaultProperties()

        # Order matters: config file values first, PROCESSOR_PROPERTIES override them
        for prop in (confProp, procProp):
            if prop is None:
                continue

            loaded = prop if isinstance(prop, dict) else json.loads(prop)
            if AuthorType.MAIN_TAG_NAME not in loaded:
                raise Exception(AuthorType.ERROR_MAIN_TAG_NAME)

            propDict.update(loaded[AuthorType.MAIN_TAG_NAME])

        return propDict


    # # Check data string limits
    #
    # The string length must be inside [min_bytes, max_bytes] and the number of
    # words that are at least min_bytes long must be inside [min_words, max_words].
    #
    # @param propDict - dictionary of properties
    # @param dataString - data string for extract
    # @param logger - instance of logger for log if necessary
    # @return True - if allowed limits interval, otherwise False
    @staticmethod
    def checkDataStringLimits(propDict, dataString, logger=None):
        bytesCount = len(dataString)
        minBytes = int(propDict[AuthorType.MIN_BYTES_NAME])
        # Only words that reach the minimal length are counted
        wordsCount = sum(1 for word in dataString.split() if len(word) >= minBytes)

        if logger is not None:
            logger.debug('bytesCount = %s', bytesCount)
            logger.debug('wordsCount = %s', wordsCount)

        return (minBytes <= bytesCount <= int(propDict[AuthorType.MAX_BYTES_NAME]) and
                int(propDict[AuthorType.MIN_WORDS_NAME]) <= wordsCount <=
                int(propDict[AuthorType.MAX_WORDS_NAME]))


    # # Check word is good or not
    #
    # A word is good when it is long enough and title-cased (str.istitle()).
    #
    # @param word - word to check, byte string (decoded as UTF-8) or text string
    # @param minAllowedWordLength - min allowed length of word
    # @param logger - instance of logger for log if necessary
    # @return True - if success, otherwise False
    @staticmethod
    def isGoodWord(word, minAllowedWordLength, logger=None):
        # Decode only byte strings; text that is already decoded is used as is
        # (decoding an already decoded string raises TypeError on Python 2).
        text = word.decode('utf-8') if isinstance(word, bytes) else word
        isTitle = bool(text.istitle())

        if logger is not None:
            logger.debug('word: %s minAllowedWordLength = %s', word, minAllowedWordLength)
            logger.debug('word.istitle(): %s', isTitle)

        ret = len(word) >= minAllowedWordLength and isTitle

        if logger is not None:
            logger.debug('ret = %s', ret)

        return ret


    # # Remove none alpha from word
    #
    # @param word - input word
    # @return word where every non alpha symbol is replaced by a space
    @staticmethod
    def removeNoneAlpha(word):
        return ''.join(s if s.isalpha() else ' ' for s in word)


    # # Get pair names
    #
    # Looks for the first two adjacent words where both are good words, or one is
    # a good word and the other one is upper case.
    #
    # @param wordsList - words list for extract
    # @param minAllowedWordLength - min allowed length of word
    # @param cleanNoneAlpha - flag of clean none alpha symbols before analyze
    # @param logger - instance of logger for log if necessary
    # @return string of pair names if success, otherwise None
    @staticmethod
    def getPairNames(wordsList, minAllowedWordLength, cleanNoneAlpha=False, logger=None):
        ret = None
        for index in range(len(wordsList) - 1):
            if logger is not None:
                logger.debug('cleanNoneAlpha: %s', cleanNoneAlpha)

            first = wordsList[index]
            second = wordsList[index + 1]
            if cleanNoneAlpha:
                first = AuthorType.removeNoneAlpha(first)
                second = AuthorType.removeNoneAlpha(second)

                # Keep the fragments nearest to the border between the two words
                firstList = first.split()
                if firstList:
                    first = firstList[-1]

                secondList = second.split()
                if secondList:
                    second = secondList[0]

            if logger is not None:
                logger.debug('first: %s second: %s', first, second)

            if AuthorType.isGoodWord(first, minAllowedWordLength, logger):
                matched = AuthorType.isGoodWord(second, minAllowedWordLength, logger) or second.isupper()
            else:
                matched = first.isupper() and AuthorType.isGoodWord(second, minAllowedWordLength, logger)

            if matched:
                ret = first + ' ' + second
                break

        return ret


    # # Split a word made of two concatenated names (helper of extractAuthorName)
    #
    # Only the first position where an upper case symbol follows an alpha symbol
    # is tried as the border between the two names.
    #
    # @param word - word to split
    # @param minAllowedWordLength - min allowed length of each part
    # @param logger - instance of logger for log if necessary
    # @return 'First Second' string if both parts are good words, otherwise None
    @staticmethod
    def _splitConcatenatedWord(word, minAllowedWordLength, logger=None):
        for index in range(1, len(word)):
            if word[index - 1].isalpha() and word[index].isupper():
                first = word[:index]
                second = word[index:]
                if AuthorType.isGoodWord(first, minAllowedWordLength) and \
                        AuthorType.isGoodWord(second, minAllowedWordLength):
                    ret = first + ' ' + second
                    if logger is not None:
                        logger.debug('Found author name from two concatinated words: %s', ret)
                    return ret

                if logger is not None:
                    logger.debug("Candidate '%s' for extract from two concatinated words didn't pass validate",
                                 word)
                return None

        return None


    # # Extract author name
    #
    # For each word, in order, the following rules are tried and the first hit wins:
    # title-cased good word (a word equal to the first element is skipped),
    # local part of an email, two concatenated names, nickname with underscore.
    #
    # @param wordsList - words list for extract
    # @param minAllowedWordLength - min allowed length of word
    # @param maxAllowedWordLength - max allowed length of word
    # @param logger - instance of logger for log if necessary
    # @return author name as string if success, otherwise None
    @staticmethod
    def extractAuthorName(wordsList, minAllowedWordLength, maxAllowedWordLength, logger=None):
        ret = None
        # Limits may arrive as strings from JSON; coerce once so every rule compares numbers
        minLength = int(minAllowedWordLength)
        maxLength = int(maxAllowedWordLength)

        for word in wordsList:
            # first word with upper title
            if AuthorType.isGoodWord(word, minLength) and word != wordsList[0]:
                ret = AuthorType.removeNoneAlpha(word).strip()
                if logger is not None:
                    logger.debug('Found first word with upper title: %s', ret)
                break

            # extract from email
            pos = word.find('@')
            if pos > -1:
                authorName = word[:pos]
                if minLength <= len(authorName) <= maxLength:
                    ret = authorName
                    if logger is not None:
                        logger.debug('Found author name in email: %s', ret)
                    break
                elif logger is not None:
                    logger.debug("Candidate '%s' for extract from email didn't pass limits", authorName)

            # search two words was concatenated
            if len(word) > minLength:
                ret = AuthorType._splitConcatenatedWord(word, minLength, logger)
                if ret is not None:
                    break

            # search nickname
            if '_' in word:
                parts = word.split('_')
                ret = parts[0]
                if len(parts) > 1:
                    # The second part may hold no alpha symbols at all (e.g. 'john_123')
                    tail = AuthorType.removeNoneAlpha(parts[1]).split()
                    if tail:
                        ret += ' ' + tail[0]
                if logger is not None:
                    logger.debug('Found author name from nickname: %s', ret)
                break

        return ret


    # # Make parsing data string
    #
    # Tries a pair of names on the raw words, then (if 'clean_none_alpha' is set)
    # on the words without non alpha symbols, and finally the single word rules
    # of extractAuthorName().
    #
    # @param propDict - dictionary of properties
    # @param dataString - string for extract
    # @param logger - instance of logger for log if necessary
    # @return string value - if success, otherwise None
    @staticmethod
    def makeParsing(propDict, dataString, logger=None):
        wordsList = dataString.split()
        minLength = int(propDict[AuthorType.MIN_BYTES_NAME])

        # Search pair: name, surname
        ret = AuthorType.getPairNames(wordsList, minLength, False, None)
        if logger is not None:
            logger.debug('Search author as pair words: %s', ret)

        if ret is None and bool(propDict[AuthorType.CLEAN_NONE_ALPHA_NAME]):
            ret = AuthorType.getPairNames(wordsList, minLength, True)
            if logger is not None:
                logger.debug('Search author as pair words after clean not alpha: %s', ret)

        if ret is None:
            ret = AuthorType.extractAuthorName(wordsList, minLength,
                                               int(propDict[AuthorType.MAX_CHARS_WORD_NAME]), logger)
        if logger is not None:
            logger.debug('makeParsing return: %s', ret)

        return ret


    # # Resolve the replacement value for a 'mismatch' or 'undetected' option (helper of parse)
    #
    # @param option - configured option value
    # @param ignoreOption - option value that means "return the data string as is"
    # @param valueOption - option value that means "return the configured 'value' property"
    # @param propDict - dictionary of properties
    # @param dataString - original data string
    # @return replacement string, empty string for the 'empty' or any unknown option
    # @raise KeyError - if the 'value' property is requested but not configured
    @staticmethod
    def _resolveFallback(option, ignoreOption, valueOption, propDict, dataString):
        if option == ignoreOption:
            return dataString
        if option == valueOption:
            return propDict[AuthorType.VALUE_NAME]
        return ''


    # # static method for parse
    #
    # Any error (wrong input type, broken properties, missing 'value' property, ...)
    # is logged on debug level and leaves the result as None.
    #
    # @param confProp - properties as JSON already read from config file
    # @param procProp - properties as JSON from PROCESSOR_PROPERTIES
    # @param dataString - string for extract
    # @param logger - instance of logger for log if necessary
    # @return extracted author as string or None
    @staticmethod
    def parse(confProp, procProp, dataString, logger=None):
        ret = None
        try:
            if logger is not None:
                logger.debug('input raw data to parse: %s', dataString)

            if not isinstance(dataString, AuthorType._STRING_TYPES):
                raise Exception(AuthorType.ERROR_DATA_STRING_TYPE + ' type: ' + str(type(dataString)))

            propDict = AuthorType.mergeProperties(confProp, procProp)
            if logger is not None:
                logger.debug('merged properties: %s', propDict)

            isGood = AuthorType.checkDataStringLimits(propDict, dataString, logger)
            if logger is not None:
                logger.debug('isGood: %s', bool(isGood))

            # check mismatch
            mismatch = propDict[AuthorType.MISMATCH_NAME]
            if isGood or mismatch == AuthorType.MISMATCH_VALUE_PARSE:
                # The string is passed as is: forcing it through str() breaks
                # non-ASCII unicode input on Python 2.
                ret = AuthorType.makeParsing(propDict, dataString, logger)
            else:
                ret = AuthorType._resolveFallback(mismatch, AuthorType.MISMATCH_VALUE_IGNORE,
                                                  AuthorType.MISMATCH_VALUE_VALUE, propDict, dataString)

            # check undetected
            if ret is None:
                ret = AuthorType._resolveFallback(propDict[AuthorType.UNDETECTED_NAME],
                                                  AuthorType.UNDETECTED_VALUE_IGNORE,
                                                  AuthorType.UNDETECTED_VALUE_VALUE, propDict, dataString)

        except Exception as err:  # best effort by design: never propagate to the caller
            if logger is not None:
                logger.debug('Error: %s', err)

        return ret
def getPairNames(wordsList, minAllowedWordLength, cleanNoneAlpha=False, logger=None)
Definition: AuthorType.py:218
def makeParsing(propDict, dataString, logger=None)
Definition: AuthorType.py:331
def extractAuthorName(wordsList, minAllowedWordLength, maxAllowedWordLength, logger=None)
Definition: AuthorType.py:264
def mergeProperties(confProp, procProp)
Definition: AuthorType.py:96
def parse(confProp, procProp, dataString, logger=None)
Definition: AuthorType.py:361
def isGoodWord(word, minAllowedWordLength, logger=None)
Definition: AuthorType.py:176
def checkDataStringLimits(propDict, dataString, logger=None)
Definition: AuthorType.py:145
Definition: join.py:1
def __init__(self, confProp=None, procProp=None, dataString=None, logger=None)
Definition: AuthorType.py:66