# NOTE(review): this listing embeds the original file's line numbers in the text and
# skips some of them (e.g. 12-22), so statements are split across physical lines and
# the class header is not visible here.
# File header text, followed by the AuthorType constants: the main tag name, the
# property key names, the allowed 'mismatch' / 'undetected' policy values and the
# default limits.
3 HCE project, Python bindings, Distributed Tasks Manager application. 4 AuthorType Class content main functional extract of author data. 8 @author Alexander Vybornyh <alexander.hce.cluster@gmail.com> 9 @link: http://hierarchical-cluster-engine.com/ 10 @copyright: Copyright © 2013-2015 IOIX Ukraine 11 @license: http://hierarchical-cluster-engine.com/license/ 23 MAIN_TAG_NAME =
'author' 25 MIN_WORDS_NAME =
'min_words' 26 MAX_WORDS_NAME =
'max_words' 27 MIN_BYTES_NAME =
'min_bytes' 28 MAX_BYTES_NAME =
'max_bytes' 29 MAX_CHARS_WORD_NAME =
'max_chars_word' 30 CLEAN_NONE_ALPHA_NAME =
'clean_none_alpha' 32 MISMATCH_NAME =
'mismatch' 33 UNDETECTED_NAME =
'undetected' 35 MISMATCH_VALUE_EMPTY =
'empty' 36 MISMATCH_VALUE_IGNORE =
'ignore' 37 MISMATCH_VALUE_VALUE =
'value' 38 MISMATCH_VALUE_PARSE =
'parse' 40 UNDETECTED_VALUE_EMPTY =
'empty' 41 UNDETECTED_VALUE_IGNORE =
'ignore' 42 UNDETECTED_VALUE_VALUE =
'value' 44 MIN_WORDS_DEFAULT_VALUE = 1
# Remaining defaults: the per-word character cap reuses the max-bytes default (32),
# and both policies default to 'empty'.
45 MAX_WORDS_DEFAULT_VALUE = 8
46 MIN_BYTES_DEFAULT_VALUE = 3
47 MAX_BYTES_DEFAULT_VALUE = 32
48 MAX_CHARS_WORD_DEFAULT_VALUE = MAX_BYTES_DEFAULT_VALUE
49 CLEAN_NONE_ALPHA_DEFAULT_VALUE = 1
50 MISMATCH_DEFAULT_VALUE = MISMATCH_VALUE_EMPTY
51 UNDETECTED_DEFAULT_VALUE = UNDETECTED_VALUE_EMPTY
# Error message texts used by the raise statements further down. The main-tag
# message is built once from MAIN_TAG_NAME, i.e. "Main tag name 'author' not found".
55 ERROR_DATA_STRING_TYPE =
'Data string is not string.' 56 ERROR_CONFIG_PROPERTY_TYPE =
'Config property type is wrong' 57 ERROR_PROCESSOR_PROPERTY_TYPE =
'Processor property type is wrong' 58 ERROR_MAIN_TAG_NAME =
"Main tag name '" + str(MAIN_TAG_NAME) +
"' not found" 66 def __init__(self, confProp=None, procProp=None, dataString=None, logger=None):
# Constructor: passes all four arguments to AuthorType.parse and stores the result
# in self.author.
67 self.
author = AuthorType.parse(confProp, procProp, dataString, logger)
# Presumably the body of getDefaultProperties() (its def line, the creation of
# propDict and the return statement are not visible in this listing - TODO confirm).
# Stores the class default under every property key name.
79 propDict[AuthorType.MIN_WORDS_NAME] = AuthorType.MIN_WORDS_DEFAULT_VALUE
80 propDict[AuthorType.MAX_WORDS_NAME] = AuthorType.MAX_WORDS_DEFAULT_VALUE
81 propDict[AuthorType.MIN_BYTES_NAME] = AuthorType.MIN_BYTES_DEFAULT_VALUE
82 propDict[AuthorType.MAX_BYTES_NAME] = AuthorType.MAX_BYTES_DEFAULT_VALUE
83 propDict[AuthorType.MAX_CHARS_WORD_NAME] = AuthorType.MAX_CHARS_WORD_DEFAULT_VALUE
84 propDict[AuthorType.CLEAN_NONE_ALPHA_NAME] = AuthorType.CLEAN_NONE_ALPHA_DEFAULT_VALUE
85 propDict[AuthorType.MISMATCH_NAME] = AuthorType.MISMATCH_DEFAULT_VALUE
86 propDict[AuthorType.UNDETECTED_NAME] = AuthorType.UNDETECTED_DEFAULT_VALUE
# Presumably the body of mergeProperties(confProp, procProp); the def line and the
# return are not visible in this listing - TODO confirm.
# Type check: each argument must be None, a str, a unicode or a dict; anything else
# raises Exception with the matching error text plus the offending type.
# NOTE(review): `unicode` and dict.has_key() exist only in Python 2.
98 if confProp
is not None and not (isinstance(confProp, str)
or isinstance(confProp, unicode)
or\
99 isinstance(confProp, dict)):
100 raise Exception(AuthorType.ERROR_CONFIG_PROPERTY_TYPE +
': ' + str(
type(confProp)))
# Same type check for the processor property.
102 if procProp
is not None and not (isinstance(procProp, str)
or isinstance(procProp, unicode)
or\
103 isinstance(procProp, dict)):
104 raise Exception(AuthorType.ERROR_PROCESSOR_PROPERTY_TYPE +
': ' + str(
type(procProp)))
# Start from the defaults, then overlay the supplied properties.
107 propDict = AuthorType.getDefaultProperties()
# Config property: a non-dict value is decoded with json.loads. The dict must
# contain the 'author' key, otherwise Exception(ERROR_MAIN_TAG_NAME) is raised; the
# value stored under that key is merged into propDict.
# NOTE(review): original line 114 is missing between the two assignments below,
# presumably an `else:` - confirm against the real file.
111 if confProp
is not None:
112 if not isinstance(confProp, dict):
113 confPropDict = json.loads(confProp)
115 confPropDict = confProp
117 if not confPropDict.has_key(AuthorType.MAIN_TAG_NAME):
118 raise Exception(AuthorType.ERROR_MAIN_TAG_NAME)
120 propDict.update(confPropDict[AuthorType.MAIN_TAG_NAME])
# Processor property: handled the same way and applied after the config property,
# so its values win on key collisions. Original line 127 is missing here as well
# (presumably `else:`).
124 if procProp
is not None:
125 if not isinstance(procProp, dict):
126 procPropDict = json.loads(procProp)
128 procPropDict = procProp
130 if not procPropDict.has_key(AuthorType.MAIN_TAG_NAME):
131 raise Exception(AuthorType.ERROR_MAIN_TAG_NAME)
133 propDict.update(procPropDict[AuthorType.MAIN_TAG_NAME])
# Presumably the body of checkDataStringLimits(propDict, dataString, logger); the def
# line, the wordsCount initialisation/increment and the result lines are not visible
# in this listing - TODO confirm.
# bytesCount is len(dataString). NOTE(review): for a unicode object len() counts
# characters, not bytes, and parse() passes dataString through unconverted.
149 bytesCount = len(dataString)
# Walks the whitespace-separated words and tests each word's length against the
# 'min_bytes' property; the statement executed when the test passes is not visible
# (presumably the wordsCount increment).
151 for word
in dataString.split():
152 if len(word) >= int(propDict[AuthorType.MIN_BYTES_NAME]):
155 if logger
is not None:
156 logger.debug(
'bytesCount = ' + str(bytesCount))
157 logger.debug(
'wordsCount = ' + str(wordsCount))
# Limits test: bytesCount must lie within [min_bytes, max_bytes] and wordsCount
# within [min_words, max_words], bounds inclusive. Property values are cast with int().
160 if bytesCount >= int(propDict[AuthorType.MIN_BYTES_NAME])
and \
161 bytesCount <= int(propDict[AuthorType.MAX_BYTES_NAME])
and \
162 wordsCount >= int(propDict[AuthorType.MIN_WORDS_NAME])
and \
163 wordsCount <= int(propDict[AuthorType.MAX_WORDS_NAME]):
# Presumably the body of isGoodWord(word, minAllowedWordLength, logger); the def line,
# the initialisation of ret and the return are not visible in this listing - TODO confirm.
# NOTE(review): unicode(word, 'utf-8') is Python 2 only and raises TypeError when word
# is already a unicode object - confirm that callers only pass byte strings.
178 if logger
is not None:
179 logger.debug(
'word: ' + str(word) +
' minAllowedWordLength = ' + str(minAllowedWordLength))
180 logger.debug(
'word.istitle(): ' + str(bool(unicode(word,
'utf-8').istitle())))
# Visible test: the word is at least minAllowedWordLength long and its UTF-8 decoded
# form is title-cased. The statement run on success is not visible.
184 if len(word) >= minAllowedWordLength:
185 if unicode(word,
'utf-8').istitle():
188 if logger
is not None:
189 logger.debug(
'ret = ' + str(ret))
# Looks for two neighbouring words that together look like a name and builds
# "<first> <second>" from them.
# NOTE(review): several original lines are missing in this listing (219-221, 226-227,
# 238-239 and everything after 250), so the initial value of ret, the branch heads
# and the return are not visible.
218 def getPairNames(wordsList, minAllowedWordLength, cleanNoneAlpha=False, logger=None):
# Visits every index that has a successor, i.e. each adjacent (index, index + 1) pair.
222 for index
in range(0, len(wordsList)):
223 if index < len(wordsList) - 1:
224 if logger
is not None:
225 logger.debug(
'cleanNoneAlpha: ' + str(cleanNoneAlpha))
# Cleaning variant (presumably guarded by cleanNoneAlpha - the condition line is not
# visible): both words go through removeNoneAlpha; if the cleaned text splits into
# several tokens, the last token of the first word and the first token of the second
# word are kept.
228 first = AuthorType.removeNoneAlpha(wordsList[index])
229 second = AuthorType.removeNoneAlpha(wordsList[index + 1])
231 firstList = first.split()
232 if len(firstList) > 0:
233 first = firstList[-1]
235 secondList = second.split()
236 if len(secondList) > 0:
237 second = secondList[0]
# Raw variant (presumably the else branch): the words are used as they are.
240 first = wordsList[index]
241 second = wordsList[index + 1]
243 if logger
is not None:
244 logger.debug(
'first: ' + str(first) +
' second: ' + str(second))
# The pair is accepted when both words pass isGoodWord, or when one passes isGoodWord
# and the other one is all upper case.
246 if (AuthorType.isGoodWord(first, minAllowedWordLength, logger)
and \
247 AuthorType.isGoodWord(second, minAllowedWordLength, logger))
or \
248 (AuthorType.isGoodWord(first, minAllowedWordLength, logger)
and second.isupper())
or \
249 (first.isupper()
and AuthorType.isGoodWord(second, minAllowedWordLength, logger)):
250 ret = first +
' ' + second
# Presumably the body of extractAuthorName(wordsList, minAllowedWordLength,
# maxAllowedWordLength, logger); the def line, many branch heads, the assignments of
# pos / first / wd and the return are not visible in this listing - TODO confirm.
# The visible fragments try several ways of deriving a name from a single word.
268 for word
in wordsList:
# 1) A word that passes isGoodWord and is not equal to wordsList[0]; it is cleaned with
#    removeNoneAlpha and stripped. The comparison is by value, so a later repetition of
#    the first word is skipped too.
270 if AuthorType.isGoodWord(word, int(minAllowedWordLength))
and word != wordsList[0]:
271 ret = (AuthorType.removeNoneAlpha(word).strip())
272 if logger
is not None:
273 logger.debug(
'Found first word with upper title: ' + str(ret))
# 2) The part of the word before position pos (per the log text, taken from an e-mail
#    address); kept only when its length lies within [minAllowedWordLength,
#    maxAllowedWordLength]. How pos is computed is not visible.
279 AuthorName = word[:pos]
280 if len(AuthorName) >= minAllowedWordLength
and len(AuthorName) <= maxAllowedWordLength:
282 if logger
is not None:
283 logger.debug(
'Found author name in email: ' + str(ret))
286 if logger
is not None:
287 logger.debug(
"Candidate '" + str(AuthorName) +
"' for extract from email didn't pass limits")
# 3) Two words glued together: the word is cut at an index where an upper-case character
#    follows an alphabetic one; both halves must pass isGoodWord. The assignment of
#    `first` is not visible (presumably word[:index]).
290 if len(word) > minAllowedWordLength:
292 for index
in range(0, len(word)):
293 if index > 0
and word[index - 1].isalpha()
and word[index].isupper():
295 second = word[index:]
296 if AuthorType.isGoodWord(first, int(minAllowedWordLength))
and \
297 AuthorType.isGoodWord(second, int(minAllowedWordLength)):
298 ret = first +
' ' + second
299 if logger
is not None:
300 logger.debug(
'Found author name from two concatinated words: ' + str(ret))
303 if logger
is not None:
304 logger.debug(
"Candidate '" + str(word) + \
305 "' for extract from two concatinated words didn't pass validate")
# 4) A word containing '_' (per the log text, a nickname): the first token of the cleaned
#    wd[1] is appended after a space. How wd and the initial ret are built is not visible.
311 if word.find(
'_') > -1:
316 ret += (
' ' + AuthorType.removeNoneAlpha(wd[1]).split()[0])
317 if logger
is not None:
318 logger.debug(
'Found author name from nickname: ' + str(ret))
# Presumably the body of makeParsing(propDict, dataString, logger); the def line, some
# branch heads and the return are not visible in this listing - TODO confirm.
# The 'min_bytes' property doubles as the minimum word length for every helper call.
332 wordsList = dataString.split()
# First attempt: adjacent word pairs without cleaning. NOTE(review): None is passed as
# the logger here, so this call never logs even when a logger was supplied.
335 ret = AuthorType.getPairNames(wordsList, int(propDict[AuthorType.MIN_BYTES_NAME]),
False,
None)
336 if logger
is not None:
337 logger.debug(
'Search author as pair words: ' + str(ret))
# Second attempt, only when nothing was found and 'clean_none_alpha' is truthy.
# NOTE(review): bool() of any non-empty string (e.g. "0") is True - confirm the property
# is always numeric.
339 if ret
is None and bool(propDict[AuthorType.CLEAN_NONE_ALPHA_NAME]):
340 ret = AuthorType.getPairNames(wordsList, int(propDict[AuthorType.MIN_BYTES_NAME]),
True)
341 if logger
is not None:
342 logger.debug(
'Search author as pair words after clean not alpha: ' + str(ret))
# Single-word extraction; the condition guarding this call is not visible (presumably
# `ret is None`).
345 ret = AuthorType.extractAuthorName(wordsList, int(propDict[AuthorType.MIN_BYTES_NAME]),
346 int(propDict[AuthorType.MAX_CHARS_WORD_NAME]), logger)
347 if logger
is not None:
348 logger.debug(
'makeParsing return: ' + str(ret))
# Entry point: checks the input type, merges the properties, tests the data string
# against the limits and runs the extraction, then applies the 'mismatch' and
# 'undetected' policies.
# NOTE(review): original lines 362-364, 378-380, 382, 384, 386, 391-395, 397, 399 and
# 402-404 are missing in this listing, so the `try:`, the branch heads, the bodies of the
# 'empty' / 'ignore' policies and the return are not visible.
361 def parse(confProp, procProp, dataString, logger=None):
365 if logger
is not None:
366 logger.debug(
'input raw data to parse: ' + str(dataString))
# dataString must be a str or a unicode (Python 2 types), otherwise Exception is raised
# with the offending type.
368 if not isinstance(dataString, str)
and not isinstance(dataString, unicode):
369 raise Exception(AuthorType.ERROR_DATA_STRING_TYPE +
' type: ' + str(
type(dataString)))
# Defaults overlaid with confProp and then procProp.
371 propDict = AuthorType.mergeProperties(confProp, procProp)
372 if logger
is not None:
373 logger.debug(
'merged properties: ' + str(propDict))
375 isGood = AuthorType.checkDataStringLimits(propDict, dataString, logger)
376 if logger
is not None:
377 logger.debug(
'isGood: ' + str(bool(isGood)))
# Extraction call (presumably taken when isGood is true). NOTE(review): str() of a
# unicode object with non-ASCII characters raises UnicodeEncodeError in Python 2; the
# 'parse' policy below passes dataString without str().
381 ret = AuthorType.makeParsing(propDict, str(dataString), logger)
# 'mismatch' policy (presumably applied when the limits test failed).
# NOTE(review): AuthorType.VALUE_NAME is not among the constants visible in this listing;
# if it is really undefined, the 'value' policy raises AttributeError - confirm.
383 if propDict[AuthorType.MISMATCH_NAME] == AuthorType.MISMATCH_VALUE_EMPTY:
385 elif propDict[AuthorType.MISMATCH_NAME] == AuthorType.MISMATCH_VALUE_IGNORE:
387 elif propDict[AuthorType.MISMATCH_NAME] == AuthorType.MISMATCH_VALUE_VALUE:
388 ret = propDict[AuthorType.VALUE_NAME]
389 elif propDict[AuthorType.MISMATCH_NAME] == AuthorType.MISMATCH_VALUE_PARSE:
390 ret = AuthorType.makeParsing(propDict, dataString, logger)
# 'undetected' policy (presumably applied when no author was found); the same VALUE_NAME
# remark applies.
396 if propDict[AuthorType.UNDETECTED_NAME] == AuthorType.UNDETECTED_VALUE_EMPTY:
398 elif propDict[AuthorType.UNDETECTED_NAME] == AuthorType.UNDETECTED_VALUE_IGNORE:
400 elif propDict[AuthorType.UNDETECTED_NAME] == AuthorType.UNDETECTED_VALUE_VALUE:
401 ret = propDict[AuthorType.VALUE_NAME]
# Any Exception is caught and only written to the debug log.
# NOTE(review): `except Exception, err` is Python 2 syntax (Python 3 needs `as err`).
405 except Exception, err:
406 if logger
is not None:
407 logger.debug(
'Error: ' + str(err))
def removeNoneAlpha(word)
def getPairNames(wordsList, minAllowedWordLength, cleanNoneAlpha=False, logger=None)
def makeParsing(propDict, dataString, logger=None)
def getDefaultProperties()
def extractAuthorName(wordsList, minAllowedWordLength, maxAllowedWordLength, logger=None)
def mergeProperties(confProp, procProp)
def parse(confProp, procProp, dataString, logger=None)
def isGoodWord(word, minAllowedWordLength, logger=None)
def checkDataStringLimits(propDict, dataString, logger=None)
def __init__(self, confProp=None, procProp=None, dataString=None, logger=None)