Inheritance diagram for dc_processor.AuthorType.AuthorType:

Collaboration diagram for dc_processor.AuthorType.AuthorType:

Public Member Functions
def	__init__ (self, confProp=None, procProp=None, dataString=None, logger=None)

Static Public Member Functions
def	getDefaultProperties ()

def	mergeProperties (confProp, procProp)

def	checkDataStringLimits (propDict, dataString, logger=None)

def	isGoodWord (word, minAllowedWordLength, logger=None)

def	removeNoneAlpha (word)

def	getPairNames (wordsList, minAllowedWordLength, cleanNoneAlpha=False, logger=None)

def	extractAuthorName (wordsList, minAllowedWordLength, maxAllowedWordLength, logger=None)

def	makeParsing (propDict, dataString, logger=None)

def	parse (confProp, procProp, dataString, logger=None)

Public Attributes
	author

Static Public Attributes
string	MAIN_TAG_NAME = 'author'

string	MIN_WORDS_NAME = 'min_words'

string	MAX_WORDS_NAME = 'max_words'

string	MIN_BYTES_NAME = 'min_bytes'

string	MAX_BYTES_NAME = 'max_bytes'

string	MAX_CHARS_WORD_NAME = 'max_chars_word'

string	CLEAN_NONE_ALPHA_NAME = 'clean_none_alpha'

string	VALUE_NAME = 'value'

string	MISMATCH_NAME = 'mismatch'

string	UNDETECTED_NAME = 'undetected'

string	MISMATCH_VALUE_EMPTY = 'empty'

string	MISMATCH_VALUE_IGNORE = 'ignore'

string	MISMATCH_VALUE_VALUE = 'value'

string	MISMATCH_VALUE_PARSE = 'parse'

string	UNDETECTED_VALUE_EMPTY = 'empty'

string	UNDETECTED_VALUE_IGNORE = 'ignore'

string	UNDETECTED_VALUE_VALUE = 'value'

int	MIN_WORDS_DEFAULT_VALUE = 1

int	MAX_WORDS_DEFAULT_VALUE = 8

int	MIN_BYTES_DEFAULT_VALUE = 3

int	MAX_BYTES_DEFAULT_VALUE = 32

int	MAX_CHARS_WORD_DEFAULT_VALUE = MAX_BYTES_DEFAULT_VALUE

int	CLEAN_NONE_ALPHA_DEFAULT_VALUE = 1

string	MISMATCH_DEFAULT_VALUE = MISMATCH_VALUE_EMPTY

string	UNDETECTED_DEFAULT_VALUE = UNDETECTED_VALUE_EMPTY

string	ERROR_DATA_STRING_TYPE = 'Data string is not string.'

string	ERROR_CONFIG_PROPERTY_TYPE = 'Config property type is wrong'

string	ERROR_PROCESSOR_PROPERTY_TYPE = 'Processor property type is wrong'

string	ERROR_MAIN_TAG_NAME = "Main tag name '" + str(MAIN_TAG_NAME) + "' not found"

Detailed Description

Definition at line 20 of file AuthorType.py.

Constructor & Destructor Documentation

◆ __init__()

def dc_processor.AuthorType.AuthorType.__init__	(	self,
		confProp = `None`,
		procProp = `None`,
		dataString = `None`,
		logger = `None`
	)

Definition at line 66 of file AuthorType.py.

   def __init__(self, confProp=None, procProp=None, dataString=None, logger=None):
     self.author = AuthorType.parse(confProp, procProp, dataString, logger)

Member Function Documentation

◆ checkDataStringLimits()

def dc_processor.AuthorType.AuthorType.checkDataStringLimits	(	propDict,
		dataString,
		logger = `None`
	)

static

Definition at line 145 of file AuthorType.py.

   def checkDataStringLimits(propDict, dataString, logger=None):
     # variable for result
     ret = False
 
     bytesCount = len(dataString)
     wordsCount = 0
     for word in dataString.split():
       if len(word) >= int(propDict[AuthorType.MIN_BYTES_NAME]):
         wordsCount += 1
 
     if logger is not None:
       logger.debug('bytesCount = ' + str(bytesCount))
       logger.debug('wordsCount = ' + str(wordsCount))
 
     # check limits
     if bytesCount >= int(propDict[AuthorType.MIN_BYTES_NAME]) and \
     bytesCount <= int(propDict[AuthorType.MAX_BYTES_NAME]) and \
     wordsCount >= int(propDict[AuthorType.MIN_WORDS_NAME]) and \
     wordsCount <= int(propDict[AuthorType.MAX_WORDS_NAME]):
       ret = True
 
     return ret
 
 

◆ extractAuthorName()

def dc_processor.AuthorType.AuthorType.extractAuthorName	(	wordsList,
		minAllowedWordLength,
		maxAllowedWordLength,
		logger = `None`
	)

static

Definition at line 264 of file AuthorType.py.

   def extractAuthorName(wordsList, minAllowedWordLength, maxAllowedWordLength, logger=None):
     # variable for result
     ret = None
 
     for word in wordsList:
       # first word with upper title
       if AuthorType.isGoodWord(word, int(minAllowedWordLength)) and word != wordsList[0]:
         ret = (AuthorType.removeNoneAlpha(word).strip())
         if logger is not None:
           logger.debug('Found first word with upper title: ' + str(ret))
         break
 
       # extract from email
       pos = word.find('@')
       if pos > -1:
         AuthorName = word[:pos]
         if len(AuthorName) >= minAllowedWordLength and len(AuthorName) <= maxAllowedWordLength:
           ret = AuthorName
           if logger is not None:
             logger.debug('Found author name in email: ' + str(ret))
           break
         else:
           if logger is not None:
             logger.debug("Candidate '" + str(AuthorName) + "' for extract from email didn't pass limits")
 
       # search two words was concatenated
       if len(word) > minAllowedWordLength:
         found = False
         for index in range(0, len(word)):
           if index > 0 and word[index - 1].isalpha() and word[index].isupper():
             first = word[:index]
             second = word[index:]
             if AuthorType.isGoodWord(first, int(minAllowedWordLength)) and \
             AuthorType.isGoodWord(second, int(minAllowedWordLength)):
               ret = first + ' ' + second
               if logger is not None:
                 logger.debug('Found author name from two concatinated words: ' + str(ret))
               found = True
             else:
               if logger is not None:
                 logger.debug("Candidate '" + str(word) + \
                               "' for extract from two concatinated words didn't pass validate")
             break
         if found:
           break
 
       # search nickname
       if word.find('_') > -1:
         wd = word.split('_')
         if len(wd) > 0:
           ret = wd[0]
           if len(wd) > 1:
             ret += (' ' + AuthorType.removeNoneAlpha(wd[1]).split()[0])
             if logger is not None:
               logger.debug('Found author name from nickname: ' + str(ret))
             break
 
     return ret
 
 

◆ getDefaultProperties()

def dc_processor.AuthorType.AuthorType.getDefaultProperties ( )

static

Definition at line 75 of file AuthorType.py.

   def getDefaultProperties():
     # variable for result
     propDict = {}
     # initialization use default values
     propDict[AuthorType.MIN_WORDS_NAME] = AuthorType.MIN_WORDS_DEFAULT_VALUE
     propDict[AuthorType.MAX_WORDS_NAME] = AuthorType.MAX_WORDS_DEFAULT_VALUE
     propDict[AuthorType.MIN_BYTES_NAME] = AuthorType.MIN_BYTES_DEFAULT_VALUE
     propDict[AuthorType.MAX_BYTES_NAME] = AuthorType.MAX_BYTES_DEFAULT_VALUE
     propDict[AuthorType.MAX_CHARS_WORD_NAME] = AuthorType.MAX_CHARS_WORD_DEFAULT_VALUE
     propDict[AuthorType.CLEAN_NONE_ALPHA_NAME] = AuthorType.CLEAN_NONE_ALPHA_DEFAULT_VALUE
     propDict[AuthorType.MISMATCH_NAME] = AuthorType.MISMATCH_DEFAULT_VALUE
     propDict[AuthorType.UNDETECTED_NAME] = AuthorType.UNDETECTED_DEFAULT_VALUE
 
     return propDict
 
 

◆ getPairNames()

def dc_processor.AuthorType.AuthorType.getPairNames	(	wordsList,
		minAllowedWordLength,
		cleanNoneAlpha = `False`,
		logger = `None`
	)

static

Definition at line 218 of file AuthorType.py.

   def getPairNames(wordsList, minAllowedWordLength, cleanNoneAlpha=False, logger=None):
     # variable for result
     ret = None
     first = second = ''
     for index in range(0, len(wordsList)):
       if index < len(wordsList) - 1:
         if logger is not None:
           logger.debug('cleanNoneAlpha: ' + str(cleanNoneAlpha))
 
         if cleanNoneAlpha:
           first = AuthorType.removeNoneAlpha(wordsList[index])
           second = AuthorType.removeNoneAlpha(wordsList[index + 1])
 
           firstList = first.split()
           if len(firstList) > 0:
             first = firstList[-1]
 
           secondList = second.split()
           if len(secondList) > 0:
             second = secondList[0]
 
         else:
           first = wordsList[index]
           second = wordsList[index + 1]
 
         if logger is not None:
           logger.debug('first: ' + str(first) + ' second: ' + str(second))
 
         if (AuthorType.isGoodWord(first, minAllowedWordLength, logger) and \
             AuthorType.isGoodWord(second, minAllowedWordLength, logger)) or \
             (AuthorType.isGoodWord(first, minAllowedWordLength, logger) and second.isupper()) or \
             (first.isupper() and AuthorType.isGoodWord(second, minAllowedWordLength, logger)):
           ret = first + ' ' + second
           break
 
     return ret
 
 

◆ isGoodWord()

def dc_processor.AuthorType.AuthorType.isGoodWord	(	word,
		minAllowedWordLength,
		logger = `None`
	)

static

Definition at line 176 of file AuthorType.py.

   def isGoodWord(word, minAllowedWordLength, logger=None):
 
     if logger is not None:
       logger.debug('word: ' + str(word) + ' minAllowedWordLength = ' + str(minAllowedWordLength))
       logger.debug('word.istitle(): ' + str(bool(unicode(word, 'utf-8').istitle())))
 
     # variable for result
     ret = False
     if len(word) >= minAllowedWordLength:
       if unicode(word, 'utf-8').istitle():
         ret = True
 
     if logger is not None:
       logger.debug('ret = ' + str(ret))
 
     return ret
 
 

◆ makeParsing()

def dc_processor.AuthorType.AuthorType.makeParsing	(	propDict,
		dataString,
		logger = `None`
	)

static

Definition at line 331 of file AuthorType.py.

   def makeParsing(propDict, dataString, logger=None):
     wordsList = dataString.split()
 
     # Search pair: name, surname
     ret = AuthorType.getPairNames(wordsList, int(propDict[AuthorType.MIN_BYTES_NAME]), False, None)
     if logger is not None:
       logger.debug('Search author as pair words: ' + str(ret))
 
     if ret is None and bool(propDict[AuthorType.CLEAN_NONE_ALPHA_NAME]):
       ret = AuthorType.getPairNames(wordsList, int(propDict[AuthorType.MIN_BYTES_NAME]), True)
       if logger is not None:
         logger.debug('Search author as pair words after clean not alpha: ' + str(ret))
 
     if ret is None:
       ret = AuthorType.extractAuthorName(wordsList, int(propDict[AuthorType.MIN_BYTES_NAME]),
                                          int(propDict[AuthorType.MAX_CHARS_WORD_NAME]), logger)
       if logger is not None:
         logger.debug('makeParsing return: ' + str(ret))
 
     return ret
 
 

◆ mergeProperties()

def dc_processor.AuthorType.AuthorType.mergeProperties	(	confProp,
		procProp
	)

static

Definition at line 96 of file AuthorType.py.

   def mergeProperties(confProp, procProp):
 
     if confProp is not None and not (isinstance(confProp, str) or isinstance(confProp, unicode) or\
                                      isinstance(confProp, dict)):
       raise Exception(AuthorType.ERROR_CONFIG_PROPERTY_TYPE + ': ' + str(type(confProp)))
 
     if procProp is not None and not (isinstance(procProp, str) or isinstance(procProp, unicode) or\
                                      isinstance(procProp, dict)):
       raise Exception(AuthorType.ERROR_PROCESSOR_PROPERTY_TYPE + ': ' + str(type(procProp)))
 
     # variable for result
     propDict = AuthorType.getDefaultProperties()
 
     # update variables from config file
     confPropDict = {}
     if confProp is not None:
       if not isinstance(confProp, dict):
         confPropDict = json.loads(confProp)
       else:
         confPropDict = confProp
 
       if not confPropDict.has_key(AuthorType.MAIN_TAG_NAME):
         raise Exception(AuthorType.ERROR_MAIN_TAG_NAME)
 
       propDict.update(confPropDict[AuthorType.MAIN_TAG_NAME])
 
     # update variables from PROCESSOR_PROPERTIES
     procPropDict = {}
     if procProp is not None:
       if not isinstance(procProp, dict):
         procPropDict = json.loads(procProp)
       else:
         procPropDict = procProp
 
       if not procPropDict.has_key(AuthorType.MAIN_TAG_NAME):
         raise Exception(AuthorType.ERROR_MAIN_TAG_NAME)
 
       propDict.update(procPropDict[AuthorType.MAIN_TAG_NAME])
 
     return propDict
 
 

◆ parse()

def dc_processor.AuthorType.AuthorType.parse	(	confProp,
		procProp,
		dataString,
		logger = `None`
	)

static

Definition at line 361 of file AuthorType.py.

   def parse(confProp, procProp, dataString, logger=None):
     # variable for result
     ret = None
     try:
       if logger is not None:
         logger.debug('input raw data to parse: ' + str(dataString))
 
       if not isinstance(dataString, str) and not isinstance(dataString, unicode):
         raise Exception(AuthorType.ERROR_DATA_STRING_TYPE + ' type: ' + str(type(dataString)))
 
       propDict = AuthorType.mergeProperties(confProp, procProp)
       if logger is not None:
         logger.debug('merged properties: ' + str(propDict))
 
       isGood = AuthorType.checkDataStringLimits(propDict, dataString, logger)
       if logger is not None:
         logger.debug('isGood: ' + str(bool(isGood)))
 
       # check mismatch
       if isGood:
         ret = AuthorType.makeParsing(propDict, str(dataString), logger)
       else:
         if propDict[AuthorType.MISMATCH_NAME] == AuthorType.MISMATCH_VALUE_EMPTY:
           ret = ''
         elif propDict[AuthorType.MISMATCH_NAME] == AuthorType.MISMATCH_VALUE_IGNORE:
           ret = dataString
         elif propDict[AuthorType.MISMATCH_NAME] == AuthorType.MISMATCH_VALUE_VALUE:
           ret = propDict[AuthorType.VALUE_NAME]
         elif propDict[AuthorType.MISMATCH_NAME] == AuthorType.MISMATCH_VALUE_PARSE:
           ret = AuthorType.makeParsing(propDict, dataString, logger)
         else:
           ret = ''
 
       # check undetected
       if ret is None:
         if propDict[AuthorType.UNDETECTED_NAME] == AuthorType.UNDETECTED_VALUE_EMPTY:
           ret = ''
         elif propDict[AuthorType.UNDETECTED_NAME] == AuthorType.UNDETECTED_VALUE_IGNORE:
           ret = dataString
         elif propDict[AuthorType.UNDETECTED_NAME] == AuthorType.UNDETECTED_VALUE_VALUE:
           ret = propDict[AuthorType.VALUE_NAME]
         else:
           ret = ''
 
     except Exception, err:
       if logger is not None:
         logger.debug('Error: ' + str(err))
 
     return ret

◆ removeNoneAlpha()

def dc_processor.AuthorType.AuthorType.removeNoneAlpha ( word )

static

Definition at line 199 of file AuthorType.py.

   def removeNoneAlpha(word):
     wd = []
     for s in word:
       if s.isalpha():
         wd.append(s)
       else:
         wd.append(' ')
 
     return ''.join(wd)
 
 

Member Data Documentation

◆ author

dc_processor.AuthorType.AuthorType.author

Definition at line 67 of file AuthorType.py.

◆ CLEAN_NONE_ALPHA_DEFAULT_VALUE

int dc_processor.AuthorType.AuthorType.CLEAN_NONE_ALPHA_DEFAULT_VALUE = 1

static

Definition at line 49 of file AuthorType.py.

◆ CLEAN_NONE_ALPHA_NAME

string dc_processor.AuthorType.AuthorType.CLEAN_NONE_ALPHA_NAME = 'clean_none_alpha'

static

Definition at line 30 of file AuthorType.py.

◆ ERROR_CONFIG_PROPERTY_TYPE

string dc_processor.AuthorType.AuthorType.ERROR_CONFIG_PROPERTY_TYPE = 'Config property type is wrong'

static

Definition at line 56 of file AuthorType.py.

◆ ERROR_DATA_STRING_TYPE

string dc_processor.AuthorType.AuthorType.ERROR_DATA_STRING_TYPE = 'Data string is not string.'

static

Definition at line 55 of file AuthorType.py.

◆ ERROR_MAIN_TAG_NAME

string dc_processor.AuthorType.AuthorType.ERROR_MAIN_TAG_NAME = "Main tag name '" + str(MAIN_TAG_NAME) + "' not found"

static

Definition at line 58 of file AuthorType.py.

◆ ERROR_PROCESSOR_PROPERTY_TYPE

string dc_processor.AuthorType.AuthorType.ERROR_PROCESSOR_PROPERTY_TYPE = 'Processor property type is wrong'

static

Definition at line 57 of file AuthorType.py.

◆ MAIN_TAG_NAME

string dc_processor.AuthorType.AuthorType.MAIN_TAG_NAME = 'author'

static

Definition at line 23 of file AuthorType.py.

◆ MAX_BYTES_DEFAULT_VALUE

int dc_processor.AuthorType.AuthorType.MAX_BYTES_DEFAULT_VALUE = 32

static

Definition at line 47 of file AuthorType.py.

◆ MAX_BYTES_NAME

string dc_processor.AuthorType.AuthorType.MAX_BYTES_NAME = 'max_bytes'

static

Definition at line 28 of file AuthorType.py.

◆ MAX_CHARS_WORD_DEFAULT_VALUE

int dc_processor.AuthorType.AuthorType.MAX_CHARS_WORD_DEFAULT_VALUE = MAX_BYTES_DEFAULT_VALUE

static

Definition at line 48 of file AuthorType.py.

◆ MAX_CHARS_WORD_NAME

string dc_processor.AuthorType.AuthorType.MAX_CHARS_WORD_NAME = 'max_chars_word'

static

Definition at line 29 of file AuthorType.py.

◆ MAX_WORDS_DEFAULT_VALUE

int dc_processor.AuthorType.AuthorType.MAX_WORDS_DEFAULT_VALUE = 8

static

Definition at line 45 of file AuthorType.py.

◆ MAX_WORDS_NAME

string dc_processor.AuthorType.AuthorType.MAX_WORDS_NAME = 'max_words'

static

Definition at line 26 of file AuthorType.py.

◆ MIN_BYTES_DEFAULT_VALUE

int dc_processor.AuthorType.AuthorType.MIN_BYTES_DEFAULT_VALUE = 3

static

Definition at line 46 of file AuthorType.py.

◆ MIN_BYTES_NAME

string dc_processor.AuthorType.AuthorType.MIN_BYTES_NAME = 'min_bytes'

static

Definition at line 27 of file AuthorType.py.

◆ MIN_WORDS_DEFAULT_VALUE

int dc_processor.AuthorType.AuthorType.MIN_WORDS_DEFAULT_VALUE = 1

static

Definition at line 44 of file AuthorType.py.

◆ MIN_WORDS_NAME

string dc_processor.AuthorType.AuthorType.MIN_WORDS_NAME = 'min_words'

static

Definition at line 25 of file AuthorType.py.

◆ MISMATCH_DEFAULT_VALUE

string dc_processor.AuthorType.AuthorType.MISMATCH_DEFAULT_VALUE = MISMATCH_VALUE_EMPTY

static

Definition at line 50 of file AuthorType.py.

◆ MISMATCH_NAME

string dc_processor.AuthorType.AuthorType.MISMATCH_NAME = 'mismatch'

static

Definition at line 32 of file AuthorType.py.

◆ MISMATCH_VALUE_EMPTY

string dc_processor.AuthorType.AuthorType.MISMATCH_VALUE_EMPTY = 'empty'

static

Definition at line 35 of file AuthorType.py.

◆ MISMATCH_VALUE_IGNORE

string dc_processor.AuthorType.AuthorType.MISMATCH_VALUE_IGNORE = 'ignore'

static

Definition at line 36 of file AuthorType.py.

◆ MISMATCH_VALUE_PARSE

string dc_processor.AuthorType.AuthorType.MISMATCH_VALUE_PARSE = 'parse'

static

Definition at line 38 of file AuthorType.py.

◆ MISMATCH_VALUE_VALUE

string dc_processor.AuthorType.AuthorType.MISMATCH_VALUE_VALUE = 'value'

static

Definition at line 37 of file AuthorType.py.

◆ UNDETECTED_DEFAULT_VALUE

string dc_processor.AuthorType.AuthorType.UNDETECTED_DEFAULT_VALUE = UNDETECTED_VALUE_EMPTY

static

Definition at line 51 of file AuthorType.py.

◆ UNDETECTED_NAME

string dc_processor.AuthorType.AuthorType.UNDETECTED_NAME = 'undetected'

static

Definition at line 33 of file AuthorType.py.

◆ UNDETECTED_VALUE_EMPTY

string dc_processor.AuthorType.AuthorType.UNDETECTED_VALUE_EMPTY = 'empty'

static

Definition at line 40 of file AuthorType.py.

◆ UNDETECTED_VALUE_IGNORE

string dc_processor.AuthorType.AuthorType.UNDETECTED_VALUE_IGNORE = 'ignore'

static

Definition at line 41 of file AuthorType.py.

◆ UNDETECTED_VALUE_VALUE

string dc_processor.AuthorType.AuthorType.UNDETECTED_VALUE_VALUE = 'value'

static

Definition at line 42 of file AuthorType.py.

◆ VALUE_NAME

string dc_processor.AuthorType.AuthorType.VALUE_NAME = 'value'

static

Definition at line 31 of file AuthorType.py.

The documentation for this class was generated from the following file:

sources/hce/dc_processor/AuthorType.py

Public Member Functions

Static Public Member Functions

Public Attributes

Static Public Attributes

Detailed Description

Constructor & Destructor Documentation

◆ __init__()

Member Function Documentation

◆ checkDataStringLimits()

◆ extractAuthorName()

◆ getDefaultProperties()

◆ getPairNames()

◆ isGoodWord()

◆ makeParsing()

◆ mergeProperties()

◆ parse()

◆ removeNoneAlpha()

Member Data Documentation

◆ author

◆ CLEAN_NONE_ALPHA_DEFAULT_VALUE

◆ CLEAN_NONE_ALPHA_NAME

◆ ERROR_CONFIG_PROPERTY_TYPE

◆ ERROR_DATA_STRING_TYPE

◆ ERROR_MAIN_TAG_NAME

◆ ERROR_PROCESSOR_PROPERTY_TYPE

◆ MAIN_TAG_NAME

◆ MAX_BYTES_DEFAULT_VALUE

◆ MAX_BYTES_NAME

◆ MAX_CHARS_WORD_DEFAULT_VALUE

◆ MAX_CHARS_WORD_NAME

◆ MAX_WORDS_DEFAULT_VALUE

◆ MAX_WORDS_NAME

◆ MIN_BYTES_DEFAULT_VALUE

◆ MIN_BYTES_NAME

◆ MIN_WORDS_DEFAULT_VALUE

◆ MIN_WORDS_NAME

◆ MISMATCH_DEFAULT_VALUE

◆ MISMATCH_NAME

◆ MISMATCH_VALUE_EMPTY

◆ MISMATCH_VALUE_IGNORE

◆ MISMATCH_VALUE_PARSE

◆ MISMATCH_VALUE_VALUE

◆ UNDETECTED_DEFAULT_VALUE

◆ UNDETECTED_NAME

◆ UNDETECTED_VALUE_EMPTY

◆ UNDETECTED_VALUE_IGNORE

◆ UNDETECTED_VALUE_VALUE

◆ VALUE_NAME

◆ __init__()