2 Created on Sep 11, 2015 6 @link: http://hierarchical-cluster-engine.com/ 7 @copyright: Copyright © 2013-2014 IOIX Ukraine 8 @license: http://hierarchical-cluster-engine.com/license/ 23 ALGO_SIMPLE_SPLITTING = 2
24 ALGO_SNOWBALL_SPLITTING = 3
25 ALGO_SOUNDEX_SPLITTING = 4
# Language detection for incomeBuf (statements belonging to langDetect(incomeBuf, convertToFullName=True)).
# NOTE(review): the statements that open the try block, consult convertToFullName and
# return `ret` are not visible in this span - confirm against the complete function.
# `detect` is imported at the point of use rather than at module level.
35 from langdetect
import detect
# Only the part of the detect() result before the first '-' is kept
# (presumably a two-letter ISO 639-1 code - TODO confirm against langdetect output).
36 langSmallName = detect(incomeBuf).split(
'-')[0]
# Look the short code up in pycountry by its iso639_1_code and keep the lower-cased language name.
39 ret = pycountry.languages.get(iso639_1_code=langSmallName).name.lower()
# Any exception raised above is caught and only reported at debug level.
# NOTE(review): the message says "snowball" although this is the language-detection path;
# it looks copy-pasted from hashCalculateSnowball - confirm before rewording.
42 except Exception
as ecxp:
43 logger.debug(
">>> Some snowball exception = " + str(ecxp))
# Shared tokenisation step (statements belonging to commonSplitMethod(incomeBuf, minWLen)).
# Split incomeBuf on the class-level RE_SPLITTER pattern; re.LOCALE is passed as the regex flag.
# NOTE(review): the statements between this split and the filter below are not visible
# in this span - confirm nothing else transforms `ret` in between.
50 ret = re.split(ContentHashCalculator.RE_SPLITTER, incomeBuf, flags=re.LOCALE)
# Keep only the pieces whose length is at least minWLen.
55 ret = [x
for x
in ret
if len(x) >= minWLen]
def hashCalculateSimple(incomeBuf, minWLen):
    """Hash the words of incomeBuf after the common split step.

    @param incomeBuf: text to hash
    @param minWLen: minimal word length passed on to commonSplitMethod
    @return: md5 hex digest of the concatenated words, or None when the
             split produced no words
    """
    digest = None
    words = ContentHashCalculator.commonSplitMethod(incomeBuf, minWLen)
    if len(words) != 0:
        # Words are concatenated without any separator before hashing.
        joined = ''.join(words)
        digest = hashlib.md5(joined).hexdigest()
    return digest
def hashCalculateSnowball(incomeBuf, minWLen, additionData, stemmerLang):
    """Hash the Snowball-stemmed words of incomeBuf.

    @param incomeBuf: text to hash
    @param minWLen: minimal word length passed on to commonSplitMethod
    @param additionData: accepted for interface compatibility; not used here
    @param stemmerLang: Snowball language name; when None it is taken from
                        langDetect(incomeBuf), falling back to 'english'
    @return: md5 hex digest of the concatenated stemmed words, or None when
             the split produced no words or an error occurred
    """
    ret = None
    try:
        # Imported at the point of use, as in the rest of this class.
        import snowballstemmer

        if stemmerLang is None:
            stemmerLang = ContentHashCalculator.langDetect(incomeBuf)
        if stemmerLang is None:
            # Detection gave no result: fall back to the English stemmer.
            stemmerLang = 'english'

        splittedList = ContentHashCalculator.commonSplitMethod(incomeBuf, minWLen)
        if len(splittedList) > 0:
            stemmer = snowballstemmer.stemmer(stemmerLang)
            for i, word in enumerate(splittedList):
                try:
                    # stemWord() takes a single word. Passing the whole list
                    # (as the previous code did) never stems anything.
                    splittedList[i] = stemmer.stemWord(word)
                except Exception as ecxp:
                    # Best effort: keep the unstemmed word and carry on.
                    logger.debug(">>> Some snowball stemWord exception = " + str(ecxp))
            ret = hashlib.md5(''.join(splittedList)).hexdigest()
    except Exception as ecxp:
        logger.debug(">>> Some snowball exception = " + str(ecxp))
    return ret
# Soundex-based hashing (statements belonging to hashCalculateSoundex(incomeBuf, minWLen)).
# NOTE(review): the enclosing try statements, the body of the inner except and the
# final return are not visible in this span - confirm against the complete function.
# Tokenise with the shared split step; nothing is hashed when no tokens remain.
102 splittedList = ContentHashCalculator.commonSplitMethod(incomeBuf, minWLen)
103 if len(splittedList) > 0:
# One soundex object is obtained and reused for every word.
104 s = soundex.getInstance()
# Replace each word in place with the value returned by s.soundex() for it.
105 for i
in xrange(0, len(splittedList)):
107 splittedList[i] = s.soundex(splittedList[i])
# Presumably guards the single-word conversion above so one failing word does not
# abort the loop - TODO confirm, the handler body is not visible.
108 except Exception
as ecxp:
# Hash the concatenation (no separator) of the converted words.
110 ret = hashlib.md5(
''.
join(splittedList)).hexdigest()
# Any other failure is caught and only reported at debug level.
111 except Exception
as ecxp:
112 logger.debug(
">>> Some soundex exception = " + str(ecxp))
def hashCalculate(incomeBuf, algo, minWLen=3, additionData=None, stemmerLang=None):
    """Calculate a content hash of incomeBuf with the selected algorithm.

    The buffer is lower-cased first, then handed to the routine that
    matches `algo`.

    @param incomeBuf: text to hash
    @param algo: one of the ContentHashCalculator.ALGO_* constants
    @param minWLen: minimal word length for the splitting algorithms
    @param additionData: forwarded to hashCalculateSnowball only
    @param stemmerLang: forwarded to hashCalculateSnowball only
    @return: whatever the selected routine returns (an md5 hex digest for
             ALGO_INCOME_BUF), or None when `algo` matches no known constant
    """
    calc = ContentHashCalculator
    loweredBuf = incomeBuf.lower()

    if algo == calc.ALGO_INCOME_BUF:
        # Hash the lower-cased buffer as a whole, without splitting.
        return hashlib.md5(loweredBuf).hexdigest()
    if algo == calc.ALGO_SIMPLE_SPLITTING:
        return calc.hashCalculateSimple(loweredBuf, minWLen)
    if algo == calc.ALGO_SNOWBALL_SPLITTING:
        return calc.hashCalculateSnowball(loweredBuf, minWLen, additionData, stemmerLang)
    if algo == calc.ALGO_SOUNDEX_SPLITTING:
        return calc.hashCalculateSoundex(loweredBuf, minWLen)
    # No algorithm matched: nothing is hashed.
    return None
def langDetect(incomeBuf, convertToFullName=True)
def hashCalculateSoundex(incomeBuf, minWLen)
def commonSplitMethod(incomeBuf, minWLen)
def hashCalculateSimple(incomeBuf, minWLen)
def hashCalculateSnowball(incomeBuf, minWLen, additionData, stemmerLang)
def hashCalculate(incomeBuf, algo, minWLen=3, additionData=None, stemmerLang=None)