def langDetect (incomeBuf, convertToFullName=True)
def commonSplitMethod (incomeBuf, minWLen)
def hashCalculateSimple (incomeBuf, minWLen)
def hashCalculateSnowball (incomeBuf, minWLen, additionData, stemmerLang)
def hashCalculateSoundex (incomeBuf, minWLen)
def hashCalculate (incomeBuf, algo, minWLen=3, additionData=None, stemmerLang=None)
Definition at line 20 of file ContentHashCalculator.py.
◆ commonSplitMethod()
def app.ContentHashCalculator.ContentHashCalculator.commonSplitMethod(incomeBuf, minWLen) [static]
Definition at line 49 of file ContentHashCalculator.py.
49 def commonSplitMethod(incomeBuf, minWLen):
50 ret = re.split(ContentHashCalculator.RE_SPLITTER, incomeBuf, flags=re.LOCALE)
55 ret = [x for x in ret if len(x) >= minWLen]
◆ hashCalculate()
def app.ContentHashCalculator.ContentHashCalculator.hashCalculate(incomeBuf, algo, minWLen=3, additionData=None, stemmerLang=None) [static]
Definition at line 118 of file ContentHashCalculator.py.
118 def hashCalculate(incomeBuf, algo, minWLen=3, additionData=None, stemmerLang=None):
120 incomeBuf = incomeBuf.lower()
121 if algo == ContentHashCalculator.ALGO_INCOME_BUF:
122 ret = hashlib.md5(incomeBuf).hexdigest()
123 elif algo == ContentHashCalculator.ALGO_SIMPLE_SPLITTING:
124 ret = ContentHashCalculator.hashCalculateSimple(incomeBuf, minWLen)
125 elif algo == ContentHashCalculator.ALGO_SNOWBALL_SPLITTING:
126 ret = ContentHashCalculator.hashCalculateSnowball(incomeBuf, minWLen, additionData, stemmerLang)
127 elif algo == ContentHashCalculator.ALGO_SOUNDEX_SPLITTING:
128 ret = ContentHashCalculator.hashCalculateSoundex(incomeBuf, minWLen)
◆ hashCalculateSimple()
def app.ContentHashCalculator.ContentHashCalculator.hashCalculateSimple(incomeBuf, minWLen) [static]
Definition at line 61 of file ContentHashCalculator.py.
61 def hashCalculateSimple(incomeBuf, minWLen):
64 splittedList = ContentHashCalculator.commonSplitMethod(incomeBuf, minWLen)
65 if len(splittedList) > 0:
66 ret = hashlib.md5(''.join(splittedList)).hexdigest()
◆ hashCalculateSnowball()
def app.ContentHashCalculator.ContentHashCalculator.hashCalculateSnowball(incomeBuf, minWLen, additionData, stemmerLang) [static]
Definition at line 72 of file ContentHashCalculator.py.
72 def hashCalculateSnowball(incomeBuf, minWLen, additionData, stemmerLang):
75 import snowballstemmer
76 if stemmerLang is None:
77 stemmerLang = ContentHashCalculator.langDetect(incomeBuf)
78 if stemmerLang is None:
79 stemmerLang = 'english'
82 splittedList = ContentHashCalculator.commonSplitMethod(incomeBuf, minWLen)
83 if len(splittedList) > 0:
84 stemmer = snowballstemmer.stemmer(stemmerLang)
85 for i in xrange(0, len(splittedList)):
87 splittedList[i] = stemmer.stemWord(splittedList)
88 except Exception as ecxp:
90 ret = hashlib.md5(''.join(splittedList)).hexdigest()
91 except Exception as ecxp:
92 logger.debug(">>> Some snowball exception = " + str(ecxp))
◆ hashCalculateSoundex()
def app.ContentHashCalculator.ContentHashCalculator.hashCalculateSoundex(incomeBuf, minWLen) [static]
Definition at line 98 of file ContentHashCalculator.py.
98 def hashCalculateSoundex(incomeBuf, minWLen):
102 splittedList = ContentHashCalculator.commonSplitMethod(incomeBuf, minWLen)
103 if len(splittedList) > 0:
104 s = soundex.getInstance()
105 for i in xrange(0, len(splittedList)):
107 splittedList[i] = s.soundex(splittedList[i])
108 except Exception as ecxp:
110 ret = hashlib.md5(''.join(splittedList)).hexdigest()
111 except Exception as ecxp:
112 logger.debug(">>> Some soundex exception = " + str(ecxp))
◆ langDetect()
def app.ContentHashCalculator.ContentHashCalculator.langDetect(incomeBuf, convertToFullName=True) [static]
Definition at line 31 of file ContentHashCalculator.py.
31 def langDetect(incomeBuf, convertToFullName=True):
35 from langdetect import detect
36 langSmallName = detect(incomeBuf).split('-')[0]
39 ret = pycountry.languages.get(iso639_1_code=langSmallName).name.lower()
42 except Exception as ecxp:
43 logger.debug(">>> Some snowball exception = " + str(ecxp))
◆ ALGO_INCOME_BUF
int app.ContentHashCalculator.ContentHashCalculator.ALGO_INCOME_BUF = 1 [static]
◆ ALGO_SIMPLE_SPLITTING
int app.ContentHashCalculator.ContentHashCalculator.ALGO_SIMPLE_SPLITTING = 2 [static]
◆ ALGO_SNOWBALL_SPLITTING
int app.ContentHashCalculator.ContentHashCalculator.ALGO_SNOWBALL_SPLITTING = 3 [static]
◆ ALGO_SOUNDEX_SPLITTING
int app.ContentHashCalculator.ContentHashCalculator.ALGO_SOUNDEX_SPLITTING = 4 [static]
◆ RE_SPLITTER
string app.ContentHashCalculator.ContentHashCalculator.RE_SPLITTER = r'\s' [static]
The documentation for this class was generated from the following file: