HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
app.ContentHashCalculator.ContentHashCalculator Class Reference
Inheritance diagram for app.ContentHashCalculator.ContentHashCalculator:
Collaboration diagram for app.ContentHashCalculator.ContentHashCalculator:

Static Public Member Functions

def langDetect (incomeBuf, convertToFullName=True)
 
def commonSplitMethod (incomeBuf, minWLen)
 
def hashCalculateSimple (incomeBuf, minWLen)
 
def hashCalculateSnowball (incomeBuf, minWLen, additionData, stemmerLang)
 
def hashCalculateSoundex (incomeBuf, minWLen)
 
def hashCalculate (incomeBuf, algo, minWLen=3, additionData=None, stemmerLang=None)
 

Static Public Attributes

int ALGO_INCOME_BUF = 1
 
int ALGO_SIMPLE_SPLITTING = 2
 
int ALGO_SNOWBALL_SPLITTING = 3
 
int ALGO_SOUNDEX_SPLITTING = 4
 
string RE_SPLITTER = r'\s'
 

Detailed Description

Definition at line 20 of file ContentHashCalculator.py.

Member Function Documentation

◆ commonSplitMethod()

def app.ContentHashCalculator.ContentHashCalculator.commonSplitMethod (   incomeBuf,
  minWLen 
)
static

Definition at line 49 of file ContentHashCalculator.py.

def commonSplitMethod(incomeBuf, minWLen):
    """Split a text buffer into a sorted list of unique tokens.

    The buffer is split on every match of ContentHashCalculator.RE_SPLITTER,
    duplicate tokens are removed, the survivors are sorted, and any token
    shorter than minWLen characters is dropped.

    Args:
        incomeBuf: text to split.
        minWLen: minimum token length to keep.

    Returns:
        A list of unique tokens in sorted order (possibly empty).
    """
    tokens = re.split(ContentHashCalculator.RE_SPLITTER, incomeBuf, flags=re.LOCALE)
    if not tokens:
        return tokens

    # De-duplicate first, then sort, then apply the length filter.
    return [token for token in sorted(set(tokens)) if len(token) >= minWLen]
58 
59 

◆ hashCalculate()

def app.ContentHashCalculator.ContentHashCalculator.hashCalculate (   incomeBuf,
  algo,
  minWLen = 3,
  additionData = None,
  stemmerLang = None 
)
static

Definition at line 118 of file ContentHashCalculator.py.

def hashCalculate(incomeBuf, algo, minWLen=3, additionData=None, stemmerLang=None):
    """Compute a content hash of incomeBuf with the selected algorithm.

    The buffer is lower-cased before any algorithm is applied.

    Args:
        incomeBuf: text to hash.
        algo: one of the ContentHashCalculator.ALGO_* constants.
        minWLen: minimum token length passed to the splitting algorithms.
        additionData: forwarded to hashCalculateSnowball.
        stemmerLang: stemmer language forwarded to hashCalculateSnowball.

    Returns:
        The value produced by the selected algorithm, or None when algo
        matches none of the ALGO_* constants.
    """
    loweredBuf = incomeBuf.lower()

    if algo == ContentHashCalculator.ALGO_INCOME_BUF:
        # Hash the whole lower-cased buffer as is, without tokenizing.
        return hashlib.md5(loweredBuf).hexdigest()
    if algo == ContentHashCalculator.ALGO_SIMPLE_SPLITTING:
        return ContentHashCalculator.hashCalculateSimple(loweredBuf, minWLen)
    if algo == ContentHashCalculator.ALGO_SNOWBALL_SPLITTING:
        return ContentHashCalculator.hashCalculateSnowball(loweredBuf, minWLen, additionData, stemmerLang)
    if algo == ContentHashCalculator.ALGO_SOUNDEX_SPLITTING:
        return ContentHashCalculator.hashCalculateSoundex(loweredBuf, minWLen)

    return None
131 
132 

◆ hashCalculateSimple()

def app.ContentHashCalculator.ContentHashCalculator.hashCalculateSimple (   incomeBuf,
  minWLen 
)
static

Definition at line 61 of file ContentHashCalculator.py.

def hashCalculateSimple(incomeBuf, minWLen):
    """Hash the concatenation of the buffer's unique, sorted tokens.

    Args:
        incomeBuf: text to hash.
        minWLen: minimum token length passed to commonSplitMethod.

    Returns:
        The MD5 hex digest of the joined tokens, or None when
        commonSplitMethod yields no tokens.
    """
    words = ContentHashCalculator.commonSplitMethod(incomeBuf, minWLen)
    if not words:
        return None

    return hashlib.md5(''.join(words)).hexdigest()
69 
70 
Definition: join.py:1

◆ hashCalculateSnowball()

def app.ContentHashCalculator.ContentHashCalculator.hashCalculateSnowball (   incomeBuf,
  minWLen,
  additionData,
  stemmerLang 
)
static

Definition at line 72 of file ContentHashCalculator.py.

def hashCalculateSnowball(incomeBuf, minWLen, additionData, stemmerLang):  # pylint: disable=W0613
    """Hash the buffer's tokens after Snowball stemming.

    The buffer is tokenized with commonSplitMethod, each token is replaced by
    its stem, and the MD5 hex digest of the concatenated stems is returned.

    Args:
        incomeBuf: text to hash.
        minWLen: minimum token length passed to commonSplitMethod.
        additionData: currently unused; kept for interface compatibility.
        stemmerLang: stemmer language name. When None, the language is
            detected with langDetect; if detection also yields None,
            'english' is used.

    Returns:
        The MD5 hex digest of the joined stems, or None when there are no
        tokens or when any step outside the per-word stemming fails (the
        failure is logged at debug level).
    """
    ret = None
    try:
        import snowballstemmer
        if stemmerLang is None:
            stemmerLang = ContentHashCalculator.langDetect(incomeBuf)
        if stemmerLang is None:
            stemmerLang = 'english'

        splittedList = ContentHashCalculator.commonSplitMethod(incomeBuf, minWLen)
        if len(splittedList) > 0:
            stemmer = snowballstemmer.stemmer(stemmerLang)
            for i, word in enumerate(splittedList):
                try:
                    # Stem the current word only; passing the whole list here
                    # was the original defect.
                    splittedList[i] = stemmer.stemWord(word)
                except Exception:
                    # Best effort: a word that cannot be stemmed contributes
                    # nothing to the hash instead of aborting the whole run.
                    splittedList[i] = ""
            ret = hashlib.md5(''.join(splittedList)).hexdigest()
    except Exception as ecxp:
        logger.debug(">>> Some snowball exception = " + str(ecxp))

    return ret
95 
96 
Definition: join.py:1

◆ hashCalculateSoundex()

def app.ContentHashCalculator.ContentHashCalculator.hashCalculateSoundex (   incomeBuf,
  minWLen 
)
static

Definition at line 98 of file ContentHashCalculator.py.

def hashCalculateSoundex(incomeBuf, minWLen):
    """Hash the buffer's tokens after Soundex encoding.

    The buffer is tokenized with commonSplitMethod, each token is replaced by
    its Soundex code, and the MD5 hex digest of the concatenated codes is
    returned.

    Args:
        incomeBuf: text to hash.
        minWLen: minimum token length passed to commonSplitMethod.

    Returns:
        The MD5 hex digest of the joined codes, or None when there are no
        tokens or when any step outside the per-word encoding fails (the
        failure is logged at debug level).
    """
    digest = None
    try:
        import soundex
        words = ContentHashCalculator.commonSplitMethod(incomeBuf, minWLen)
        if words:
            encoder = soundex.getInstance()
            for pos, word in enumerate(words):
                try:
                    code = encoder.soundex(word)
                except Exception:
                    # A word that cannot be encoded contributes an empty string.
                    code = ""
                words[pos] = code
            digest = hashlib.md5(''.join(words)).hexdigest()
    except Exception as ecxp:
        logger.debug(">>> Some soundex exception = " + str(ecxp))

    return digest
115 
116 
Definition: join.py:1

◆ langDetect()

def app.ContentHashCalculator.ContentHashCalculator.langDetect (   incomeBuf,
  convertToFullName = True 
)
static

Definition at line 31 of file ContentHashCalculator.py.

def langDetect(incomeBuf, convertToFullName=True):
    """Detect the language of a text buffer.

    Args:
        incomeBuf: text whose language should be detected.
        convertToFullName: when True, the detected short code is converted to
            the lower-cased language name via pycountry; when False, the
            short code itself is returned.

    Returns:
        The language name or short code, or None when detection or the name
        lookup fails (the failure is logged at debug level).
    """
    ret = None

    try:
        from langdetect import detect
        # Keep only the part before '-' of the detected code (e.g. 'zh-cn' -> 'zh').
        langSmallName = detect(incomeBuf).split('-')[0]
        if convertToFullName:
            import pycountry
            # NOTE(review): the 'iso639_1_code' keyword depends on the
            # installed pycountry version - confirm it matches the deployment.
            ret = pycountry.languages.get(iso639_1_code=langSmallName).name.lower()
        else:
            ret = langSmallName
    except Exception as ecxp:
        # The message previously said "snowball", although no stemmer runs here.
        logger.debug(">>> Some langDetect exception = " + str(ecxp))

    return ret
46 
47 

Member Data Documentation

◆ ALGO_INCOME_BUF

int app.ContentHashCalculator.ContentHashCalculator.ALGO_INCOME_BUF = 1
static

Definition at line 22 of file ContentHashCalculator.py.

◆ ALGO_SIMPLE_SPLITTING

int app.ContentHashCalculator.ContentHashCalculator.ALGO_SIMPLE_SPLITTING = 2
static

Definition at line 23 of file ContentHashCalculator.py.

◆ ALGO_SNOWBALL_SPLITTING

int app.ContentHashCalculator.ContentHashCalculator.ALGO_SNOWBALL_SPLITTING = 3
static

Definition at line 24 of file ContentHashCalculator.py.

◆ ALGO_SOUNDEX_SPLITTING

int app.ContentHashCalculator.ContentHashCalculator.ALGO_SOUNDEX_SPLITTING = 4
static

Definition at line 25 of file ContentHashCalculator.py.

◆ RE_SPLITTER

string app.ContentHashCalculator.ContentHashCalculator.RE_SPLITTER = r'\s'
static

Definition at line 27 of file ContentHashCalculator.py.


The documentation for this class was generated from the following file: