HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
ContentHashCalculator.py
Go to the documentation of this file.
1 '''
2 Created on Sep 11, 2015
3 
4 @package: app
5 @author: scorp
6 @link: http://hierarchical-cluster-engine.com/
7 @copyright: Copyright © 2013-2014 IOIX Ukraine
8 @license: http://hierarchical-cluster-engine.com/license/
9 @since: 0.1
10 '''
11 
12 import hashlib
13 import re
14 import app.Utils as Utils # pylint: disable=F0401
15 
16 
18 
19 
class ContentHashCalculator(object):
    '''
    Calculates MD5 hex digests of a text buffer using one of several
    normalization strategies (raw buffer, unique sorted words, stemmed
    words, soundex codes).

    All methods are static; the class only serves as a namespace.
    '''

    # Supported values of the `algo` argument of hashCalculate()
    ALGO_INCOME_BUF = 1
    ALGO_SIMPLE_SPLITTING = 2
    ALGO_SNOWBALL_SPLITTING = 3
    ALGO_SOUNDEX_SPLITTING = 4

    # Regular expression used to split the buffer into words
    RE_SPLITTER = r'\s'

    @staticmethod
    def _logDebug(message):
        '''
        Write a debug message to the module-level `logger` when one is
        defined, otherwise to a standard `logging` logger, so that the
        error handling paths never fail with NameError.

        @param message: text to log
        '''
        log = globals().get('logger')
        if log is None:
            import logging
            log = logging.getLogger(__name__)
        log.debug(message)

    @staticmethod
    def _hashTokens(tokens):
        '''
        Concatenate tokens and return the MD5 hex digest of the result.

        @param tokens: list of text and/or byte strings
        @return: hex digest string, or None if `tokens` is empty
        '''
        if not tokens:
            return None
        # md5 works on bytes: text tokens are encoded as UTF-8, byte strings
        # are used as is (so plain ASCII input keeps its historical digest).
        encoded = [tok if isinstance(tok, bytes) else tok.encode('utf-8') for tok in tokens]
        return hashlib.md5(b''.join(encoded)).hexdigest()

    @staticmethod
    def _hashTransformed(words, transform):
        '''
        Apply `transform` to each word and hash the concatenated results.

        @param words: list of words
        @param transform: callable taking one word and returning its
                          normalized form
        @return: hex digest string, or None if `words` is empty
        '''
        if not words:
            return None
        converted = []
        for word in words:
            try:
                converted.append(transform(word))
            except Exception:  # pylint: disable=W0703
                # A word that cannot be normalized contributes nothing
                converted.append("")
        return ContentHashCalculator._hashTokens(converted)

    @staticmethod
    def langDetect(incomeBuf, convertToFullName=True):
        '''
        Detect the language of a text buffer with the `langdetect` package.

        @param incomeBuf: text to analyse
        @param convertToFullName: if True, convert the detected two-letter
               code to the lowercased language name using `pycountry`
        @return: language name or code, None if detection or conversion
                 failed (the failure is logged at debug level)
        '''
        ret = None

        try:
            from langdetect import detect
            # Drop a regional suffix such as the "-cn" of "zh-cn"
            langSmallName = detect(incomeBuf).split('-')[0]
            if convertToFullName:
                import pycountry
                language = None
                # Try the legacy field name first, then the `alpha_2` name
                # used by newer pycountry releases.
                for field in ('iso639_1_code', 'alpha_2'):
                    try:
                        language = pycountry.languages.get(**{field: langSmallName})
                    except Exception:  # pylint: disable=W0703
                        language = None
                    if language is not None:
                        break
                if language is None:
                    ContentHashCalculator._logDebug(">>> Language name not found for code = " +
                                                    str(langSmallName))
                else:
                    ret = language.name.lower()
            else:
                ret = langSmallName
        except Exception as excp:  # pylint: disable=W0703
            ContentHashCalculator._logDebug(">>> Some language detection exception = " + str(excp))

        return ret

    @staticmethod
    def commonSplitMethod(incomeBuf, minWLen):
        '''
        Split a buffer on RE_SPLITTER into unique words.

        @param incomeBuf: text or byte string to split
        @param minWLen: minimal length of a word to keep
        @return: sorted list of unique words not shorter than `minWLen`
        '''
        pattern = ContentHashCalculator.RE_SPLITTER
        flags = 0
        if isinstance(incomeBuf, bytes):
            # re.LOCALE is only valid for byte strings (it raises ValueError
            # for text patterns on Python 3), and the pattern type has to
            # match the type of the buffer.
            flags = re.LOCALE
            if not isinstance(pattern, bytes):
                pattern = pattern.encode('ascii')
        words = re.split(pattern, incomeBuf, flags=flags)

        return sorted(word for word in set(words) if len(word) >= minWLen)

    @staticmethod
    def hashCalculateSimple(incomeBuf, minWLen):
        '''
        Hash the sorted unique words of the buffer.

        @param incomeBuf: text to hash
        @param minWLen: minimal length of a word to take into account
        @return: MD5 hex digest, or None if no word is long enough
        '''
        words = ContentHashCalculator.commonSplitMethod(incomeBuf, minWLen)
        return ContentHashCalculator._hashTokens(words)

    @staticmethod
    def hashCalculateSnowball(incomeBuf, minWLen, additionData, stemmerLang):  # pylint: disable=W0613
        '''
        Hash the snowball stems of the sorted unique words of the buffer.

        @param incomeBuf: text to hash
        @param minWLen: minimal length of a word to take into account
        @param additionData: not used
        @param stemmerLang: snowball language name; if None the language is
               detected with langDetect(), falling back to 'english'
        @return: MD5 hex digest, or None if no word is long enough or the
                 stemmer could not be used (the failure is logged)
        '''
        ret = None
        try:
            import snowballstemmer
            if stemmerLang is None:
                stemmerLang = ContentHashCalculator.langDetect(incomeBuf)
            if stemmerLang is None:
                stemmerLang = 'english'

            words = ContentHashCalculator.commonSplitMethod(incomeBuf, minWLen)
            if words:
                stemmer = snowballstemmer.stemmer(stemmerLang)
                # Each word is stemmed on its own; passing the whole list to
                # stemWord() made every stem fail, so the digest no longer
                # depended on the content.
                ret = ContentHashCalculator._hashTransformed(words, stemmer.stemWord)
        except Exception as excp:  # pylint: disable=W0703
            ContentHashCalculator._logDebug(">>> Some snowball exception = " + str(excp))

        return ret

    @staticmethod
    def hashCalculateSoundex(incomeBuf, minWLen):
        '''
        Hash the soundex codes of the sorted unique words of the buffer.

        @param incomeBuf: text to hash
        @param minWLen: minimal length of a word to take into account
        @return: MD5 hex digest, or None if no word is long enough or the
                 `soundex` module could not be used (the failure is logged)
        '''
        ret = None
        try:
            import soundex
            words = ContentHashCalculator.commonSplitMethod(incomeBuf, minWLen)
            if words:
                encoder = soundex.getInstance()
                ret = ContentHashCalculator._hashTransformed(words, encoder.soundex)
        except Exception as excp:  # pylint: disable=W0703
            ContentHashCalculator._logDebug(">>> Some soundex exception = " + str(excp))

        return ret

    @staticmethod
    def hashCalculate(incomeBuf, algo, minWLen=3, additionData=None, stemmerLang=None):
        '''
        Calculate the hash of a buffer with the selected algorithm. The
        buffer is lowercased before any processing.

        @param incomeBuf: text to hash
        @param algo: one of the ALGO_* constants
        @param minWLen: minimal length of a word to take into account
               (word based algorithms only)
        @param additionData: passed to hashCalculateSnowball(), not used
        @param stemmerLang: language for ALGO_SNOWBALL_SPLITTING, None to
               detect it automatically
        @return: MD5 hex digest, or None for an unknown `algo` or when the
                 selected algorithm produced no hash
        '''
        ret = None
        incomeBuf = incomeBuf.lower()
        if algo == ContentHashCalculator.ALGO_INCOME_BUF:
            ret = ContentHashCalculator._hashTokens([incomeBuf])
        elif algo == ContentHashCalculator.ALGO_SIMPLE_SPLITTING:
            ret = ContentHashCalculator.hashCalculateSimple(incomeBuf, minWLen)
        elif algo == ContentHashCalculator.ALGO_SNOWBALL_SPLITTING:
            ret = ContentHashCalculator.hashCalculateSnowball(incomeBuf, minWLen, additionData, stemmerLang)
        elif algo == ContentHashCalculator.ALGO_SOUNDEX_SPLITTING:
            ret = ContentHashCalculator.hashCalculateSoundex(incomeBuf, minWLen)

        return ret
131 
def langDetect(incomeBuf, convertToFullName=True)
def hashCalculateSnowball(incomeBuf, minWLen, additionData, stemmerLang)
def hashCalculate(incomeBuf, algo, minWLen=3, additionData=None, stemmerLang=None)
Definition: join.py:1