HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
MetricWCount.py
Go to the documentation of this file.
1 # coding: utf-8 # pylint: disable-all
2 
3 """@package algorithms
4  @file MetricWCount.py
5  @author scorp <developers.hce@gmail.com>
6  @link http://hierarchical-cluster-engine.com/
7  @copyright Copyright &copy; 2013 IOIX Ukraine
8  @license http://hierarchical-cluster-engine.com/license/
9  @package HCE project node API
10  @since 0.1
11  """
12 
13 import logging
14 
15 import app.Consts as APP_CONSTS
16 import app.Utils as Utils # pylint: disable=F0401
17 import types
18 import re
19 import unicodedata
20 from BaseMetric import BaseMetric
21 
22 # Logger initialization
23 logger = Utils.MPLogger().getLogger()
24 
25 
# # The MetricWCount class implements the metric counters for the words count metric
#
class MetricWCount(BaseMetric):

    # Unicode general categories (letters and numbers) allowed inside a word;
    # a character of any other category makes the whole word W_TYPE_BAD.
    CHAR_CATEGORIES_LIST = ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nd', 'Nl', 'No']
    # Letter categories whose presence turns a word into W_TYPE_NOT_LATIN.
    CHAR_NOT_LATIN_LIST = ['Lt', 'Lm', 'Lo']
    # Regular expression used to split a buffer into words (raw literal, same value as before).
    RE_SPLITTER = r'\s'
    # Words classified as W_TYPE_LATIN that are shorter than this are treated as bad.
    MIN_LATIN_WORD_LEN = 3

    # Word classification codes.
    W_TYPE_LATIN = 0
    W_TYPE_NOT_LATIN = 1
    W_TYPE_NUMBER = 2  # defined, but never assigned by the methods of this class
    W_TYPE_BAD = 3


    # # class constructor
    #
    # @param names - metric name(s), passed unchanged to BaseMetric.__init__
    def __init__(self, names):
        super(MetricWCount, self).__init__(names)


    # # _classifyWord classifies one non-empty word by the Unicode categories of its characters
    #
    # @param word - unicode string without separators
    # @return W_TYPE_BAD if any character's category is outside CHAR_CATEGORIES_LIST, or if
    #         the word is W_TYPE_LATIN but shorter than MIN_LATIN_WORD_LEN;
    #         W_TYPE_NOT_LATIN if any character's category is in CHAR_NOT_LATIN_LIST;
    #         W_TYPE_LATIN otherwise
    def _classifyWord(self, word):
        wType = self.W_TYPE_LATIN
        for ch in word:
            chCategory = unicodedata.category(ch)
            if chCategory not in self.CHAR_CATEGORIES_LIST:
                # One disallowed character is enough to reject the whole word.
                return self.W_TYPE_BAD
            if chCategory in self.CHAR_NOT_LATIN_LIST:
                wType = self.W_TYPE_NOT_LATIN
        # The minimal length applies only to words that stayed W_TYPE_LATIN.
        if wType == self.W_TYPE_LATIN and len(word) < self.MIN_LATIN_WORD_LEN:
            wType = self.W_TYPE_BAD
        return wType


    # # internalCalculating splits one buffer into words and updates the counters
    #
    # @param dataDict - dict with numeric "count" and "validWordsCount" items, updated in place:
    #                   "count" grows by one per word, "validWordsCount" by one per word
    #                   that is not W_TYPE_BAD
    # @param buf - text to analyse; a byte string is converted with unicode() first
    def internalCalculating(self, dataDict, buf):
        if isinstance(buf, types.StringType):
            buf = unicode(buf)
        # The buffer is unicode at this point, so whitespace has to be recognised through the
        # Unicode database (re.UNICODE) rather than through the process locale (re.LOCALE).
        for word in re.split(self.RE_SPLITTER, buf, flags=re.UNICODE):
            if not word:
                # Leading, trailing or adjacent separators (and an empty buffer) produce empty
                # tokens; they are not words and must not inflate "count".
                continue
            if self._classifyWord(word) != self.W_TYPE_BAD:
                dataDict["validWordsCount"] += 1
            dataDict["count"] += 1


    # # precalculate makes words count metrics precalculating
    #
    # @param result - object whose "tags" attribute maps keys to tag values; only dict values
    #                 that have a "data" item holding a string or a list of buffers are analysed
    # @param metricName - passed unchanged to retForMultiNames
    # @return value returned by retForMultiNames for the dict with "count", "validWordsCount"
    #         and "percent" (integer percentage of valid words, 0 when no words were found)
    def precalculate(self, result, metricName):
        ret = {"count": 0, "percent": 0, "validWordsCount": 0}
        for key in result.tags:
            tag = result.tags[key]
            if not (isinstance(tag, dict) and "data" in tag):
                continue
            data = tag["data"]
            if isinstance(data, types.StringTypes):
                self.internalCalculating(ret, data)
            elif isinstance(data, list):
                for buf in data:
                    self.internalCalculating(ret, buf)
        if ret["count"] > 0:
            # Explicit floor division: both operands are integers, so this equals the
            # Python 2 "/" result while no longer depending on the division mode.
            ret["percent"] = ret["validWordsCount"] * 100 // ret["count"]
        return self.retForMultiNames(ret, metricName)
def internalCalculating(self, dataDict, buf)
Definition: MetricWCount.py:53
def precalculate(self, result, metricName)
Definition: MetricWCount.py:78
def retForMultiNames(self, retDict, metricName)
Definition: BaseMetric.py:39