HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
integrity-urlContent.py
Go to the documentation of this file.
1 #!/usr/bin/python
2 
3 
4 '''
5 HCE project, Python bindings, Distributed Crawler application.
6 Applied utility to check sets of URLs from nodes in URLContent request results.
7 
8 @package: dc
9 @author bgv bgv.hce@gmail.com
10 @link: http://hierarchical-cluster-engine.com/
11 @copyright: Copyright © 2014 IOIX Ukraine
12 @license: http://hierarchical-cluster-engine.com/license/
13 @since: 0.1
14 '''
15 
16 
17 import ppath
18 from ppath import sys
19 
20 import hashlib
21 from optparse import OptionParser
22 
23 import json
24 import base64
25 
# Command-line interface: one optional switch, "-s/--stat", whose raw string
# value is stored under ``options.resStat`` (None when the switch is absent).
parser = OptionParser()
parser.add_option(
    "-s",
    "--stat",
    dest="resStat",
    type="string",
    help="include stat data"
)
29 
30 
def collectUrlContentStat(jsonDic, statFlag=0):
    '''
    Build the integrity report for a decoded URLContent response.

    @param jsonDic: decoded JSON object; it is read as
                    jsonDic["itemsList"] -> list of nodes, each node having
                    "host" and "itemObject" (list of items), each item having
                    "urlMd5", "url" and "status".
    @param statFlag: when 0 the per-frequency buckets in "duplicatedFreqsList"
                     are collapsed to counters and "resStat"/"urlStat" stay
                     None; any other value keeps the detailed buckets and
                     attaches the per-host and per-URL dictionaries.
    @return: dict with the keys "nodesCount", "totalItems", "uniqueItems",
             "totalDuplicated", "uniqueDuplicated", "duplicatedFreqsList",
             "totalItemsList", "statusesFreqList", "resStat", "urlStat".
    @raise KeyError, TypeError: when the input does not have the structure
                                described above.
    '''
    retVal = {"nodesCount":0, "totalItems":0, "uniqueItems":0, "totalDuplicated":0, "uniqueDuplicated":0,
              "duplicatedFreqsList":{}, "totalItemsList":{}, "statusesFreqList":{}, "resStat":None, "urlStat":None}

    nodes = jsonDic["itemsList"]
    retVal["nodesCount"] = len(nodes)

    # One bucket per possible frequency: an id can be present on 1..nodesCount nodes.
    freqBuckets = retVal["duplicatedFreqsList"]
    for i in range(1, retVal["nodesCount"] + 1):
        freqBuckets[str(i)] = {}

    # Pass 1: index items per host and per URL, count statuses.
    idsDict = {}
    urlsDict = {}
    statuses = retVal["statusesFreqList"]
    for node in nodes:
        host = node["host"]
        items = node["itemObject"]
        retVal["totalItemsList"][host] = len(items)
        retVal["totalItems"] += len(items)
        idsDict[host] = {}
        for item in items:
            idsDict[host][item["urlMd5"]] = {"freq":0, "url":item["url"], "status":item["status"]}
            urlsDict.setdefault(item["url"], []).append([host, item["status"]])
            statusKey = str(item["status"])
            statuses[statusKey] = statuses.get(statusKey, 0) + 1

    # Pass 2: on how many hosts does each id occur? Counted once per id
    # instead of comparing every (host, id) pair against every host.
    hostsPerId = {}
    for hostIds in idsDict.values():
        for urlMd5 in hostIds:
            hostsPerId[urlMd5] = hostsPerId.get(urlMd5, 0) + 1

    # Pass 3: store the frequency on every record and file it in its bucket.
    for hostIds in idsDict.values():
        for urlMd5, record in hostIds.items():
            record["freq"] = hostsPerId[urlMd5]
            if record["freq"] > 1:
                retVal["totalDuplicated"] += 1
            # Frequency-1 records go to bucket "1"; that bucket is what
            # "uniqueItems" is computed from below.
            freqBuckets[str(record["freq"])].setdefault(urlMd5, []).append(record)

    # Pass 4: summarise the buckets; without stat mode only their sizes are kept.
    for freq in list(freqBuckets):
        bucket = freqBuckets[freq]
        name = "uniqueItems" if int(freq) == 1 else "uniqueDuplicated"
        retVal[name] += len(bucket)
        if statFlag == 0:
            freqBuckets[freq] = len(bucket)

    if statFlag:
        retVal["resStat"] = idsDict
        retVal["urlStat"] = urlsDict

    return retVal


if __name__ == "__main__":
    # Reads a URLContent response as JSON from stdin and writes the integrity
    # report (or a one-line diagnostic) to stdout.
    options, arguments = parser.parse_args()

    if options.resStat is not None:
        statFlag = int(options.resStat)
    else:
        statFlag = 0

    jsonString = sys.stdin.read()

    if jsonString:
        try:
            report = collectUrlContentStat(json.loads(jsonString), statFlag)
            sys.stdout.write(json.dumps(report, indent=4, separators=(',', ': ')))
        except Exception as e:
            # str(e) is always a string; the deprecated e.message may be empty
            # or a non-string object, which made the concatenation itself fail.
            sys.stdout.write("Json parsing or structure access error : " + str(e) + "\n")
    else:
        sys.stdout.write("Input json is empty.\n")

    sys.stdout.flush()
110 
111 
112