# -*- coding: utf-8 -*-
"""
HCE project, Python bindings, Distributed Crawler application.
Applied utility to check sets of URLs from nodes in URLContent request results.

@author bgv bgv.hce@gmail.com
@link: http://hierarchical-cluster-engine.com/
@copyright: Copyright © 2014 IOIX Ukraine
@license: http://hierarchical-cluster-engine.com/license/
"""

from optparse import OptionParser
# Command-line interface: one optional switch, -s/--stat. Its value is kept
# as a string and stored on the parsed options object as ``resStat``; it is
# None when the switch is not given.
parser = OptionParser()
parser.add_option(
    "-s",
    "--stat",
    type="string",
    help="include stat data",
    dest="resStat",
)
def collectUrlStats(jsonDic):
    """Aggregate the per-node URL lists of a URLContent result into statistics.

    ``jsonDic["itemsList"]`` is read as a sequence of node entries. Each entry
    must provide ``"host"`` and ``"itemObject"``; every element of
    ``"itemObject"`` must provide ``"urlMd5"``, ``"url"`` and ``"status"``.

    Returns a dict with these keys:

    * ``nodesCount`` - number of node entries in ``itemsList``.
    * ``totalItems`` - sum of the ``itemObject`` lengths over all nodes.
    * ``totalItemsList`` - host -> length of that host's ``itemObject`` list.
    * ``statusesFreqList`` - ``str(status)`` -> number of items with it.
    * ``totalDuplicated`` - number of (host, urlMd5) pairs whose urlMd5 is
      held by more than one host.
    * ``uniqueDuplicated`` - number of distinct urlMd5 values held by more
      than one host.
    * ``duplicatedFreqsList`` - ``str(k)`` -> number of distinct urlMd5 values
      held by exactly ``k`` hosts, for ``k`` in ``1..nodesCount``; only
      ``k > 1`` is ever counted, so the ``"1"`` bucket stays 0.
    * ``resStat`` - host -> urlMd5 -> ``{"freq", "url", "status"}`` where
      ``freq`` is the number of hosts holding that urlMd5.
    * ``urlStat`` - url -> list of ``[host, status]`` pairs.
    * ``uniqueItems`` - always 0 (see the note at the return statement).

    Raises ``KeyError``/``TypeError`` when the input lacks the structure
    described above.
    """
    itemsLists = jsonDic["itemsList"]
    nodesCount = len(itemsLists)

    idsDict = {}
    urlsDict = {}
    statusesFreqList = {}
    totalItemsList = {}
    totalItems = 0

    for node in itemsLists:
        host = node["host"]
        itemObjects = node["itemObject"]
        totalItemsList[host] = len(itemObjects)
        totalItems += len(itemObjects)
        # A repeated host replaces the ids recorded for its earlier entry.
        hostIds = idsDict[host] = {}
        for itemObject in itemObjects:
            url = itemObject["url"]
            status = itemObject["status"]
            hostIds[itemObject["urlMd5"]] = {"freq": 0, "url": url,
                                            "status": status}
            urlsDict.setdefault(url, []).append([host, status])
            statusKey = str(status)
            statusesFreqList[statusKey] = statusesFreqList.get(statusKey, 0) + 1

    # Count, for every urlMd5, how many hosts hold it. One pass over all ids
    # replaces the previous host x id x host nested loop.
    # NOTE(review): the previous loop bumped "freq" once per host without
    # checking that the host holds the id; counting only holders is assumed
    # to be the intent - confirm.
    md5Freqs = {}
    for hostIds in idsDict.values():
        for urlMd5 in hostIds:
            md5Freqs[urlMd5] = md5Freqs.get(urlMd5, 0) + 1
    for hostIds in idsDict.values():
        for urlMd5, entry in hostIds.items():
            entry["freq"] = md5Freqs[urlMd5]

    # One bucket per possible frequency; a frequency can never exceed the
    # number of node entries, so every str(freq) key below exists.
    duplicatedFreqsList = {str(i): 0 for i in range(1, nodesCount + 1)}
    totalDuplicated = 0
    uniqueDuplicated = 0
    for freq in md5Freqs.values():
        if freq > 1:
            # Each of the `freq` hosts contributes one duplicated pair.
            totalDuplicated += freq
            uniqueDuplicated += 1
            duplicatedFreqsList[str(freq)] += 1

    return {
        "nodesCount": nodesCount,
        "totalItems": totalItems,
        # NOTE(review): nothing in this script ever assigns "uniqueItems";
        # it is emitted as 0 to keep the output schema unchanged.
        "uniqueItems": 0,
        "totalDuplicated": totalDuplicated,
        "uniqueDuplicated": uniqueDuplicated,
        "duplicatedFreqsList": duplicatedFreqsList,
        "totalItemsList": totalItemsList,
        "statusesFreqList": statusesFreqList,
        "resStat": idsDict,
        "urlStat": urlsDict,
    }


if __name__ == "__main__":
    # Imported here so the entry point is self-contained; harmless if the
    # file header already imports them.
    import json
    import sys

    options, arguments = parser.parse_args()

    # Default to 0 so the name is always bound, also when -s is omitted.
    # NOTE(review): statFlag is parsed and validated but not consulted below;
    # presumably it was meant to gate "resStat"/"urlStat" - confirm.
    statFlag = int(options.resStat) if options.resStat is not None else 0

    jsonString = sys.stdin.read()

    if jsonString:
        # Top-level boundary: any parsing or structure problem is reported on
        # stdout as a single line instead of a traceback.
        try:
            retVal = collectUrlStats(json.loads(jsonString))
            sys.stdout.write(json.dumps(retVal, indent=4, separators=(',', ': ')))
        except Exception as e:
            # str(e) instead of e.message: the attribute is deprecated since
            # Python 2.6 and absent on Python 3.
            sys.stdout.write("Json parsing or structure access error : " + str(e) + "\n")
    else:
        sys.stdout.write("Input json is empty.\n")