HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
Go to the documentation of this file.
1 '''
2 Created on Mar 17, 2015
4 @package: app
5 @author: scorp
6 @link: http://hierarchical-cluster-engine.com/
7 @copyright: Copyright © 2014-2016 IOIX Ukraine
8 @license: http://hierarchical-cluster-engine.com/license/
9 @since: 0.1
10 '''
12 import json
13 import os
15 import app.Utils as Utils # pylint: disable=F0401
# # LFSDataStorage Class, implements the functionality of the common crawler data storage
#
# Elements are stored on the local file system as JSON files named
# <storageDir>/<domain or host>/<siteId>.json; elements read by loadElement are
# additionally cached in memory in self.storeDict.
#
# NOTE(review): the methods log through a module-level `logger` object whose
# definition is not visible in this listing - confirm it is initialized before use.
class LFSDataStorage(object):

    JSON_EXTENSION = ".json"


    # #Class's constructor
    #
    def __init__(self):
        # In-memory cache of loaded elements, structure: {host: {siteId: element}}
        self.storeDict = {}


    # #Method _buildLocalDir returns the sub-directory path "<storageDir>/<subDir>"
    #
    # @param storageDir - root storage dir, with or without trailing slash
    # @param subDir - name of the sub-directory (domain or host)
    # @return path of the sub-directory as a string
    def _buildLocalDir(self, storageDir, subDir):
        localDir = storageDir
        if not localDir.endswith("/"):
            localDir += "/"
        return localDir + subDir


    # #Method _buildLocalFileName returns the path "<localDir>/<siteId>.json"
    #
    # @param localDir - directory that holds the file
    # @param siteId - site's id, used as the base file name
    # @return path of the storage file as a string
    def _buildLocalFileName(self, localDir, siteId):
        return localDir + "/" + str(siteId) + self.JSON_EXTENSION


    # #Method saveElement saves the incoming element in the storage file located by storageDir and domain
    #
    # Nothing is written (only a debug message is logged) when storageDir does not exist
    # or when the file cannot be opened for writing. The in-memory cache is not touched.
    #
    # @param storageDir - root storage dir, must already exist
    # @param domain - sub-path of the file inside storageDir, created if missing
    # @param siteId - site's id, used as the file name
    # @param element - data to save, must be JSON serializable
    def saveElement(self, storageDir, domain, siteId, element):
        jsonStr = json.dumps(element, indent=4)
        if not jsonStr:
            return

        if not os.path.isdir(storageDir):
            logger.debug(">>> LFSDataStorage.saveElement can't find root dir, dir=" + str(storageDir))
            return

        localDir = self._buildLocalDir(storageDir, domain)
        if not os.path.isdir(localDir):
            try:
                os.makedirs(localDir)
            except Exception as exp:
                # Stay best-effort (the directory may have been created concurrently),
                # but leave a trace instead of hiding the failure; open() below reports
                # the final outcome.
                logger.debug(">>> LFSDataStorage.saveElement can't create storage dir, dir=" + str(localDir) +
                             " exception=" + str(exp))

        localFileName = self._buildLocalFileName(localDir, siteId)
        try:
            # Context manager closes the descriptor even if write() fails
            with open(localFileName, "w") as fd:
                fd.write(jsonStr)
        except IOError:
            logger.debug(">>> LFSDataStorage.saveElement can't open file to write, file=" + localFileName)


    # #Method _readElementFromFS reads one element from its storage file
    #
    # The root storage dir is created when it does not exist yet.
    #
    # @param storageDir - root storage dir
    # @param host - sub-path of the file inside storageDir
    # @param siteId - site's id, used as the file name
    # @return decoded element or None if the file is missing, empty or not readable
    def _readElementFromFS(self, storageDir, host, siteId):
        nodeElem = None

        if not os.path.isdir(storageDir):
            try:
                os.makedirs(storageDir)
            except OSError:
                # Tolerate a concurrent creation of the same directory, re-raise real failures
                if not os.path.isdir(storageDir):
                    raise

        if not os.path.isdir(storageDir):
            logger.debug(">>> LFSDataStorage.loadElement can't find root dir, dir=" + str(storageDir))
            return nodeElem

        localDir = self._buildLocalDir(storageDir, host)
        if not os.path.isdir(localDir):
            logger.debug(">>> LFSDataStorage.loadElement can't find storage dir, dir=" + str(localDir))
            return nodeElem

        localFileName = self._buildLocalFileName(localDir, siteId)
        try:
            # Context manager closes the descriptor even if read() fails
            with open(localFileName, "r") as fd:
                fileBuf = fd.read()
            if fileBuf:
                nodeElem = json.loads(fileBuf)
        except IOError:
            logger.debug(">>> LFSDataStorage.loadElement can't open file to read, file=" + str(localFileName))
        except Exception as exp:
            logger.debug(">>> LFSDataStorage.loadElement some exception, = " + str(exp))

        return nodeElem


    # #Method loadElement reads the element from the internal cache or from the storage file
    #
    # An element read from the file is put into the internal cache. If externalElement is given,
    # each of its header keys and value keys that is missing in the element is added with
    # the counter value 0.
    #
    # @param storageDir - root storage dir, created if missing
    # @param host - sub-path of the file inside storageDir, also the first key of the cache
    # @param siteId - site's id, used as the file name and as the second key of the cache
    # @param externalElement - incoming headers that will be mixed with the read data
    # @param readFromFS - bool value - True to skip the internal cache and read the file
    # @return site storage element or None if nothing was found and externalElement is None
    def loadElement(self, storageDir, host, siteId, externalElement=None, readFromFS=False):
        nodeElem = None
        if not readFromFS and host in self.storeDict and siteId in self.storeDict[host]:
            nodeElem = self.storeDict[host][siteId]

        if nodeElem is None:
            nodeElem = self._readElementFromFS(storageDir, host, siteId)
            # save nodeElem in class storage hash
            if nodeElem is not None:
                if host not in self.storeDict:
                    self.storeDict[host] = {}
                self.storeDict[host][siteId] = nodeElem

        if externalElement is not None:
            if nodeElem is None:
                nodeElem = {}
            for headerKey in externalElement:
                if headerKey not in nodeElem:
                    nodeElem[headerKey] = {}
                for valueKey in externalElement[headerKey]:
                    if valueKey not in nodeElem[headerKey]:
                        nodeElem[headerKey][valueKey] = 0

        return nodeElem


    # #Method returns the list of (name, value) tuples, one per name from fileStorageElements,
    # holding the value with the least frequency among the allowed values
    #
    # A value is allowed if it is listed for the same name in siteStorageElements (the entry
    # must be a list) or, regardless of siteStorageElements, if fileCacheOnly is True.
    # Names without any allowed value are skipped. On equal frequencies the first value
    # iterated wins.
    #
    # @param fileStorageElements - incoming items with file storage structure {name: {value: frequency}}
    # @param siteStorageElements - site storage values, structure {name: [value, ...]}
    # @param fileCacheOnly - use only elements from the fileStorageElements as from cache
    # @return optimized low frequency list of name and value tuples, empty if fileStorageElements is not a dict
    def fetchLowFreqHeaders(self, fileStorageElements, siteStorageElements=None, fileCacheOnly=False):
        ret = []

        if isinstance(fileStorageElements, dict):
            for headerKey in fileStorageElements:
                # Values the site storage allows for this header, None if there are none
                siteValues = None
                if siteStorageElements is not None and headerKey in siteStorageElements and \
                   isinstance(siteStorageElements[headerKey], list):
                    siteValues = siteStorageElements[headerKey]

                minValue = None
                t = None
                for valueKey in fileStorageElements[headerKey]:
                    freq = fileStorageElements[headerKey][valueKey]
                    if minValue is None or minValue > freq:
                        allowedBySite = siteValues is not None and valueKey in siteValues
                        if allowedBySite or fileCacheOnly is True:
                            # The minimum is tracked for both selection modes, otherwise the
                            # cache-only mode would return the last value instead of the rarest
                            minValue = freq
                            t = (headerKey, valueKey)
                if t is not None:
                    ret.append(t)

        return ret


    # #Method converts the incoming jsonBuf to the siteStorageElements structure and returns it
    #
    # @param jsonBuf - incoming json string with siteStorageElements structure
    # @return decoded siteStorageElements element or None if jsonBuf can't be decoded
    def extractSiteStorageElement(self, jsonBuf):
        ret = None
        try:
            ret = json.loads(jsonBuf)
        except Exception as exp:
            logger.debug(">>> LFSDataStorage.extractSiteStorageElement can't load data from incoming jsonBuf " +
                         "(may be not json format...) exception=" + str(exp))
        return ret
def fetchLowFreqHeaders(self, fileStorageElements, siteStorageElements=None, fileCacheOnly=False)
def extractSiteStorageElement(self, jsonBuf)
def saveElement(self, storageDir, domain, siteId, element)
def loadElement(self, storageDir, host, siteId, externalElement=None, readFromFS=False)