HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
SourceTemplateExtractor.py
Go to the documentation of this file.
1 '''
2 @package: dc
3 @author scorp
4 @file SourceTemplateExtractor.py
5 @link: http://hierarchical-cluster-engine.com/
6 @copyright: Copyright © 2013-2014 IOIX Ukraine
7 @license: http://hierarchical-cluster-engine.com/license/
8 @since: 0.1
9 '''
10 
11 import json
12 import types
13 import hashlib
14 import datetime
15 import requests
16 import app.Utils as Utils # pylint: disable=F0401
17 
18 # Logger initialization
19 logger = Utils.MPLogger().getLogger()
20 
# # SourceTemplateExtractor class implements templates extracting from specified template sources
#
# A template source is a JSON list of elements; each element names a "source" ("file" or "http"),
# a "request" (file path or URL) and a "schedule" that decides whether the source is used right now.
#
# NOTE(review): the class statement itself was absent from the extracted listing this block was recovered
# from; the class name is taken from the SourceTemplateExtractor.SOURCE_NAME_* references in
# loadTemplateFromSource, and the `object` base is an assumption - confirm against the repository.
class SourceTemplateExtractor(object):

    SOURCE_NAME_FILE = "file"
    SOURCE_NAME_HTTP = "http"
    # Macro names that may appear in POST data as %NAME% and are substituted from self.macroDict
    POST_BUFF_MACROS = ["RAW_CONTENT", "URL"]
    # Format of every timestamp read from a schedule or kept in a schedule storage file
    SCHEDULE_TIME_FORMAT = "%Y-%m-%d %H:%M"


    # # Class Constructor
    #
    def __init__(self):
        # HTTP responses already fetched by this instance, keyed by the md5 of request (+ POST template)
        self.TemplHash = {}
        # Macro values (RAW_CONTENT, URL) for the loadTemplateFromSource call in progress
        self.macroDict = {}


    # # md5Hex calculates hex md5 digest of a text or byte string
    #
    # @param data - text or byte string; text is encoded as UTF-8 before hashing so that non-ASCII
    #               requests do not fail inside hashlib
    # @return hex digest string
    @staticmethod
    def _md5Hex(data):
        if not isinstance(data, bytes):
            data = data.encode("utf-8")
        return hashlib.md5(data).hexdigest()


    # # scheduleCalc method calculates schedule conditions
    #
    # Supported schedule["type"] values:
    #   0 - use the source only when additionData["parentMD5"] is an empty string
    #   1 - always use the source
    #   2 - use the source once after schedule["at"] has passed (needs the schedule["file"] storage)
    #   3 - use the source every schedule["step"] minutes, counted from the last use or from
    #       schedule["at"] (needs the schedule["file"] storage)
    # When schedule["file"] is given, its JSON content is read first and written back afterwards
    # with a refreshed "datetime" field.
    #
    # @param schedule - incoming schedule json structure
    # @param additionData - addition data, used in templates source conditions; may be None
    # @return bool value, that means use current template source or not use
    def scheduleCalc(self, schedule, additionData):
        ret = False
        scheduleStorageData = None
        timeFormat = self.SCHEDULE_TIME_FORMAT
        curdatetime = datetime.datetime.now()
        if "file" in schedule:
            with open(schedule["file"], "r") as fd:
                scheduleStorageData = json.load(fd)

        scheduleType = schedule["type"]
        if scheduleType == 0:
            # additionData defaults to None in loadTemplateFromSource; without it the condition cannot hold
            if additionData is not None and additionData["parentMD5"] == "":
                ret = True
        elif scheduleType == 1:
            ret = True
        elif scheduleType == 2:
            if scheduleStorageData is not None:
                atTime = datetime.datetime.strptime(schedule["at"], timeFormat)
                saveAtTime = None
                if scheduleStorageData.get("saveAtTime") is not None:
                    saveAtTime = datetime.datetime.strptime(scheduleStorageData["saveAtTime"], timeFormat)
                # A changed "at" value starts a new one-shot cycle: reset the trigger counter
                if saveAtTime != atTime:
                    scheduleStorageData["tCount"] = 0
                    scheduleStorageData["saveAtTime"] = atTime.strftime(timeFormat)
                if curdatetime > atTime:
                    # Only the first check after "at" fires; later checks are just counted
                    if scheduleStorageData["tCount"] == 0:
                        ret = True
                    scheduleStorageData["tCount"] += 1
        elif scheduleType == 3:
            if scheduleStorageData is not None:
                if scheduleStorageData.get("saveNowTime") is not None:
                    atTime = datetime.datetime.strptime(scheduleStorageData["saveNowTime"], timeFormat)
                else:
                    atTime = datetime.datetime.strptime(schedule["at"], timeFormat)

                if curdatetime > (atTime + datetime.timedelta(minutes=int(schedule["step"]))):
                    scheduleStorageData["saveNowTime"] = curdatetime.strftime(timeFormat)
                    ret = True

        if scheduleStorageData is not None:
            scheduleStorageData["datetime"] = curdatetime.strftime(timeFormat)
            with open(schedule["file"], "w") as fd:
                fd.write(json.dumps(scheduleStorageData))
        return ret


    # # loadTemplateFromSource main public/process class method
    #
    # Elements without a "schedule", or whose schedule condition is not met, are skipped. Any error
    # raised while processing one element is logged and that element is skipped, the rest are still
    # processed.
    #
    # @param templateSource - incoming templateSource data (JSON string holding a list of elements)
    # @param additionData - addition data, used in templates source conditions
    # @param rawContent - incoming resource's rawContent, value of the RAW_CONTENT macro
    # @param url - incoming resource's url, value of the URL macro
    # @return list of new fetched templates (one per used source; for a list result only its first item)
    def loadTemplateFromSource(self, templateSource, additionData=None, rawContent=None, url=None):
        ret = []
        self.macroDict = {}
        if rawContent is not None:
            self.macroDict["RAW_CONTENT"] = rawContent
        if url is not None:
            self.macroDict["URL"] = url

        templateSourceStruct = None
        try:
            templateSourceStruct = json.loads(templateSource)
        except Exception as excp:
            logger.debug(">>> Wrong while json loads from templateSource; err=" + str(excp))

        if not isinstance(templateSourceStruct, list):
            return ret

        for templateSourceElement in templateSourceStruct:
            addedElement = None
            try:
                if "schedule" in templateSourceElement and templateSourceElement["schedule"] is not None and \
                   self.scheduleCalc(templateSourceElement["schedule"], additionData):
                    if templateSourceElement["source"] == SourceTemplateExtractor.SOURCE_NAME_FILE:
                        with open(templateSourceElement["request"], "rb") as fd:
                            addedElement = json.loads(fd.read())
                    elif templateSourceElement["source"] == SourceTemplateExtractor.SOURCE_NAME_HTTP:
                        addedElement = self.resolveTemplateByHTTP(templateSourceElement)

                if isinstance(addedElement, list) and len(addedElement) > 0:
                    ret.append(addedElement[0])
                elif isinstance(addedElement, dict):
                    ret.append(addedElement)
            except Exception as excp:
                # Deliberate best-effort: one broken source must not prevent the others from loading
                logger.debug(">>> Something wrong with templateSourceElement procession; err=" + str(excp))
        return ret


    # # resolveTemplateByHTTP method fetches by HTTP request and returns one or list template elements
    #
    # A GET is sent when the element has no (or an empty) "post" value, otherwise a POST whose body is
    # the "post" value with macros substituted. Responses are kept in self.TemplHash and reused.
    #
    # NOTE(review): the POST cache key is built from the "post" template before macro substitution, so
    # calls that differ only in RAW_CONTENT/URL share one cached response - confirm this is intended.
    #
    # @param templateSourceElement - incoming template source element
    # @return decoded JSON body for a 200 response; the raw response object for any other status;
    #         None when "request" is not an http:// or https:// URL
    def resolveTemplateByHTTP(self, templateSourceElement):
        contentTypeHeader = None
        if "headers" in templateSourceElement:
            contentTypeHeader = json.loads(templateSourceElement["headers"])  # {"Content-Type": "application/json"}

        requestString = templateSourceElement["request"]
        if not requestString.startswith(("http://", "https://")):
            # Nothing can be fetched; report it instead of failing later on a missing response
            logger.debug(">>> Unsupported request for HTTP template source: " + str(requestString))
            return None

        postTemplate = templateSourceElement.get("post")
        if postTemplate is None or postTemplate == "":
            templateHash = self._md5Hex(requestString)
            if templateHash in self.TemplHash:
                response = self.TemplHash[templateHash]
            else:
                response = requests.get(requestString, headers=contentTypeHeader)
                self.TemplHash[templateHash] = response
        else:
            templateHash = self._md5Hex(requestString + postTemplate)
            if templateHash in self.TemplHash:
                response = self.TemplHash[templateHash]
            else:
                replacedPost = self.replacePostRawContent(postTemplate)
                # Encode text only; a byte string is sent as is
                if not isinstance(replacedPost, bytes):
                    replacedPost = replacedPost.encode("utf-8")
                logger.debug(">>> POST Data: requestString:\n" + str(requestString) + \
                             "\ntemplateSourceElement:\n" + str(postTemplate) + \
                             "\nreplacedPost:\n" + str(replacedPost) + "\nheaders:\n" + str(contentTypeHeader))
                response = requests.post(requestString, replacedPost, headers=contentTypeHeader)
                self.TemplHash[templateHash] = response

        if response.status_code == 200 and response.text is not None:
            return json.loads(response.text)

        logger.debug(">>> Something wrong with HTTP request, Response code == " + str(response.status_code) +
                     "content == " + str(response.text))
        return response


    # # replacePostRawContent method finds and replaces macros (%RAW_CONTENT%, %URL%) in POST data
    #
    # All macros are substituted in one pass over the original POST data, so every macro present is
    # replaced and text inserted for one macro is never scanned for further macros. A macro without
    # a value in self.macroDict is replaced with an empty string.
    #
    # @param post incoming POST data
    # @return replaced POST data
    def replacePostRawContent(self, post):
        import re  # local import: the module-level import list does not provide it

        if not self.POST_BUFF_MACROS:
            return post
        macroDict = self.macroDict
        pattern = "|".join(re.escape("%" + name + "%") for name in self.POST_BUFF_MACROS)
        # match.group(0) is "%NAME%"; strip the percent signs to get the macroDict key
        return re.sub(pattern, lambda match: macroDict.get(match.group(0)[1:-1], ""), post)
def loadTemplateFromSource(self, templateSource, additionData=None, rawContent=None, url=None)