HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
UrlSchema.py
Go to the documentation of this file.
1 # coding: utf-8
2 
3 """
4 @package: dc
5 @file UrlSchema.py
6 @author Scorp <developers.hce@gmail.com>
7 @link: http://hierarchical-cluster-engine.com/
8 @copyright: Copyright &copy; 2013-2014 IOIX Ukraine
9 @license: http://hierarchical-cluster-engine.com/license/
10 @since: 0.1
11 """
12 
13 import copy
14 import os
15 import json
16 import random
17 import string
18 from datetime import datetime
19 import urllib
20 import requests
21 import app.Consts as APP_CONSTS
22 from app.Utils import ExceptionLog
23 from app.Utils import varDump
24 import app.Utils as Utils # pylint: disable=F0401
25 
26 
27 logger = Utils.MPLogger().getLogger()
28 
29 
# # UrlSchema class, generates URLs from a template URL by replacing %name% macros
#
# The schema is a JSON document. The keys read by this class are "type", "mode", "max_items",
# "parameters", "urls", "file_path", "format", "delimiter", "url_encode" and "batch_insert".
# Generator state (usage counters of predefined values, positions of incremental counters) is kept
# in self.indexStruct and stored as JSON in <urlSchemaDataDir>/url_schema_data_<siteId>.json
class UrlSchema(object):

    # Values compared with the schema "type" field
    SCHEMA_DISABLE = 0
    SCHEMA_PREDEFINED = 1
    SCHEMA_INCREMENTAL_INT = 2
    SCHEMA_RANDOM_INT = 3
    SCHEMA_RANDOM_STR = 4

    # Values compared with the "chars" and "case" fields of a random string parameter
    CHAR_ASCII_LATIN = 0
    CHAR_HEXADECIMAL = 1
    CHAR_LOWER = 0
    CHAR_UPPER = 1

    # Values compared with the schema "mode" field
    MODE_ONE_URL = 0
    MODE_LIST_URLS = 1

    # Allowed values of the schema "batch_insert" field
    BATCH_INSERT_NO_ONE_ITEMS = 0
    BATCH_INSERT_ALL_NEW_ITEMS = 1
    BATCH_INSERT_ONLY_FIRST_ITEM = 2
    BATCH_INSERT_DEFAULT = BATCH_INSERT_NO_ONE_ITEMS
    BATCH_INSERT_MIN_ALLOWED_VALUE = BATCH_INSERT_NO_ONE_ITEMS
    BATCH_INSERT_MAX_ALLOWED_VALUE = BATCH_INSERT_ONLY_FIRST_ITEM

    JSON_SUFF = ".json"
    URL_SCHEMA_DATA_FILE_NAME_PREFIX = "url_schema_data_"


    # #Class constructor
    #
    # @param schema - incoming schema in json format (string)
    # @param siteId - site identifier, used to build the name of the index file
    # @param urlSchemaDataDir - directory of the index files; persistence is off when it is not a
    #                           non-empty string
    def __init__(self, schema=None, siteId=None, urlSchemaDataDir=None):
        self.externalError = APP_CONSTS.ERROR_OK
        self.indexFileName = None
        self.indexStruct = None
        # Always defined, so it can be read before generateUrlSchema() sees a valid "batch_insert"
        self.batchInsert = self.BATCH_INSERT_DEFAULT
        try:
            self.schema = json.loads(schema)

            # An empty string is treated as "no directory" instead of failing on [-1]
            if isinstance(urlSchemaDataDir, basestring) and urlSchemaDataDir:
                if not os.path.isdir(urlSchemaDataDir):
                    logger.debug("Create urlSchemaDataDir: %s", str(urlSchemaDataDir))
                    try:
                        os.makedirs(urlSchemaDataDir)
                    except OSError as err:
                        logger.debug("Creation of %s return error: %s", str(urlSchemaDataDir), str(err))

                if urlSchemaDataDir[-1] != '/':
                    urlSchemaDataDir += '/'
                self.indexFileName = urlSchemaDataDir + self.URL_SCHEMA_DATA_FILE_NAME_PREFIX + \
                                     str(siteId) + self.JSON_SUFF
                if os.path.isfile(self.indexFileName):
                    self.indexStruct = self.readJsonFile(self.indexFileName)
                    logger.debug(">>> readJsonFile '" + str(self.indexFileName) + "' - SUCCESS")
                else:
                    self.indexStruct = None

        except Exception as excp:
            # Any failure above (not only a JSON error) switches the schema off
            ExceptionLog.handler(logger, excp, ">>> UrlSchema wrong json loads")
            self.schema = None


    # #Method readJsonFile reads the index structure from a file
    #
    # @param fileName - incoming file name
    # @return parsed json structure, or an empty dict if the file can not be read or parsed
    def readJsonFile(self, fileName):
        ret = {}
        try:
            # Bytes are read so that the UTF-8 data written by saveJsonInFile() is decoded by json
            with open(fileName, "rb") as fd:
                ret = json.loads(fd.read())
        except Exception as err:
            logger.debug(">>> readJsonFile error, file name = %s | %s", fileName, err)
        return ret


    # #Method schemaPredefined implements predefined schema algorithm
    #
    # For each macro found in the url the least used value is substituted (the first one in dict
    # iteration order on ties) and its usage counter and time are updated in self.indexStruct.
    # Values kept in the index from earlier runs stay candidates even if they are no longer
    # present in parametrs.
    #
    # @param inUrl - incoming url
    # @param parametrs - incoming schema params, dict of macro name -> list of values
    # @return processed url
    def schemaPredefined(self, inUrl, parametrs):
        for paramKey in parametrs:
            macroName = '%' + paramKey + '%'
            if inUrl.find(macroName) < 0:
                continue

            # Only the entry of this parameter is touched, counters of other parameters are kept
            if self.indexStruct is None:
                self.indexStruct = {}
            stats = self.indexStruct.setdefault(paramKey, {})
            for val in parametrs[paramKey]:
                if val not in stats:
                    stats[val] = {"frequency": 0, "time": 0}

            candidates = [(key, item) for key, item in stats.items()
                          if "frequency" in item and "time" in item]
            if not candidates:
                # Nothing to substitute, leave the macro as is instead of failing on min([])
                logger.debug("schemaPredefined: no values for parameter '%s'", paramKey)
                continue

            chosen, usage = min(candidates, key=lambda pair: int(pair[1]["frequency"]))
            logger.debug('paramList[mixIndex]: %s', chosen)

            logger.debug('>>> inUrl 1: %s', inUrl)
            inUrl = unicode(inUrl.replace(macroName, chosen))
            logger.debug('>>> inUrl 2: %s', inUrl)

            stats[chosen] = {"frequency": int(usage["frequency"]) + 1,
                             "time": int((datetime.now() - datetime.fromtimestamp(0)).total_seconds())}

        return inUrl


    # #Method schemaIncrementalInt implements incremental int schema algorithm
    #
    # When maxItems is greater than "max" the whole range [min, max) is generated. Otherwise a
    # window of up to maxItems values is generated, starting at the position stored in
    # self.indexStruct (or at "min"), and the next start position is stored. When the window
    # reaches "max" the stored position wraps to "min".
    # If several parameters match, the list of the last processed one is returned.
    #
    # @param inUrl - incoming url
    # @param parameters - incoming schema params, dict of macro name -> {"min", "max", "step"}
    # @param maxItems - count max items
    # @return list of processed urls
    def schemaIncrementalInt(self, inUrl, parameters, maxItems):
        logger.debug("schemaIncrementalInt() enter ... parameters: %s\ninUrl: %s\nmaxItems: %s",
                     parameters, inUrl, maxItems)
        ret = []
        for paramKey in parameters:
            macroName = '%' + paramKey + '%'
            if inUrl.find(macroName) < 0:
                continue

            lower = int(parameters[paramKey]["min"])
            upper = int(parameters[paramKey]["max"])
            step = int(parameters[paramKey]["step"])

            # NOTE(review): an items count is compared with the "max" value here - confirm intent
            if maxItems > upper:
                ret = self.replaceSchemaIncrementalInt(inUrl, macroName, lower, upper, step)
                continue

            logger.debug("Start self.indexStruct: %s", varDump(self.indexStruct))
            if self.indexStruct is not None and paramKey in self.indexStruct:
                minPos = int(self.indexStruct[paramKey])
                logger.debug("minPos from structure = %s", minPos)
                # Positions stored as 0 by older versions must not start below "min"
                if minPos < lower:
                    minPos = lower
            else:
                minPos = lower

            nextPos = min(maxItems * step + minPos, upper)
            ret = self.replaceSchemaIncrementalInt(inUrl, macroName, minPos, nextPos, step)

            if self.indexStruct is None:
                self.indexStruct = {}

            logger.debug("nextPos = %s", nextPos)
            if nextPos >= upper:
                # Wrap to the configured minimum, not to 0
                nextPos = lower

            logger.debug("nextIndex after truncate = %s", nextPos)
            self.indexStruct[paramKey] = nextPos

            logger.debug("Finish self.indexStruct: %s", varDump(self.indexStruct))

        return ret


    # # Method replaceSchemaIncrementalInt using for incremental int schema algorithm
    #
    # @param inUrl - incoming url
    # @param macroName - macro name
    # @param minPos - first value (inclusive)
    # @param maxPos - last value (exclusive)
    # @param step - step
    # @return list of unique urls which differ from inUrl, in generation order
    def replaceSchemaIncrementalInt(self, inUrl, macroName, minPos, maxPos, step):
        ret = []
        seen = set()
        for value in range(minPos, maxPos, step):
            logger.debug("Before replace inUrl = %s", inUrl)
            localUrl = inUrl.replace(macroName, str(value))
            logger.debug("After replace inUrl = %s", localUrl)

            if localUrl != inUrl and localUrl not in seen:
                seen.add(localUrl)
                ret.append(localUrl)

        return ret


    # #Method schemaRandomInt implements random int schema algorithm
    #
    # @param inUrl - incoming url
    # @param parametrs - incoming schema params, dict of macro name -> {"min", "max"}
    # @return processed url
    def schemaRandomInt(self, inUrl, parametrs):
        for paramKey in parametrs:
            macroName = '%' + paramKey + '%'
            if inUrl.find(macroName) >= 0:
                # Bounds are cast the same way as in schemaIncrementalInt()
                value = random.randint(int(parametrs[paramKey]["min"]), int(parametrs[paramKey]["max"]))
                inUrl = inUrl.replace(macroName, str(value))
        return inUrl


    # #Method schemaRandomStr implements random string schema algorithm
    #
    # The string length is random between "min" and "max"; "chars" selects the alphabet
    # (CHAR_ASCII_LATIN or CHAR_HEXADECIMAL, any other value gives an empty string) and "case"
    # selects lower or upper case.
    #
    # @param inUrl - incoming url
    # @param parametrs - incoming schema params, dict of macro name -> {"min", "max", "chars", "case"}
    # @return processed url
    def schemaRandomStr(self, inUrl, parametrs):
        alphabets = {self.CHAR_ASCII_LATIN: string.ascii_lowercase,
                     self.CHAR_HEXADECIMAL: ''.join([ch for ch in string.hexdigits if not ch.isupper()])}
        for paramKey in parametrs:
            macroName = '%' + paramKey + '%'
            if inUrl.find(macroName) < 0:
                continue

            options = parametrs[paramKey]
            valueLen = random.randint(int(options["min"]), int(options["max"]))
            valueStr = ''
            if valueLen > 0:
                # The alphabet does not change between characters, so it is selected once
                alphabet = alphabets.get(options["chars"], '')
                if alphabet:
                    lastIndex = len(alphabet) - 1
                    valueStr = ''.join([alphabet[random.randint(0, lastIndex)] for _ in range(valueLen)])

            if valueStr:
                if options["case"] == self.CHAR_LOWER:
                    valueStr = valueStr.lower()
                elif options["case"] == self.CHAR_UPPER:
                    valueStr = valueStr.upper()

            inUrl = inUrl.replace(macroName, valueStr)
        return inUrl


    # #Method saveJsonInFile saves the index structure into the file as UTF-8 encoded json
    #
    # Nothing is written when the index structure is empty or fileName is None.
    #
    # @param fileName - incoming file name
    def saveJsonInFile(self, fileName):
        if not self.indexStruct or fileName is None:
            return

        try:
            # Serialized before the file is opened, so a failure does not truncate the old data
            data = json.dumps(self.indexStruct, ensure_ascii=False)
            # With ensure_ascii=False the result may be unicode, which has to be encoded explicitly
            if not isinstance(data, bytes):
                data = data.encode('utf-8')
            with open(fileName, "wb") as fd:
                fd.write(data)
        except Exception as err:
            ExceptionLog.handler(logger, err, ">>> saveJsonInFile error, file name = " + str(fileName))


    # #Method resolveParametersByHTTP
    #
    # The urls are tried in order; the first one which answers with status 200 and a json body
    # which is not null wins. self.externalError is set on each failed attempt and reset to
    # ERROR_OK on success.
    #
    # @param urls - list of external sources urls
    # @param defaultValue - default for return value, an empty dict if None
    # @param timeout - timeout passed to requests.get(), None (default) means wait without limit
    # @return new parameters value, fetched by http
    def resolveParametersByHTTP(self, urls, defaultValue=None, timeout=None):
        ret = {} if defaultValue is None else defaultValue
        for url in urls:
            try:
                result = requests.get(url, timeout=timeout)
            except Exception as excp:
                self.externalError = APP_CONSTS.ERROR_URLS_SCHEMA_EXTERNAL
                logger.debug(">>> bad url request; url=%s;err= %s", url, excp)
                continue

            if result is None or result.status_code != 200 or result.text is None:
                # A rejected request is an external error too, not a silent fallback
                self.externalError = APP_CONSTS.ERROR_URLS_SCHEMA_EXTERNAL
                logger.debug(">>> bad url response; url=%s;status= %s", url,
                             getattr(result, "status_code", None))
                continue

            try:
                newParams = json.loads(result.text)
            except Exception as excp:
                self.externalError = APP_CONSTS.ERROR_URLS_SCHEMA_EXTERNAL
                logger.debug(">>> bad external parameters json%s", excp)
                continue

            if newParams is not None:
                ret = newParams
                self.externalError = APP_CONSTS.ERROR_OK
                break
        return ret


    # #Method resolveParametersByFormat
    #
    # For 'plain-text' each string value is split into a list (by lines if delimiter is an empty
    # string, by delimiter otherwise) and empty items are removed; parameters is changed in place.
    # For 'json' parameters is returned as is. For any other format defaultValue is returned.
    #
    # @param parameters - input parameters for resolve
    # @param delimiter - delimiter value using for split text
    # @param formatValue - format field value
    # @param defaultValue - default for return value
    # @return new parameters value, fetched by 'format' url schema property
    def resolveParametersByFormat(self, parameters, delimiter=' ', formatValue='json', defaultValue=None):
        ret = defaultValue
        logger.debug('!!! parameters: %s', parameters)

        if formatValue == 'plain-text':
            for paramName in parameters:
                value = parameters[paramName]
                # Lazy formatting: str() of non-ASCII unicode raises UnicodeEncodeError
                logger.debug("paramName: '%s' type: %s", paramName, type(paramName))
                logger.debug("paramValue: '%s' type: %s", value, type(value))

                if isinstance(value, basestring):
                    if isinstance(value, bytes):
                        # Byte strings (e.g. read from a file) are decoded as UTF-8, not as ASCII
                        text = value.decode('utf-8')
                    else:
                        text = unicode(value)

                    if delimiter == "":
                        items = text.splitlines()
                    else:
                        items = text.split(delimiter)

                    # remove empty strings from list
                    parameters[paramName] = [elem for elem in items if elem]

            ret = parameters
            logger.debug('!!! ret: %s', ret)

        elif formatValue == 'json':
            ret = parameters
        else:
            logger.error("Unsupported format value: '" + str(formatValue) + "'")

        return ret


    # #Method urlEncodeToParameters
    #
    # When urlEncode is greater than 0 each value which is a list (or a unicode string, which is
    # iterated per character) is replaced with the list of its url encoded string items;
    # parameters is changed in place.
    # NOTE(review): items which are not strings are dropped from the list - confirm it is intended
    #
    # @param parameters - input parameters for resolve
    # @param urlEncode - urlEncode flag value
    # @return new parameters value, url encoded if neccesary
    def urlEncodeToParameters(self, parameters, urlEncode):
        if urlEncode is None or int(urlEncode) <= 0:
            return parameters

        for paramName in parameters:
            if not isinstance(parameters[paramName], (list, unicode)):
                continue

            paramsList = []
            for elem in parameters[paramName]:
                if not isinstance(elem, (str, unicode)):
                    continue
                try:
                    # urlencode() can not quote non-ASCII unicode, so it gets UTF-8 bytes
                    raw = elem.encode('utf-8') if isinstance(elem, unicode) else elem
                    encodedStr = urllib.urlencode({'': raw})
                    # The pair is encoded as '=<value>' because of the empty key
                    if len(encodedStr) > 0 and encodedStr[0] == '=':
                        encodedStr = encodedStr[1:]
                    paramsList.append(encodedStr)
                except Exception as err:
                    logger.debug("urlencode '%r' has error: %s", elem, err)
                    # The value is kept as is; invalid bytes are replaced instead of raising again
                    if isinstance(elem, unicode):
                        paramsList.append(elem)
                    else:
                        paramsList.append(elem.decode('utf-8', 'replace'))

            parameters[paramName] = paramsList

        return parameters


    # #Method getMaxCountParameters
    #
    # @param parameters - input parameters
    # @return length of the longest list value, 0 if there are no list values
    def getMaxCountParameters(self, parameters):
        return max([0] + [len(values) for values in parameters.values() if isinstance(values, list)])


    # #Method resolveParametersFromFile
    #
    # The file name has to end with '.json'. Content which starts with '{' is parsed as json,
    # any other content is returned as plain text under the empty key.
    #
    # @param fileName - input file name
    # @param defaultValue - default for return value
    # @return new parameters value, fetched from file
    def resolveParametersFromFile(self, fileName, defaultValue=None):
        logger.debug(">>> resolveParametersFromFile enter fileName: %s", fileName)
        ret = defaultValue

        # endswith() instead of find(): find() accepted any 4 characters name (-1 == -1) and
        # rejected names which contain '.json' more than once
        if not fileName.endswith(self.JSON_SUFF):
            logger.debug("Wrong file name: '%s', expected '<file_name>.json'", fileName)
            return ret

        parameters = {}
        try:
            with open(fileName, "r") as fd:
                buff = fd.read()
            if len(buff) > 0 and buff[0] == '{':  # maybe 'json'
                parameters = json.loads(buff)
            else:
                parameters = {"": buff}  # maybe 'plain-text'
        except Exception as err:
            logger.debug(">>> resolveParametersFromFile error, file name = %s | %s", fileName, err)

        if len(parameters) > 0:
            ret = parameters

        return ret


    # #Method generateUrlSchema main class public point, which resolves the schema parameters,
    # generates the urls and saves the index structure
    #
    # NOTE(review): the resolved parameters are written back to self.schema["parameters"], so a
    # repeated call applies "url_encode" to already encoded values - confirm one call per instance
    #
    # @param inUrl - incoming url (template with %name% macros)
    # @return list of processed urls, empty if the schema is not valid or on any error
    def generateUrlSchema(self, inUrl):
        ret = []
        if self.schema is None:
            logger.debug(">>> urlSchema len = " + str(len(ret)))
            return ret

        schema = self.schema
        try:
            if "urls" in schema:
                schema["parameters"] = self.resolveParametersByHTTP(schema["urls"])

            # "parameters" is optional when the values come from a file
            if "file_path" in schema:
                schema["parameters"] = self.resolveParametersFromFile(schema["file_path"],
                                                                      schema.get("parameters", {}))

            if "format" in schema:
                current = schema.get("parameters", {})
                schema["parameters"] = self.resolveParametersByFormat(current, schema.get("delimiter", ' '),
                                                                      schema["format"], current)

            if "url_encode" in schema:
                schema["parameters"] = self.urlEncodeToParameters(schema.get("parameters", {}),
                                                                  schema["url_encode"])

            if "batch_insert" in schema:
                batchInsert = int(schema["batch_insert"])
                if self.BATCH_INSERT_MIN_ALLOWED_VALUE <= batchInsert <= self.BATCH_INSERT_MAX_ALLOWED_VALUE:
                    self.batchInsert = batchInsert

            parameters = schema.get("parameters", {})

            # get max count parameters
            maxCountParameters = self.getMaxCountParameters(parameters)
            logger.debug('maxCountParameters: ' + str(maxCountParameters))

            itemsLen = 1
            if schema["mode"] == self.MODE_LIST_URLS:
                itemsLen = int(schema["max_items"])

            schemaType = schema["type"]
            for _ in xrange(0, itemsLen):
                localRet = inUrl
                if schemaType == self.SCHEMA_PREDEFINED:
                    localRet = self.schemaPredefined(inUrl, parameters)
                elif schemaType == self.SCHEMA_INCREMENTAL_INT:
                    # the full urls list is returned at once, the loop is left by the check below
                    ret = self.schemaIncrementalInt(inUrl, parameters, itemsLen)
                elif schemaType == self.SCHEMA_RANDOM_INT:
                    localRet = self.schemaRandomInt(inUrl, parameters)
                elif schemaType == self.SCHEMA_RANDOM_STR:
                    localRet = self.schemaRandomStr(inUrl, parameters)

                # An unchanged or repeated url stops the generation
                if localRet == inUrl or localRet in ret:
                    break

                ret.append(localRet)
                # NOTE(review): the limit is 0 when no parameter is a list (random schemas), so
                # such schemas stop after the first url - confirm it is intended
                if len(ret) >= int(maxCountParameters):
                    logger.debug('>>> break len(ret) = ' + str(len(ret)))
                    break
        except Exception as excp:
            ExceptionLog.handler(logger, excp, ">>> generateUrlSchema has some error")

        self.saveJsonInFile(self.indexFileName)
        logger.debug(">>> urlSchema len = " + str(len(ret)))
        return ret
def saveJsonInFile(self, fileName)
Definition: UrlSchema.py:314
def schemaRandomStr(self, inUrl, parametrs)
Definition: UrlSchema.py:290
def resolveParametersFromFile(self, fileName, defaultValue=None)
Definition: UrlSchema.py:450
def __init__(self, schema=None, siteId=None, urlSchemaDataDir=None)
Definition: UrlSchema.py:62
def resolveParametersByHTTP(self, urls, defaultValue=None)
Definition: UrlSchema.py:333
def schemaIncrementalInt(self, inUrl, parameters, maxItems)
Definition: UrlSchema.py:176
def urlEncodeToParameters(self, parameters, urlEncode)
Definition: UrlSchema.py:403
def generateUrlSchema(self, inUrl)
Definition: UrlSchema.py:486
def getMaxCountParameters(self, parameters)
Definition: UrlSchema.py:435
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)
Definition: Utils.py:410
def readJsonFile(self, fileName)
Definition: UrlSchema.py:97
def resolveParametersByFormat(self, parameters, delimiter=' ', formatValue='json', defaultValue=None)
Definition: UrlSchema.py:366
def schemaPredefined(self, inUrl, parametrs)
Definition: UrlSchema.py:117
Definition: join.py:1
def replaceSchemaIncrementalInt(self, inUrl, macroName, minPos, maxPos, step)
Definition: UrlSchema.py:256
def schemaRandomInt(self, inUrl, parametrs)
Definition: UrlSchema.py:277