HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
batch_generator.py
Go to the documentation of this file.
1 #!/usr/bin/python
2 
3 
4 """
5  HCE project, Python bindings, Distributed Tasks Manager application.
6  Batch generator: reads a JSON batch description from stdin and writes the pickled Batch object to stdout.
7 
8  @package: dc
9  @file batch_generator.py
10  @author Oleksii <developers.hce@gmail.com>
11  @link: http://hierarchical-cluster-engine.com/
12  @copyright: Copyright &copy; 2013-2014 IOIX Ukraine
13  @license: http://hierarchical-cluster-engine.com/license/
14  @since: 0.1
15  """
16 
17 
18 import ppath
19 import os
20 import sys
21 import json
22 import pickle
23 import hashlib
24 import logging
25 
26 from app.Utils import varDump
27 import dc.EventObjects as dc_event
28 
29 
# Process exit statuses handed to os._exit() by the script body.
EXIT_SUCCESS, EXIT_FAILURE = 0, 1

# Name under which this script's logger is registered.
LOGGER_NAME = "batch_generator"

# Message text for a failure to read the batch description from stdin.
MSG_ERROR_READ_BATCH = "ERROR READ BATCH FROM STDIN"


# Root logging goes to a file opened in "w" mode, i.e. truncated on every run.
# The path is relative, so it is resolved against the current working directory.
logging.basicConfig(filemode="w", filename="../log/batch_generator.log")

# Script-wide logger with debug output enabled.
logger = logging.getLogger(LOGGER_NAME)
logger.setLevel("DEBUG")
41 
42 
def url_md5(url):
    """Return the hexadecimal MD5 digest of a URL.

    Text URLs are encoded as UTF-8 before hashing, so URLs containing
    non-ASCII characters no longer raise UnicodeEncodeError. Byte strings
    are hashed unchanged. For ASCII-only URLs the digest is the same as
    hashlib.md5(url).hexdigest() on the raw value.

    @param url: URL as a text or byte string
    @return: lowercase hexadecimal MD5 digest string
    """
    data = url if isinstance(url, bytes) else url.encode("utf-8")
    return hashlib.md5(data).hexdigest()


def main():
    """Read a JSON batch description from stdin and print the pickled Batch.

    Keys read from the input object: "id", "site_id", "tags" and "urls".
    Each element of "urls" must provide "url" and may provide its own
    "site_id", which takes precedence over the batch-level one when it is
    truthy.

    @return: EXIT_SUCCESS when the batch was written to stdout, EXIT_FAILURE
             when the input could not be parsed or lacks a required field
    """
    try:
        batch_data = json.loads(sys.stdin.read())
        batch_id = batch_data["id"]
        site_id = batch_data["site_id"]
        urls = batch_data["urls"]
        tags = batch_data["tags"]
        # Validate every URL entry up front so that malformed input is
        # reported as a read error before any batch object is built.
        targets = []
        for item in urls:
            item_url = item["url"]
            targets.append((item_url, item.get("site_id"), url_md5(item_url)))
    except (ValueError, KeyError, TypeError, AttributeError):
        # ValueError: invalid JSON; the others: missing or mistyped fields.
        logger.exception(MSG_ERROR_READ_BATCH)
        return EXIT_FAILURE

    logger.debug("id: <<%s>>, site_id: <<%s>>, urls: <<%s>>, tags: <<%s>>",
                 batch_id, site_id, urls, tags)

    batch_items = []
    for item_no, (item_url, item_site_id, uid) in enumerate(targets, 1):
        logger.debug("URL #%s: url: <<%s>>, site_id: <<%s>>",
                     item_no, item_url, item_site_id)
        sid = item_site_id or site_id
        url_obj = dc_event.URL(sid, item_url)
        batch_items.append(dc_event.BatchItem(sid, uid, url_obj))

    batch_obj = dc_event.Batch(batch_id, batch_items,
                               dc_event.Batch.TYPE_REAL_TIME_CRAWLER)
    logger.debug("BATCH: %s", varDump(batch_obj))

    # Same bytes as the former `print` statement: the pickle, then a newline.
    sys.stdout.write(pickle.dumps(batch_obj))
    sys.stdout.write("\n")
    return EXIT_SUCCESS


if __name__ == "__main__":
    exit_code = main()
    # os._exit() terminates the process without flushing stdio buffers,
    # so stdout has to be flushed explicitly before exiting.
    sys.stdout.flush()
    os._exit(exit_code)
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)
Definition: Utils.py:410