# -*- coding: utf-8 -*-
"""
HCE project, Python bindings, Distributed Tasks Manager application.
Event objects definitions.

Command-line helper: reads a JSON batch description from stdin, builds a
``dc_event.Batch`` holding one ``dc_event.BatchItem`` per entry of the
``urls`` list, and writes the pickled batch to stdout.

@file batch_generator.py
@author Oleksii <developers.hce@gmail.com>
@link: http://hierarchical-cluster-engine.com/
@copyright: Copyright © 2013-2014 IOIX Ukraine
@license: http://hierarchical-cluster-engine.com/license/
"""

import hashlib
import json
import logging
import pickle
import sys

# NOTE(review): the project imports that provide `dc_event` and `varDump` are
# not part of the visible source and are therefore not repeated here; keep the
# file's existing import lines for them.

LOGGER_NAME = "batch_generator"

MSG_ERROR_READ_BATCH = "ERROR READ BATCH FROM STDIN"

# Logging is configured at import time; the log file is truncated on each run.
logging.basicConfig(filename="../log/batch_generator.log", filemode="w")
logger = logging.getLogger(LOGGER_NAME)
logger.setLevel(logging.DEBUG)


def _url_md5(url):
    """Return the hexadecimal MD5 digest of *url*.

    Text (non-bytes) input is encoded as UTF-8 before hashing. ``json.loads``
    yields unicode strings, and hashing those directly fails for URLs that
    contain non-ASCII characters. For pure-ASCII URLs the digest is the same
    as hashing the string directly.
    """
    if not isinstance(url, bytes):
        url = url.encode("utf-8")
    return hashlib.md5(url).hexdigest()


def _read_batch_data(stream):
    """Read all of *stream* and decode it as JSON.

    A decoding failure is logged with ``MSG_ERROR_READ_BATCH`` and then
    re-raised, so the script still terminates with the original exception.
    """
    raw_input = stream.read()
    try:
        return json.loads(raw_input)
    except ValueError:
        logger.exception(MSG_ERROR_READ_BATCH)
        raise


def _build_batch_items(urls, default_site_id):
    """Create one ``dc_event.BatchItem`` per entry of *urls*.

    Each entry must provide ``"url"``. Its ``"site_id"`` is optional: when it
    is missing or falsy, *default_site_id* is used instead.
    """
    batch_items = []
    # NOTE(review): the loop header is not visible in the source view; the
    # 0-based numbering used for the debug message is an assumption.
    for item_no, item in enumerate(urls):
        item_url = item["url"]
        item_site_id = item.get("site_id")
        logger.debug("URL #%s: url: <<%s>>, site_id: <<%s>>",
                     item_no, item_url, item_site_id)
        sid = item_site_id or default_site_id
        uid = _url_md5(item_url)
        url_obj = dc_event.URL(sid, item_url)
        batch_items.append(dc_event.BatchItem(sid, uid, url_obj))
    return batch_items


def main():
    """Build a real-time-crawler batch from stdin JSON and emit its pickle."""
    batch_data = _read_batch_data(sys.stdin)
    site_id = batch_data["site_id"]
    urls = batch_data["urls"]
    tags = batch_data["tags"]
    # NOTE(review): no assignment of the batch id is visible in the source
    # view (the name `id` would otherwise resolve to the builtin function);
    # reading it from the "id" key is an assumption - confirm against the
    # producer of the input JSON.
    batch_id = batch_data["id"]
    logger.debug("id: <<%s>>, site_id: <<%s>>, urls: <<%s>>, tags: <<%s>>",
                 batch_id, site_id, urls, tags)

    batch_items = _build_batch_items(urls, site_id)
    batch_obj = dc_event.Batch(batch_id, batch_items,
                               dc_event.Batch.TYPE_REAL_TIME_CRAWLER)
    logger.debug("BATCH: %s", varDump(batch_obj))

    # Same bytes as the former `print pickle.dumps(batch_obj)`: the pickle
    # followed by a newline. The binary buffer is used where stdout has one.
    out_stream = getattr(sys.stdout, "buffer", sys.stdout)
    out_stream.write(pickle.dumps(batch_obj) + b"\n")


if __name__ == "__main__":
    main()
def varDump(obj, stringify=True, strTypeMaxLen=256, strTypeCutSuffix='...', stringifyType=1, ignoreErrors=False, objectsHash=None, depth=0, indent=2, ensure_ascii=False, maxDepth=10)