HCE Project Python language Distributed Tasks Manager Application, Distributed Crawler Application and client API bindings.  2.0.0-chaika
Hierarchical Cluster Engine Python language binding
generate_sites_jsons.py
Go to the documentation of this file.
1 #!/usr/bin/python
2 
3 
4 """
5 HCE project, Python bindings, Distributed Tasks Manager application.
6 Generates per-site JSON files (site_<id>.json) for the URLs read from stdin.
7 
8 @package: dc
9 @file generate_sites_jsons.py
10 @author Oleksii <developers.hce@gmail.com>
11 @link: http://hierarchical-cluster-engine.com/
12 @copyright: Copyright &copy; 2013-2014 IOIX Ukraine
13 @license: http://hierarchical-cluster-engine.com/license/
14 @since: 0.1
15 """
16 
17 import ppath
18 from ppath import sys
19 
20 import hashlib
21 import md5
22 import time
23 import json
24 import logging
25 from subprocess import Popen
26 from subprocess import PIPE
27 import MySQLdb as mdb
28 import MySQLdb.cursors
29 from contextlib import closing
30 
31 import pickle
32 import urllib
33 from urlparse import urlparse
34 from dc.EventObjects import Batch
35 from dc.EventObjects import BatchItem
36 from dc.EventObjects import Site
37 from dc.EventObjects import SiteUpdate
38 from dc.EventObjects import URL
39 from dc.EventObjects import URLUpdate
40 from dc.EventObjects import SiteFilter
41 
42 logging.basicConfig(filename="prepairer.log", filemode="w")
43 logger = logging.getLogger("Prepairer")
44 logger.setLevel("DEBUG")
45 
46 db_connector = None
47 dc_sites_db_connect = None
48 dc_urls_db_connect = None
49 
50 
51 site_templates_dic = {}
52 templates_dic = {}
53 
54 
56  query = "SELECT sites_urls.URL, sites_properties.`Value` FROM `sites_properties` INNER JOIN sites_urls ON sites_urls.Site_Id = sites_properties.Site_Id AND sites_properties.Name = 'template'"
57  print query
58  rows = executeQuery(dc_sites_db_connect, query)
59  return rows
60 
61 
def cutURL(url):
    """Reduce a URL to the short host name used to key a site.

    The port is stripped from the network location and only the trailing
    host labels are kept:

    * three labels when the host has more than two labels and the third
      label from the end is not "www" (e.g. "news.example.com");
    * otherwise the last two labels (e.g. "www.example.com" -> "example.com").

    Hosts with a single label (e.g. "localhost") are returned as they are,
    and a URL without a network location (for instance one given without a
    scheme) is returned unchanged; both cases used to raise IndexError.

    @param url: URL string to shorten
    @return: shortened host name, or url itself when it carries no host
    """
    host = urlparse(url).netloc.split(":")[0]
    if not host:
        # Nothing to shorten: hand the input back instead of failing.
        return url

    labels = host.split(".")
    keep = 3 if len(labels) > 2 and labels[-3] != "www" else 2
    # Slicing tolerates hosts with fewer labels than `keep`.
    return str(".".join(labels[-keep:]))
73 
74 
76  templates = readTemplatesFromMySQL()
77  print templates
78  for template in templates:
79  # print template
80  site_templates_dic[template["URL"]] = template["Value"]
81  with open("sites_templates_dic", "w") as f:
82  f.write(json.dumps(site_templates_dic))
83 
84 
86  global site_templates_dic
87  print site_templates_dic
88  for (key, value) in site_templates_dic.items():
89  url = cutURL(key)
90  md5 = hashlib.md5(url).hexdigest()
91  templates_dic[md5] = MySQLdb.escape_string(value)
92 
93 
95  global site_templates_dic
96  with open("sites_templates_dic", "r") as f:
97  site_templates_dic = json.loads(f.read())
98  print site_templates_dic
99 
100 
def executeQuery(db_connector, query):
    """Execute one SQL statement and return every fetched row.

    The statement runs on a MySQLdb DictCursor obtained from db_connector.
    The connection is committed before the rows are fetched, and the cursor
    is closed whether or not the statement succeeds.

    @param db_connector: open connection providing cursor(), commit() and
        rollback()
    @param query: SQL text passed to the cursor as is
    @return: result of cursor.fetchall()
    @raise mdb.Error: re-raised after the connection has been rolled back
    """
    try:
        cursor = db_connector.cursor(MySQLdb.cursors.DictCursor)
        try:
            cursor.execute(query)
            db_connector.commit()
            rows = cursor.fetchall()
        finally:
            cursor.close()
    except mdb.Error:  # @todo logging in db_task
        db_connector.rollback()
        raise
    return rows
110 
111 
113  global db_connector
114  global dc_sites_db_connect
115  global dc_urls_db_connect
116 
117  dbHost = "127.0.0.1"
118  dbPort = 3306
119  dbUser = "hce"
120  dbPWD = "hce12345"
121 
122  db_dc_sites = "dc_sites"
123  db_dc_urls = "dc_urls"
124 
125  dc_sites_db_connect = mdb.connect(dbHost, dbUser, dbPWD, db_dc_sites, dbPort)
126  dc_urls_db_connect = mdb.connect(dbHost, dbUser, dbPWD, db_dc_urls, dbPort)
127 
128 
def createSiteObj(input_url):
    """Build the Site object describing one input URL.

    The URL is stripped of surrounding whitespace and shortened with
    cutURL(); the shortened name is handed to the Site constructor and is
    also used to build a ".*<name>.*" filter pattern. If templates_dic has
    an entry under the site's id, it is stored as the "template" property.
    Resource, URL, error and size limits are set to fixed values.

    @param input_url: raw URL text (surrounding whitespace is allowed)
    @return: the populated Site instance
    """
    stripped_url = input_url.strip()

    # Shortened host name: constructor argument and filter pattern core.
    host_key = cutURL(stripped_url)

    site = Site(host_key)
    site.urls = [stripped_url]

    host_filter = SiteFilter(site.id, ".*" + host_key + ".*")

    # Debug output: the known templates and the id they are looked up by.
    print(templates_dic)
    print(site.id)
    if site.id in templates_dic:
        site.properties["template"] = templates_dic[site.id]

    site.filters = [host_filter]

    # Fixed crawl limits applied to every generated site.
    site.maxResources = 100000
    site.maxURLs = 100000
    site.maxErrors = 100000
    site.maxResourceSize = 1000000
    return site
167 
168 
def addSite(site):
    """Write a site's JSON representation to "site_<id>.json".

    The file is created (or overwritten) in the current working directory
    and is closed before the function returns, even if serialisation or
    writing fails.

    @param site: object exposing an `id` attribute and a toJSON() method
        that returns the text to store
    """
    file_name = "site_" + str(site.id) + ".json"
    with open(file_name, "w") as json_file:
        json_file.write(site.toJSON())
172 
173 
174 
if __name__ == "__main__":
    # Script entry point: prepare the backend and templates, then emit one
    # site JSON file per line read from standard input.
    loadDBBackend()
    # generateTemplates()
    # NOTE(review): the listing this script was recovered from skips a line
    # here (177 -> 179); a call may be missing - confirm against the
    # original file.
    fillTemplates()
    for line in sys.stdin:
        addSite(createSiteObj(line))
def executeQuery(db_connector, query)